## Event types across EventKG

In [3]:
import io
import os
import yaml
import requests
from tqdm import tqdm
import pandas as pd

In [4]:
from settings import FOLDER_PATH
from src.hdt_interface import HDTInterface

DATASET = "yago"

with open(os.path.join(FOLDER_PATH, "dataset-config", f"{DATASET}.yaml"),
          encoding='utf-8') as file:
    dataset_config = yaml.load(file, Loader=yaml.FullLoader)

DATASET_TO_FOLDER = {
    "wikidata": "wikidata-2021-03-05",
    "dbpedia": "dbpedia-snapshot-2021-09",
    "yago": "yago-2020-02-24"
}

nested_dataset = 0 if DATASET == "wikidata" else 1
filter_kb = 1 if DATASET == "dbpedia" else 0

interface = HDTInterface(dataset_config=dataset_config, default_pred=[],
                         folder_hdt=DATASET_TO_FOLDER[DATASET],
                         nested_dataset=nested_dataset,
                         filter_kb=filter_kb)

In [13]:
df = pd.read_csv(f"df_events_{DATASET}.csv")

### Number of events per coarser events

In [4]:
import pandas as pd
import plotly.express as px

fig = px.histogram(df[df.nbSubEvent > 10], x='nbSubEvent', histnorm='percent')
fig.show()

In [5]:
import plotly.graph_objects as go

import numpy as np

x = df[df.nbSubEvent > 10].nbSubEvent.values
fig = go.Figure(data=[go.Histogram(x=x, cumulative_enabled=True, histnorm='percent')])

fig.show()

In [7]:
for val in [1, 5, 10, 30, 50, 100]:
    print(f"# of events with strictly more than {val} sub events: {df[df.nbSubEvent > val].shape[0]}")

# of events with strictly more than 1 sub events: 5944
# of events with strictly more than 5 sub events: 1805
# of events with strictly more than 10 sub events: 993
# of events with strictly more than 30 sub events: 399
# of events with strictly more than 50 sub events: 158
# of events with strictly more than 100 sub events: 84


### Type of events from EventKG

In [14]:
ENDPOINT = "http://eventkginterface.l3s.uni-hannover.de/sparql"
HEADERS = {
    "Accept": "text/csv"
}

In [15]:
QUERY_TYPE_TEMPLATE = """
SELECT DISTINCT ?eventType 
WHERE
{
?event owl:sameAs <event-to-replace> .
?event rdf:type sem:Event .
?event rdf:type ?eventType .
FILTER( strStarts( str(?eventType), "http://dbpedia") ) .
}
"""

In [16]:
def get_response_df(event, nb):
    # response = requests.get(
    #     ENDPOINT, headers=HEADERS,
    #     params={"query": QUERY_TYPE_TEMPLATE.replace("event-to-replace", event)})
    # curr_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    # curr_df["event"] = event
    # curr_df["nbSubEvent"] = nb

    predicate = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    types = set(interface.run_request(params=dict(subject=event, predicate=predicate),
                                      filter_pred=[predicate], filter_keep=True))
    return pd.DataFrame({"event": [event] * len(types),
                         "eventType": [x[2] for x in types],
                         "nbSubEvent": [nb] * len(types)})

In [17]:
def get_all_event_types(df_events):
    final_df = pd.DataFrame(columns=['eventType', 'event', "nbSubEvent"])
    with tqdm(total=df_events.shape[0]) as pbar:
        for _, row in df_events.iterrows():
            pbar.update(1)
            final_df = pd.concat(
                [
                    final_df,
                    get_response_df(event=row.eventKG, nb=row.nbSubEvent)],
                axis=0)
        pbar.close()
    return final_df

In [18]:
threshold = 10
df_events = df[df.nbSubEvent > threshold]
df_types = get_all_event_types(df_events)

100%|██████████| 993/993 [00:36<00:00, 27.19it/s]


In [19]:
df_types.to_csv(f"events_types_{DATASET}.csv")

In [20]:
threshold = 10
df_types = pd.read_csv(f"events_types_{DATASET}.csv")
df_types = df_types[[col for col in df_types.columns if "Unnamed" not in col]]
print(f"# of events with strictly more than {threshold} sub events: {df_events.shape[0]}")
print(f"# of these events with a type: {df_types.event.unique().shape[0]}")
print(f"# of unique event types: {df_types.eventType.unique().shape[0]}")
df_types

# of events with strictly more than 10 sub events: 993
# of these events with a type: 826
# of unique event types: 75


Unnamed: 0,eventType,event,nbSubEvent
0,http://yago-knowledge.org/resource/War,http://yago-knowledge.org/resource/Coalition_Wars,1410
1,http://schema.org/Thing,http://yago-knowledge.org/resource/Coalition_Wars,1410
2,http://yago-knowledge.org/resource/War,http://yago-knowledge.org/resource/French_Revo...,749
3,http://schema.org/Thing,http://yago-knowledge.org/resource/French_Revo...,749
4,http://schema.org/Thing,http://yago-knowledge.org/resource/French_Revo...,716
...,...,...,...
2221,http://schema.org/Thing,http://yago-knowledge.org/resource/Weightlifti...,11
2222,http://schema.org/Thing,http://yago-knowledge.org/resource/Western_Sah...,11
2223,http://yago-knowledge.org/resource/Conflict_(p...,http://yago-knowledge.org/resource/Western_Sah...,11
2224,http://schema.org/Event,http://yago-knowledge.org/resource/Wrestling_a...,11


In [21]:
grouped = df_types.groupby('eventType').agg({"event": "nunique", "nbSubEvent": ["max", "min"]})
grouped.sort_values(by=('event', 'nunique'), ascending=False)

Unnamed: 0_level_0,event,nbSubEvent,nbSubEvent
Unnamed: 0_level_1,nunique,max,min
eventType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
http://schema.org/Thing,808,1410,11
http://schema.org/Event,591,347,11
http://schema.org/EventSeries,166,244,11
http://yago-knowledge.org/resource/War,116,1410,11
http://yago-knowledge.org/resource/Season_(sports),74,131,11
...,...,...,...
http://yago-knowledge.org/resource/Military_occupation,1,15,15
http://yago-knowledge.org/resource/Naval_warfare,1,23,23
http://yago-knowledge.org/resource/Political_movement,1,43,43
http://yago-knowledge.org/resource/Process_(science),1,12,12


In [22]:
grouped.to_csv(f"grouped_event_types_{DATASET}.csv")

In [23]:
example = {
    "yago": "http://yago-knowledge.org/resource/Historical_period",
    "dbpedia": "http://dbpedia.org/ontology/Election"
}

In [24]:
df_types[df_types.eventType == example[DATASET]].event.values

array(['http://yago-knowledge.org/resource/Sengoku_period',
       'http://yago-knowledge.org/resource/Italian_unification'],
      dtype=object)

In [25]:
events = df_types[df_types.eventType == example[DATASET]].event.values
df_types[df_types.event.isin(events)]

Unnamed: 0,eventType,event,nbSubEvent
121,http://schema.org/Thing,http://yago-knowledge.org/resource/Sengoku_period,143
122,http://yago-knowledge.org/resource/Historical_...,http://yago-knowledge.org/resource/Sengoku_period,143
585,http://yago-knowledge.org/resource/Political_m...,http://yago-knowledge.org/resource/Italian_uni...,43
586,http://schema.org/Thing,http://yago-knowledge.org/resource/Italian_uni...,43
587,http://yago-knowledge.org/resource/Social_move...,http://yago-knowledge.org/resource/Italian_uni...,43
588,http://yago-knowledge.org/resource/Historical_...,http://yago-knowledge.org/resource/Italian_uni...,43


In [26]:
df_types[df_types.event.isin(events)].groupby('eventType').agg({"event": "nunique"})

Unnamed: 0_level_0,event
eventType,Unnamed: 1_level_1
http://schema.org/Thing,2
http://yago-knowledge.org/resource/Historical_period,2
http://yago-knowledge.org/resource/Political_movement,1
http://yago-knowledge.org/resource/Social_movement,1


In [35]:
manual_filter_out = {
    "dbpedia": [
        "http://dbpedia.org/ontology/WomensTennisAssociationTournament",
        "http://dbpedia.org/ontology/Building",
        "http://dbpedia.org/ontology/Settlement",
        "http://dbpedia.org/ontology/Rebellion",
        "http://dbpedia.org/ontology/Painting",
        "http://dbpedia.org/ontology/Media",
        "http://dbpedia.org/ontology/Profession",
        "http://dbpedia.org/ontology/OldTerritory",
        "http://dbpedia.org/ontology/HistoricalPeriod",
        "http://dbpedia.org/ontology/FootballMatch",
        "http://dbpedia.org/ontology/Award",
        "http://dbpedia.org/ontology/Work",
        "http://dbpedia.org/ontology/Name",
        "http://dbpedia.org/ontology/InternationalFootballLeagueEvent",
        "http://dbpedia.org/ontology/Article",
        "http://dbpedia.org/ontology/Sport",
        "http://dbpedia.org/ontology/Activity"
    ],

    "yago": [
        "http://schema.org/EventSeries",
        "http://schema.org/Festival",
        "http://schema.org/Landform",
        "http://yago-knowledge.org/resource/Atlantic_hurricane_season"
    ]
} 

filtering_regex = {
    "yago": [
        # from types
        "Sports", "Championship", "Athletic", "_Open", "Game", "Boxing",
        "FIFA", "Cup", "Fencing", "festival", "Cycling", "Gymnastics",
        "Judo", "Multi-sport", "Rowing", "Olympic", "sports", "Paralympic",
        "tennis", "Weightlifting", "Wrestling", "Swimming",
        # from event names
        "election", "World_Tour", "ProTour", "season", "kampioenschappen",
        "Olympische", "Gymnastique", "World_Ranking", "Ronde_van_Nederland",
        "IAAF", "ATP", "Serie", 'Award'
    ]
}

In [36]:
def filter_types(grouped):
    res = grouped[~grouped.index.isin(manual_filter_out[DATASET])]
        
    for regex in filtering_regex[DATASET]:
        res = res[~res.index.str.contains(regex)]

    return res.index.unique()

keep_type = filter_types(grouped)

In [42]:
print(f"<#> of events: {df_events.shape[0]}")
print(f"<#> of events with types: {df_types.event.unique().shape[0]}")

event_wrong_type_class = df_types[~df_types.eventType.isin(keep_type)].event.unique()
events_right_type_cand = set(df_types.event.unique()).difference(set(event_wrong_type_class))
events_wrong_type_lit = [elt for elt in events_right_type_cand if any(x in elt for x in filtering_regex[DATASET])]
events_right_type = events_right_type_cand.difference(set(events_wrong_type_lit))

print(f"<#> of events with wrong type: {event_wrong_type_class.shape[0] + len(events_wrong_type_lit)}")
print(f"<#> of events with right type: {len(events_right_type)}")

<#> of events: 993
<#> of events with types: 826
<#> of events with wrong type: 578
<#> of events with right type: 248


In [38]:
events_no_label = set(df_events.eventKG.unique()).difference(df_types.event.unique())
df_events[df_events.eventKG.isin(events_no_label)]

Unnamed: 0.1,Unnamed: 0,eventKG,nbSubEvent
0,0,http://yago-knowledge.org/resource/World_War_II,1465
52,52,http://yago-knowledge.org/resource/War_in_Afgh...,141
62,62,http://yago-knowledge.org/resource/Middle-East...,120
68,68,http://yago-knowledge.org/resource/Iraqi_insur...,113
73,73,http://yago-knowledge.org/resource/War_in_Nort...,108
...,...,...,...
984,984,http://yago-knowledge.org/resource/Syria–Leban...,11
985,985,http://yago-knowledge.org/resource/United_Stat...,11
988,988,http://yago-knowledge.org/resource/Western_All...,11
991,991,http://yago-knowledge.org/resource/de/Mainfeldzug,11


In [39]:
def filter_events_no_labels(df):
    print(f"Before filter: {df.shape[0]} events")
    for regex in filtering_regex[DATASET]:
        df = df[~df.eventKG.str.contains(regex)]
    print(f"After filter: {df.shape[0]} events")
    return df

filtered_events_no_label = filter_events_no_labels(df_events[df_events.eventKG.isin(events_no_label)])

Before filter: 167 events
After filter: 58 events


In [40]:
events_to_keep = list(filtered_events_no_label.eventKG.unique()) + \
    list(events_right_type)

print(f"# of events to keep: {len(events_to_keep)}")

df_filtered = df_events[df_events.eventKG.isin(events_to_keep)]
df_filtered = df_filtered[[col for col in df_filtered.columns if col != "Unnamed: 0"]]
df_filtered.head(5)

# of events to keep: 306


Unnamed: 0,eventKG,nbSubEvent
0,http://yago-knowledge.org/resource/World_War_II,1465
1,http://yago-knowledge.org/resource/Coalition_Wars,1410
2,http://yago-knowledge.org/resource/French_Revo...,749
3,http://yago-knowledge.org/resource/French_Revo...,716
4,http://yago-knowledge.org/resource/World_War_I,695


In [41]:
df_filtered.to_csv(f"filtered_events_{DATASET}.csv")