# EventKG - Retrieving events for experiments

The aim of the notebook is to automatically retrieve events for the search experiment, with other information like start and end dates.

Before running the notebook, make sure the following are in place:
* EventKG downloaded and preprocessed, cf. `eventkg-filtering.ipynb`
* Subset of EventKG loaded in [GraphDB](https://graphdb.ontotext.com)
* GraphDB endpoint active (repository name: `eventkg`)

In [46]:
import io
import os
import json
import yaml
import requests
import psutil
import wandb
from tqdm import tqdm

import ray
from ray.util.multiprocessing.pool import Pool
import pandas as pd
from settings import FOLDER_PATH, WANDB_USER
from src.hdt_interface import HDTInterface

In [47]:
# URI prefix associated with each supported dataset name. It is substituted for
# the `<dataset_start_uri>` placeholder of the SPARQL queries, whose `strStarts`
# filters then keep only URIs beginning with that prefix.
DATASET_TO_START_URI = dict(
    dbpedia="http://dbpedia",
    wikidata="http://www.wikidata",
    yago="http://yago",
)

In [48]:
# Dataset the experiments are prepared for (a key of DATASET_TO_START_URI).
DATASET = "yago"

# SPARQL endpoint of the GraphDB repository, and the header asking it for CSV results.
ENDPOINT = "http://localhost:7200/repositories/eventkg"
HEADERS = dict(Accept="text/csv")

# Value used for the LIMIT clause of the event-retrieval query.
NB_EVENTS = 1333

# Folder where data necessary to run experiments will be saved
# This folder should contain the following sub folders: `config`, `gs_events` and `referents`
FOLDER_SAVE_DATA = os.path.join(FOLDER_PATH, "data-test", "yago")

# Number of physical (non-logical) cores, used as the size of the worker pools.
NB_CPUS = psutil.cpu_count(logical=False)

In [49]:
# Load the YAML configuration file matching the selected dataset.
config_path = os.path.join(FOLDER_PATH, "dataset-config", f"{DATASET}.yaml")
with open(config_path, encoding='utf-8') as file:
    dataset_config = yaml.load(file, Loader=yaml.FullLoader)

# Name of the HDT folder to use for each dataset.
DATASET_TO_FOLDER = dict([
    ("wikidata", "wikidata-2021-03-05"),
    ("dbpedia", "dbpedia-snapshot-2021-09"),
    ("yago", "yago-2020-02-24"),
])

# Integer flags passed to the interface: `nested_dataset` is 0 only for
# wikidata, `filter_kb` is 1 only for dbpedia.
nested_dataset = int(DATASET != "wikidata")
filter_kb = int(DATASET == "dbpedia")

interface = HDTInterface(
    dataset_config=dataset_config,
    default_pred=[],
    folder_hdt=DATASET_TO_FOLDER[DATASET],
    nested_dataset=nested_dataset,
    filter_kb=filter_kb,
)

## 1. Retrieving events with the most sub events

Using SPARQL Query + GraphDB endpoint

In [6]:
# SPARQL query ranking events by their number of distinct sub-events (reached
# through `sem:hasSubEvent*`) whose time span overlaps the event's own span.
# Both event and sub-event URIs are restricted to the `<dataset_start_uri>` prefix.
QUERY_RETRIEVE_EVENTS = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?eventKG (COUNT(DISTINCT ?subEventKG) as ?nbSubEvent)
WHERE {
    
 ?event sem:hasSubEvent* ?subEvent .
 ?event sem:hasBeginTimeStamp ?startTimeEvent .
 ?event sem:hasEndTimeStamp ?endTimeEvent .
 ?event owl:sameAs ?eventKG .

 ?subEvent owl:sameAs ?subEventKG .
 ?subEvent sem:hasBeginTimeStamp ?startTimeSubEvent .
 ?subEvent sem:hasEndTimeStamp ?endTimeSubEvent .
 
 FILTER (?endTimeSubEvent >= ?startTimeEvent) .
 FILTER (?startTimeSubEvent <= ?endTimeEvent) .
 FILTER( strStarts( str(?eventKG), "<dataset_start_uri>" ) ) .
 FILTER( strStarts( str(?subEventKG), "<dataset_start_uri>" ) ) .
}
GROUP BY ?eventKG
ORDER BY DESC(?nbSubEvent)
"""

# Fill in the dataset prefix, then cap the number of returned events.
QUERY_RETRIEVE_EVENTS = (
    QUERY_RETRIEVE_EVENTS.replace('<dataset_start_uri>', DATASET_TO_START_URI[DATASET])
    + f"\nLIMIT {NB_EVENTS}"
)


In [7]:
# Run the event-ranking query against the endpoint (CSV output requested via HEADERS).
response = requests.get(ENDPOINT, headers=HEADERS,
                        params={"query": QUERY_RETRIEVE_EVENTS})
# Fail fast on a 4xx/5xx answer: otherwise the error body would be parsed as if
# it were the CSV result when the response is turned into a DataFrame.
response.raise_for_status()

In [18]:
# Decode the CSV payload of the response and load it into a DataFrame.
events_csv = response.content.decode('utf-8')
df_events = pd.read_csv(io.StringIO(events_csv))
df_events

Unnamed: 0,linkDBpediaEn
0,http://yago-knowledge.org/resource/Japanese_in...
1,http://yago-knowledge.org/resource/Battle_of_C...
2,http://yago-knowledge.org/resource/Battle_of_C...
3,http://yago-knowledge.org/resource/Battle_of_Y...
4,http://yago-knowledge.org/resource/Hsinchu_Cam...
5,http://yago-knowledge.org/resource/Capitulatio...
6,http://yago-knowledge.org/resource/Battle_of_B...
7,http://yago-knowledge.org/resource/Battle_of_K...
8,http://yago-knowledge.org/resource/Battle_of_C...


## 2. Retrieving info for each selected event

* Ground truth events from EventKG 
* Referents (URI mapping)
* Start/End dates


### 2.1. Ground truth for each event

Ground truth = the sub-events that are part of that event in EventKG

In [9]:
# Cache the retrieved events on disk, in a file named after the dataset.
events_csv_path = f"df_events_{DATASET.lower()}.csv"
df_events.to_csv(events_csv_path)

In [50]:
# Reload the events from the dataset-specific CSV file.
df_events = pd.read_csv(f"df_events_{DATASET.lower()}.csv")
# Drop the "Unnamed: 0" column (the index saved alongside the data), if present.
df_events = df_events[[col for col in df_events.columns if col != "Unnamed: 0"]]

print(f"# of events: {df_events.shape[0]}")
# The filter is `>= 10`, so the label reads "at least 10" rather than "more than 10".
print(f"# of events with at least 10 sub events: {df_events[df_events.nbSubEvent >= 10].shape[0]}")

# of events: 76682
# of events with more than 10 sub events: 1064


In [51]:
# Display the reloaded events table.
df_events

Unnamed: 0,eventKG,nbSubEvent
0,http://yago-knowledge.org/resource/World_War_II,1465
1,http://yago-knowledge.org/resource/Coalition_Wars,1410
2,http://yago-knowledge.org/resource/French_Revo...,749
3,http://yago-knowledge.org/resource/French_Revo...,716
4,http://yago-knowledge.org/resource/World_War_I,695
...,...,...
76677,http://yago-knowledge.org/resource/Ōnin_War,1
76678,http://yago-knowledge.org/resource/Ōtsu_incident,1
76679,http://yago-knowledge.org/resource/Šiauliai_Of...,1
76680,http://yago-knowledge.org/resource/Štrpci_mass...,1


In [52]:
# SPARQL template listing the distinct sub-event URIs (reached through
# `sem:hasSubEvent*`) of the event substituted for `event-to-replace`, keeping
# only sub-events whose time span overlaps the event's and whose URI starts
# with the dataset prefix. The result column is named `linkDBpediaEn`
# whatever the dataset.
QUERY_GROUND_TRUTH_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT(?subEventKG as ?linkDBpediaEn)
WHERE {
    
?event owl:sameAs <event-to-replace> .
?event sem:hasSubEvent* ?subEvent .
?subEvent owl:sameAs ?subEventKG .
    
?event sem:hasBeginTimeStamp ?startTimeEvent .
?event sem:hasEndTimeStamp ?endTimeEvent .

?subEvent sem:hasBeginTimeStamp ?startTimeSubEvent .
?subEvent sem:hasEndTimeStamp ?endTimeSubEvent .
    
FILTER( strStarts( str(?subEventKG), "<dataset_start_uri>" ) ) .
FILTER (?endTimeSubEvent >= ?startTimeEvent) .
FILTER (?startTimeSubEvent <= ?endTimeEvent) .
}
"""

# Only the dataset prefix is filled in here; the event URI is substituted per event.
dataset_start_uri = DATASET_TO_START_URI[DATASET]
QUERY_GROUND_TRUTH_TEMPLATE = QUERY_GROUND_TRUTH_TEMPLATE.replace(
    '<dataset_start_uri>', dataset_start_uri
)

In [53]:
# For every event with at least 10 sub-events, query its ground-truth sub-events
# and store them as `<FOLDER_SAVE_DATA>/gs_events/<event name>.csv`.
events = df_events[df_events.nbSubEvent >= 10].eventKG.values
gs_folder = os.path.join(FOLDER_SAVE_DATA, "gs_events")
# Create the output folder up front so the first `to_csv` cannot fail on a missing directory.
os.makedirs(gs_folder, exist_ok=True)

for event in tqdm(events):
    # The last URI segment is used as the file name.
    name = event.split("/")[-1]
    query = QUERY_GROUND_TRUTH_TEMPLATE.replace(
        "event-to-replace", event
    )
    response = requests.get(ENDPOINT, headers=HEADERS,
                            params={"query": query})
    # Stop on an HTTP error instead of saving the error body as a gold-standard file.
    response.raise_for_status()
    pd.read_csv(io.StringIO(response.content.decode('utf-8'))) \
        .to_csv(os.path.join(gs_folder, f"{name}.csv"))

100%|██████████| 1064/1064 [00:50<00:00, 21.13it/s]


### 2.2. URI referents for each sub event - Only for DBpedia

Because dataset versions differ, the URI of a given resource can vary over time. The aim of this section is to retrieve a unique referent ID for each set of equivalent URIs.


In [6]:
from src.get_equivalent_url import get_equivalent_url

In [7]:
def add_equivalent_url(df_path, save_path, dataset):
    """Call `get_equivalent_url` for one file unless its output already exists.

    All three arguments are forwarded unchanged, as keyword arguments, to
    `get_equivalent_url`. Nothing is done when `save_path` already exists,
    so previously generated outputs are not recomputed.
    """
    if os.path.exists(save_path):
        return
    get_equivalent_url(df_path=df_path, save_path=save_path, dataset=dataset)

In [9]:
#get_equivalent_url('/Users/ines/Projects/graph_search_framework/data-test/gs_events/2004_Summer_Olympics.csv',
#                   '/Users/ines/Projects/graph_search_framework/data-test/referents/2004_Summer_Olympics.json')

In [8]:
# Build one referents JSON file per ground-truth CSV file, in parallel.
csv_folder = os.path.join(FOLDER_SAVE_DATA, "gs_events")
json_folder = os.path.join(FOLDER_SAVE_DATA, "referents")

# Keep only `.csv` entries: `os.listdir` also returns any other file or folder
# present (e.g. `.DS_Store`), which must not be handed to the workers.
# Sorting makes the processing order deterministic.
csv_files = sorted(
    csv_file for csv_file in os.listdir(csv_folder)
    if csv_file.endswith(".csv")
)

# One (input csv, output json, dataset) tuple per file; the JSON file reuses the CSV base name.
args = [
    (os.path.join(csv_folder, csv_file),
     os.path.join(json_folder, f"{os.path.splitext(csv_file)[0]}.json"),
     DATASET)
    for csv_file in csv_files
]

pool = Pool(processes=NB_CPUS)
# Trailing `;` keeps the notebook from echoing the returned list of `None`.
pool.starmap(add_equivalent_url, args);


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

### 2.3. Start and End dates of each event

Minimum start date among all start dates, maximum end date among all end dates.

Start date must be before end date.

In [54]:
# SPARQL template returning, for the event substituted for `event-to-replace`,
# the minimum begin timestamp (`?min`) and maximum end timestamp (`?max`).
# The HAVING clause keeps a row only when the maximum end is strictly after
# the minimum start.
QUERY_DATES_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (min(?startTimeEvent) as ?min) (max(?endTimeEvent) as ?max)
WHERE {
    
 ?event owl:sameAs <event-to-replace> .
 ?event sem:hasSubEvent* ?subEvent .
 ?event sem:hasBeginTimeStamp ?startTimeEvent .
 OPTIONAL { ?event sem:hasEndTimeStamp ?endTimeEvent . }
 ?event owl:sameAs ?eventKG .

 FILTER( strStarts( str(?eventKG), "<dataset_start_uri>" ) ) .
}
GROUP BY ?eventKG
HAVING (max(?endTimeEvent) > min(?startTimeEvent))
"""

# Only the dataset prefix is filled in here; the event URI is substituted per event.
dataset_start_uri = DATASET_TO_START_URI[DATASET]
QUERY_DATES_TEMPLATE = QUERY_DATES_TEMPLATE.replace(
    '<dataset_start_uri>', dataset_start_uri
)

In [55]:
# Try the dates query on a single, well-known event before running it on all events.
sample_event = "http://yago-knowledge.org/resource/World_War_II"
query = QUERY_DATES_TEMPLATE.replace("event-to-replace", sample_event)
response = requests.get(
    ENDPOINT,
    headers=HEADERS,
    params={"query": query},
)
pd.read_csv(io.StringIO(response.content.decode('utf-8')))

Unnamed: 0,min,max
0,1939-09-01,1945-09-02


In [56]:
def get_dates(event):
    """Query the endpoint for the start and end dates of one event.

    Parameters
    ----------
    event : str
        Event URI substituted for the `event-to-replace` placeholder of
        `QUERY_DATES_TEMPLATE`.

    Returns
    -------
    pandas.DataFrame
        The CSV response parsed as a DataFrame; callers read its `min` and
        `max` columns and treat a frame with no rows as "no dates found".

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status code.
    """
    query = QUERY_DATES_TEMPLATE.replace(
        "event-to-replace", event)
    response = requests.get(ENDPOINT, headers=HEADERS,
                            params={"query": query})
    # Fail fast: without this check an error body would be parsed as CSV and
    # returned as if it were a query result.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

In [57]:
# Start from a clean state: shut down any ray instance that is already running.
if ray.is_initialized():
    ray.shutdown()

# Retrieve the dates of every event with at least 10 sub-events in parallel;
# `result` holds one DataFrame per event, in the same order as the events.
selected_events = df_events[df_events.nbSubEvent >= 10].eventKG.values
pool = Pool(processes=NB_CPUS)
result = pool.map(get_dates, selected_events)

# Release the ray instance once all results are collected.
if ray.is_initialized():
    ray.shutdown()

2022-09-21 14:49:31,789	INFO pool.py:517 -- Starting local ray cluster


In [58]:
# Inspect the dates DataFrame obtained for the first selected event.
result[0]

Unnamed: 0,min,max
0,1939-09-01,1945-09-02


In [18]:
from src.triply_interface import TriplInterface

# NOTE(review): this rebinds the module-level name `interface` — confirm that no
# later cell still expects the object previously stored under that name.
interface = TriplInterface()


def retrieve_date_triply(node):
    """Look up the `dbo:startDate` values of `node` through `interface`.

    Returns a `(earliest, latest)` pair of strings built from the third element
    of each returned triple, or `(None, None)` when no triple is returned.
    Values are compared as strings. The same predicate (`startDate`) provides
    both elements of the pair.
    """
    triples = interface.run_request(
        params={
            "subject": node,
            "predicate": "http://dbpedia.org/ontology/startDate",
        },
        filter_pred=[],
        filter_keep=False,
    )
    if len(triples) == 0:
        return None, None
    # With a single triple, min and max are the same value.
    dates = [str(triple[2]) for triple in triples]
    return min(dates), max(dates)

In [59]:
# Manually curated start/end dates, keyed by event name (last segment of the
# event URI). They are used as a fallback for events whose dates could not be
# retrieved from the endpoint.
manual_dates = {
    "2014_United_States_elections": {"start": "2014-11-04", "end": "2014-11-04"},
    "2018_United_States_elections": {"start": "2018-11-06", "end": "2018-11-06"},
    "Arab–Israeli_conflict": {"start": "1948-05-15", "end": "2021-12-31"},
    # Fixed year typo (was "2011-09-11"): the sibling "War_on_Terror" entry
    # below also places the start of this event in September 2001.
    "War_on_terror": {"start": "2001-09-11", "end": "2021-12-31"},
    "Iraqi_conflict_(2003–present)": {"start": "2003-03-20", "end": "2021-12-31"},
    "War_on_Terror": {"start": "2001-09-15", "end": "2022-12-31"}
}

In [60]:
def store_changing_config(dico):
    """Fill `dico` with the event-specific configuration of each selected event.

    Events are those of `df_events` with at least 10 sub-events; the i-th event
    is paired with the i-th DataFrame of `result`. Dates come from the `min` and
    `max` columns of that DataFrame when it has rows, otherwise from
    `manual_dates` when the event name is listed there. Events left without
    both dates are reported on stdout and skipped.

    `dico` is updated in place (keyed by event URI) and also returned.
    """
    selected_events = df_events[df_events.nbSubEvent >= 10].eventKG.values
    counter = 0
    for idx, event in enumerate(selected_events):
        dates_df = result[idx]
        name = event.rsplit("/", 1)[-1]

        if dates_df.shape[0] != 0:
            start = dates_df["min"].values[0]
            end = dates_df["max"].values[0]
        elif name in manual_dates:
            # No dates from the endpoint: fall back on the manually curated ones.
            start, end = manual_dates.get(name).get("start"), manual_dates.get(name).get("end")
        else:
            start, end = None, None

        if not (start and end):
            counter += 1
            print(f"Dates for {name} could not be found")
            continue

        # Gold standard and referents are stored as file names, not full paths.
        dico[event] = {
            "start": event,
            "start_date": start,
            "end_date": end,
            "gold_standard": f"{name}.csv",
            "referents": f"{name}.json",
            "name_exp": name,
        }

    print(f"{counter} events could not be processed further")
    return dico


dico_config = store_changing_config(dico={})

Dates for French_legislative_election,_1945 could not be found
Dates for United_States_House_of_Representatives_elections,_2014 could not be found
Dates for United_States_Senate_elections,_1992 could not be found
Dates for United_States_Senate_elections,_1998 could not be found
Dates for United_States_Senate_elections,_2004 could not be found
Dates for United_States_Senate_elections,_2016 could not be found
Dates for United_States_elections,_2014 could not be found
Dates for United_States_Senate_elections,_2000 could not be found
Dates for United_States_Senate_elections,_2012 could not be found
Dates for United_States_Senate_elections,_2014 could not be found
Dates for United_States_Senate_elections,_2008 could not be found
Dates for United_States_Senate_elections,_2018 could not be found
Dates for United_States_House_of_Representatives_elections,_2016 could not be found
Dates for United_States_Senate_elections,_2006 could not be found
Dates for Irish_local_elections,_2014 could not be

## 3. Prepare configuration files for sweep for each event

"predicate_filter": ["http://dbpedia.org/ontology/wikiPageWikiLink",
                         "http://dbpedia.org/ontology/wikiPageRedirects",
                         "http://dbpedia.org/ontology/wikiPageDisambiguates",
                         "http://www.w3.org/2000/01/rdf-schema#seeAlso",
                         "http://xmlns.com/foaf/0.1/depiction",
                         "http://xmlns.com/foaf/0.1/isPrimaryTopicOf",
                         "http://dbpedia.org/ontology/thumbnail",
                         "http://dbpedia.org/ontology/wikiPageExternalLink",
                         "http://dbpedia.org/ontology/wikiPageID",
                         "http://dbpedia.org/ontology/wikiPageLength",
                         "http://dbpedia.org/ontology/wikiPageRevisionID",
                         "http://dbpedia.org/property/wikiPageUsesTemplate",
                         "http://www.w3.org/2002/07/owl#sameAs",
                         "http://www.w3.org/ns/prov#wasDerivedFrom",
                         "http://dbpedia.org/ontology/wikiPageWikiLinkText",
                         "http://dbpedia.org/ontology/wikiPageOutDegree",
                         "http://dbpedia.org/ontology/abstract",
                         "http://www.w3.org/2000/01/rdf-schema#comment",
                         "http://www.w3.org/2000/01/rdf-schema#label"],

In [64]:
# Predicates listed under the "predicate_filter" key of the base configuration.
_PREDICATE_FILTER = [
    "http://www.w3.org/2000/01/rdf-schema#label",
    "http://schema.org/sameAs",
    "http://schema.org/alternateName",
    "http://www.w3.org/2000/01/rdf-schema#comment",
    "http://www.w3.org/2000/01/rdf-schema#seeAlso",
    "http://www.w3.org/ns/prov#wasDerivedFrom",
    "http://schema.org/image",
    "http://schema.org/about",
]

# Template configuration shared by all experiments. The event-specific keys
# ("start", "start_date", "end_date", "gold_standard", "referents", "name_exp")
# only hold placeholder values here and are overridden for each event.
BASE_CONFIG = {
    "rdf_type": {"event": "http://schema.org/Event"},
    "predicate_filter": _PREDICATE_FILTER,
    "start": "http://dbpedia.org/resource/WTA_Tier_II_tournaments",
    "start_date": "1990-01-01",
    "end_date": "2008-12-31",
    "iterations": 5,
    "type_ranking": "pred_object_freq",
    "type_interface": "hdt",
    "gold_standard": "./data/gs_events/events_WTA_Tier_II_tournaments.csv",
    "referents": "./data/referents/referents_WTA_Tier_II_tournaments.json",
    "type_metrics": ["precision", "recall", "f1"],
    "ordering": {"domain_range": 1},
    "filtering": {"what": 1, "where": 1, "when": 1, "who": 1},
    "name_exp": "wta_tier_ii_tournament",
    "dataset_type": "yago",
    "dataset_path": "yago-2020-02-24",
}

In [65]:
# Write one JSON configuration file per event that has an entry in `dico_config`.
for event in df_events[df_events.nbSubEvent >= 10].eventKG.values:
    if event not in dico_config:
        continue
    name = event.split("/")[-1]
    # Merge into a fresh dict instead of calling `BASE_CONFIG.update(...)`: the
    # shared template stays untouched, so re-running this cell (or reading
    # BASE_CONFIG afterwards) never sees values left over from the last event.
    # Key order, and therefore the JSON output, is the same as with `update`.
    event_config = {**BASE_CONFIG, **dico_config[event]}
    config_path = os.path.join(FOLDER_SAVE_DATA, "config", f"{name}.json")
    with open(config_path, "w", encoding='utf-8') as openfile:
        json.dump(event_config, openfile, indent=4)