# EventKG - Retrieving events for experiments

The aim of the notebook is to automatically retrieve events for the search experiment, with other information like start and end dates.

Before running the notebook, make sure you have the following:
* EventKG downloaded and preprocessed, cf. `eventkg-filtering.ipynb`
* Subset of EventKG loaded in [GraphDB](https://graphdb.ontotext.com)
* GraphDB endpoint active (repository name: `eventkg`)

In [None]:
import io
import os
import json
import yaml
import requests
import psutil
from tqdm import tqdm

import ray
from ray.util.multiprocessing.pool import Pool
import pandas as pd
from settings import FOLDER_PATH
from src.hdt_interface import HDTInterface

## 0. Entering and loading variables

In [None]:
# <TO-DO: change if necessary>
DATASET = "yago"
FOLDER_SAVE_DATA = os.path.join(FOLDER_PATH, "data-all")

# SPARQL endpoint of the GraphDB repository queried throughout the notebook.
ENDPOINT = "http://localhost:7200/repositories/eventkg"

# Name of the HDT folder used for each supported dataset.
DATASET_TO_FOLDER = {
    "wikidata": "wikidata-2021-03-05",
    "dbpedia": "dbpedia-snapshot-2021-09",
    "yago": "yago-2020-02-24"
}

# 0/1 flags forwarded to HDTInterface further down.
nested_dataset = int(DATASET != "wikidata")
filter_kb = int(DATASET == "dbpedia")

In [None]:
# URI prefix used in the SPARQL filters to keep only the selected dataset's
# `owl:sameAs` targets.
DATASET_TO_START_URI = dict(
    dbpedia="http://dbpedia",
    wikidata="http://www.wikidata",
    yago="http://yago",
)

# Ask the endpoint for CSV results so they can be fed to `pd.read_csv`.
HEADERS = {"Accept": "text/csv"}

# Physical (non-logical) core count, used to size the worker pools.
# NOTE(review): psutil documents that this can be None when the count cannot
# be determined -- confirm the pools behave as expected in that case.
NB_CPUS = psutil.cpu_count(logical=False)

In [None]:
# Creating folders if necessary.
# `os.makedirs` also creates the missing parents (FOLDER_SAVE_DATA and
# FOLDER_SAVE_DATA/DATASET), and `exist_ok=True` makes the cell safe to re-run
# without a racy "check, then create" sequence.
for folder in ["config", "gs_events", "referents", "other"]:
    path_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, folder)
    os.makedirs(path_folder, exist_ok=True)

In [None]:
# Loading params for search
config_path = os.path.join(FOLDER_PATH, "dataset-config", f"{DATASET}.yaml")
with open(config_path, encoding='utf-8') as config_file:
    # NOTE(review): FullLoader is acceptable for a trusted, repo-local file;
    # prefer yaml.safe_load if this config could ever come from elsewhere.
    dataset_config = yaml.load(config_file, Loader=yaml.FullLoader)


# Build the HDTInterface for the selected dataset from the loaded config and
# the dataset-dependent flags defined earlier.
interface = HDTInterface(
    dataset_config=dataset_config,
    default_pred=[],
    folder_hdt=DATASET_TO_FOLDER[DATASET],
    nested_dataset=nested_dataset,
    filter_kb=filter_kb,
)

## 1. Retrieving events with the most sub events

Using SPARQL Query + GraphDB endpoint

In [None]:
# SPARQL query counting, for each event URI of the selected dataset, the
# distinct sub events (reached through `sem:hasSubEvent*`) whose time span
# overlaps the event's own span; results are sorted by decreasing count.
# The `<dataset_start_uri>` placeholder is substituted right after the literal.
QUERY_RETRIEVE_EVENTS = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?eventKG (COUNT(DISTINCT ?subEventKG) as ?nbSubEvent)
WHERE {
    
 ?event sem:hasSubEvent* ?subEvent .
 ?event sem:hasBeginTimeStamp ?startTimeEvent .
 ?event sem:hasEndTimeStamp ?endTimeEvent .
 ?event owl:sameAs ?eventKG .

 ?subEvent owl:sameAs ?subEventKG .
 ?subEvent sem:hasBeginTimeStamp ?startTimeSubEvent .
 ?subEvent sem:hasEndTimeStamp ?endTimeSubEvent .
 
 FILTER (?endTimeSubEvent >= ?startTimeEvent) .
 FILTER (?startTimeSubEvent <= ?endTimeEvent) .
 FILTER( strStarts( str(?eventKG), "<dataset_start_uri>" ) ) .
 FILTER( strStarts( str(?subEventKG), "<dataset_start_uri>" ) ) .
}
GROUP BY ?eventKG
ORDER BY DESC(?nbSubEvent)
""".replace('<dataset_start_uri>', DATASET_TO_START_URI[DATASET])


In [None]:
# Run the ranking query against the GraphDB endpoint (CSV response requested
# through HEADERS).
response = requests.get(ENDPOINT, headers=HEADERS,
                        params={"query": QUERY_RETRIEVE_EVENTS})
# Stop here on an HTTP error instead of letting the next cell parse an error
# page as if it were the CSV result.
response.raise_for_status()

In [None]:
# Decode the CSV body returned by the endpoint, load it into a DataFrame and
# preview the first rows.
csv_payload = response.content.decode('utf-8')
df_events = pd.read_csv(io.StringIO(csv_payload))
df_events.head(3)

## 2. Retrieving info for each selected event

* Ground truth events from EventKG 
* Referents (URI mapping)
* Start/End dates


### 2.1. Ground truth for each event

Ground truth = the events that are part of (i.e. sub events of) the given event in EventKG

In [None]:
# Cache the ranked events on disk so the expensive query need not be re-run.
save_path = os.path.join(
    FOLDER_SAVE_DATA, DATASET, "other", "events_sub_events.csv"
)
df_events.to_csv(path_or_buf=save_path)

In [None]:
# Reload the cached events and discard the index column that `to_csv` wrote
# (read back by pandas as "Unnamed: 0").
df_events = pd.read_csv(save_path)
df_events = df_events.drop(columns=["Unnamed: 0"], errors="ignore")

print(f"# of events: {df_events.shape[0]}")
# The selection threshold is inclusive (>= 10), so the label says "at least".
print(f"# of events with at least 10 sub events: {df_events[df_events.nbSubEvent >= 10].shape[0]}")

In [None]:
# Preview the first rows of the reloaded events table.
df_events.head(5)

In [None]:
# SPARQL template listing the distinct sub events (same time-overlap and URI
# prefix filters as the ranking query) of one event. `event-to-replace` is
# filled in per event later; `<dataset_start_uri>` is substituted right away.
# The result column is always named `linkDBpediaEn`, whatever the dataset.
QUERY_GROUND_TRUTH_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT(?subEventKG as ?linkDBpediaEn)
WHERE {
    
?event owl:sameAs <event-to-replace> .
?event sem:hasSubEvent* ?subEvent .
?subEvent owl:sameAs ?subEventKG .
    
?event sem:hasBeginTimeStamp ?startTimeEvent .
?event sem:hasEndTimeStamp ?endTimeEvent .

?subEvent sem:hasBeginTimeStamp ?startTimeSubEvent .
?subEvent sem:hasEndTimeStamp ?endTimeSubEvent .
    
FILTER( strStarts( str(?subEventKG), "<dataset_start_uri>" ) ) .
FILTER (?endTimeSubEvent >= ?startTimeEvent) .
FILTER (?startTimeSubEvent <= ?endTimeEvent) .
}
""".replace('<dataset_start_uri>', DATASET_TO_START_URI[DATASET])

In [None]:
# For every event with at least 10 sub events, fetch its ground-truth sub
# events from the endpoint and store them as gs_events/<name>.csv, where
# <name> is the last path segment of the event URI.
events = df_events[df_events.nbSubEvent >= 10].eventKG.values
gs_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, "gs_events")
for event in tqdm(events):
    name = event.split("/")[-1]
    query = QUERY_GROUND_TRUTH_TEMPLATE.replace(
        "event-to-replace", event
    )
    response = requests.get(ENDPOINT, headers=HEADERS,
                            params={"query": query})
    # A failed query must not be written to disk as a gold-standard file.
    response.raise_for_status()
    pd.read_csv(io.StringIO(response.content.decode('utf-8'))) \
        .to_csv(os.path.join(gs_folder, f"{name}.csv"))

### 2.2. URI referents for each sub event - Only for DBpedia

Because dataset versions differ, the URI of a given resource can vary over time. The aim of this section is to retrieve a unique referent ID for each set of equivalent URIs.


In [None]:
from src.get_equivalent_url import get_equivalent_url

In [None]:
def add_equivalent_url(df_path, save_path, dataset):
    """Run `get_equivalent_url` for one file unless its output already exists.

    Args:
        df_path: forwarded to `get_equivalent_url` as `df_path`.
        save_path: output location; if it already exists nothing is done.
        dataset: forwarded to `get_equivalent_url` as `dataset`.

    Returns:
        None.
    """
    if os.path.exists(save_path):
        # Output already there: skip the computation.
        return
    get_equivalent_url(df_path=df_path, save_path=save_path, dataset=dataset)

In [None]:
# One task per ground-truth CSV: (input csv, output json, dataset).
csv_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, "gs_events")
json_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, "referents")

csv_files = os.listdir(csv_folder)

csv_paths = [os.path.join(csv_folder, csv_file) for csv_file in csv_files]
json_paths = [
    os.path.join(json_folder, f"{os.path.splitext(csv_file)[0]}.json")
    for csv_file in csv_files
]
args = [
    (csv_path, json_path, DATASET)
    for csv_path, json_path in zip(csv_paths, json_paths)
]

# Fan the tasks out over the worker pool; each task is skipped by
# `add_equivalent_url` when its JSON output already exists.
pool = Pool(processes=NB_CPUS)
pool.starmap(add_equivalent_url, args)


### 2.3. Start and End dates of each event

Minimum start date among all start dates, maximum end date among all end dates.

Start date must be before end date.

In [None]:
# SPARQL template returning, for one event, the minimum begin timestamp and
# the maximum end timestamp (columns `min` and `max`). The HAVING clause keeps
# only groups whose maximum end is strictly greater than the minimum start.
# `event-to-replace` is filled in per event; `<dataset_start_uri>` right away.
# NOTE(review): `?subEvent` is bound by the `sem:hasSubEvent*` pattern but not
# used anywhere else in the query, so sub-event dates do not seem to take part
# in the min/max -- confirm this is intended.
QUERY_DATES_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (min(?startTimeEvent) as ?min) (max(?endTimeEvent) as ?max)
WHERE {
    
 ?event owl:sameAs <event-to-replace> .
 ?event sem:hasSubEvent* ?subEvent .
 ?event sem:hasBeginTimeStamp ?startTimeEvent .
 OPTIONAL { ?event sem:hasEndTimeStamp ?endTimeEvent . }
 ?event owl:sameAs ?eventKG .

 FILTER( strStarts( str(?eventKG), "<dataset_start_uri>" ) ) .
}
GROUP BY ?eventKG
HAVING (max(?endTimeEvent) > min(?startTimeEvent))
""".replace('<dataset_start_uri>', DATASET_TO_START_URI[DATASET])

In [None]:
# Try the dates query on one hard-coded YAGO event and display the result.
sample_event = "http://yago-knowledge.org/resource/World_War_II"
query = QUERY_DATES_TEMPLATE.replace("event-to-replace", sample_event)
response = requests.get(ENDPOINT, headers=HEADERS, params={"query": query})
pd.read_csv(io.StringIO(response.content.decode('utf-8')))

In [None]:
def get_dates(event):
    """Query the endpoint for the start/end dates of one event.

    Args:
        event: URI of the event, substituted for `event-to-replace` in
            `QUERY_DATES_TEMPLATE`.

    Returns:
        The CSV response parsed as a pandas DataFrame (columns `min` and
        `max` as selected by the query; no row when the query matches
        nothing).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status, so
            that an error page is never parsed as a (bogus) result table.
    """
    query = QUERY_DATES_TEMPLATE.replace(
        "event-to-replace", event)
    response = requests.get(ENDPOINT, headers=HEADERS,
                            params={"query": query})
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))

In [None]:
# Make sure no Ray session is left over before creating the pool, fetch the
# dates of every event with at least 10 sub events (one DataFrame per event,
# in the same order as the selection), then shut Ray down again.
if ray.is_initialized():
    ray.shutdown()
pool = Pool(processes=NB_CPUS)
events_to_date = df_events[df_events.nbSubEvent >= 10].eventKG.values
result = pool.map(get_dates, events_to_date)
if ray.is_initialized():
    ray.shutdown()

In [None]:
# Inspect the dates table returned for the first selected event.
result[0]

In [None]:
from src.triply_interface import TriplInterface

# From here on `interface` refers to a TriplInterface (the name was bound to
# an HDTInterface earlier in the notebook).
interface = TriplInterface()

def retrieve_date_triply(node):
    """Look up the `dbo:startDate` values of `node` through `interface`.

    Args:
        node: subject passed to `interface.run_request`.

    Returns:
        `(None, None)` when no triple is returned; otherwise the smallest and
        largest third element of the returned triples (presumably the triple
        object -- verify against TriplInterface), both converted to `str` and
        compared as strings.
    """
    triples = interface.run_request(
        params={
            "subject": node,
            "predicate": "http://dbpedia.org/ontology/startDate",
        },
        filter_pred=[],
        filter_keep=False,
    )
    if len(triples) == 0:
        return None, None
    # With a single triple, min and max are simply that one value.
    dates = [str(triple[2]) for triple in triples]
    return min(dates), max(dates)

In [None]:
# Hand-curated start/end dates, keyed by the last path segment of the event
# URI; used as a fallback when the dates query returns no row for an event.
manual_dates = {
    "2014_United_States_elections": {"start": "2014-11-04", "end": "2014-11-04"},
    "2018_United_States_elections": {"start": "2018-11-06", "end": "2018-11-06"},
    "Arab–Israeli_conflict": {"start": "1948-05-15", "end": "2021-12-31"},
    # Year fixed: was "2011-09-11", a transposed-digit typo for 2001.
    "War_on_terror": {"start": "2001-09-11", "end": "2021-12-31"},
    "Iraqi_conflict_(2003–present)": {"start": "2003-03-20", "end": "2021-12-31"},
    "War_on_Terror": {"start": "2001-09-15", "end": "2022-12-31"}
}

In [None]:
def store_changing_config(dico):
    """Add the per-event configuration overrides to `dico` and return it.

    Walks over the events with at least 10 sub events, in the same order as
    the module-level `result` list. Dates come from the matching `result`
    table when it has a row, otherwise from `manual_dates` (the Triply lookup
    is currently disabled). Events without both dates are reported on stdout
    and skipped.

    Args:
        dico: dict to fill in place, keyed by event URI.

    Returns:
        The same `dico` object.
    """
    counter = 0
    selected_events = df_events[df_events.nbSubEvent >= 10].eventKG.values
    for idx, event in enumerate(selected_events):
        dates_df = result[idx]
        name = event.split("/")[-1]

        if dates_df.shape[0] != 0:
            start = dates_df["min"].values[0]
            end = dates_df["max"].values[0]
        elif name in manual_dates:
            start = manual_dates[name].get("start")
            end = manual_dates[name].get("end")
        else:
            start = end = None

        if not (start and end):
            counter += 1
            print(f"Dates for {name} could not be found")
            continue

        # File names are stored relative to the dataset folders, not as
        # absolute paths.
        dico[event] = {
            "start": event,
            "start_date": start,
            "end_date": end,
            "gold_standard": f"{name}.csv",
            "referents": f"{name}.json",
            "name_exp": name,
        }
    print(f"{counter} events could not be processed further")
    return dico

# Event URI -> configuration overrides, for every event whose dates were found.
dico_config = store_changing_config(dico={})

## 3. Prepare configuration files

In [None]:
# Load the example configuration of the selected dataset; it serves as the
# template for every per-event configuration file.
base_config_path = os.path.join(
    FOLDER_PATH, "configs-example", f"config-{DATASET}.json")
with open(base_config_path, "r", encoding="utf-8") as base_config_file:
    BASE_CONFIG = json.load(base_config_file)

In [None]:
# Write one configuration file per event that has an entry in `dico_config`.
config_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, "config")
for event in df_events[df_events.nbSubEvent >= 10].eventKG.values:
    if event not in dico_config:
        continue
    name = event.split("/")[-1]
    # Merge into a fresh dict so the shared BASE_CONFIG template is never
    # mutated and no value can leak from one event's config into the next.
    event_config = {**BASE_CONFIG, **dico_config[event]}
    with open(os.path.join(config_folder, f"{name}.json"), "w", encoding='utf-8') as openfile:
        json.dump(event_config, openfile, indent=4)