# EventKG statistics

Exploring EventKG using:
- Dask for reading and filtering 
- parquet format for saving
- pandas for operations

Ran using:
- pandas==1.4.2
- dask==2022.3.0
- fastparquet==0.8.1

In [None]:
import os
import pandas as pd
import dask.dataframe as dd
import plotly.express as px

In [None]:
# Folder holding the EventKG `.nq` dump files read below (relative to the working directory).
EVENTKG_FOLDER = './eventkg/'

In [None]:
def sep_col(x_content):
	"""Return the second " <"-delimited piece of the stripped input.

	The input is stripped, then split on the literal " <". If the split yields
	at least two pieces the second one is returned; otherwise "" is returned.
	"""
	pieces = x_content.strip().split(" <")
	if len(pieces) < 2:
		return ""
	return pieces[1]

def basic_preprocess(x_content):
	"""Delete every "<" from the text and trim surrounding whitespace."""
	without_brackets = x_content.replace("<", "")
	return without_brackets.strip()

def process_object(x_content):
	"""Clean the part of the stripped input that precedes the first " <".

	Anything from the first " <" onwards is discarded; the remaining head is
	passed through `basic_preprocess`.
	"""
	head = x_content.strip().partition(" <")[0]
	return basic_preprocess(head)

def read_nq(folder=None, path=None, preprocess=True):
	"""Load N-Quads file(s) into a dask DataFrame by splitting each line on '>'.

	Parameters
	----------
	folder : str, optional
		Directory whose `*.nq` files are all read. Used in preference to `path`
		when both are given.
	path : str, optional
		File path handed to `dd.read_csv` when `folder` is not given.
	preprocess : bool, default True
		When True, clean the subject/predicate/object/meta columns and return
		only those four. When False, return all raw split columns.

	Returns
	-------
	dask DataFrame

	Raises
	------
	ValueError
		If neither `folder` nor `path` is provided.
	"""
	if not folder and not path:
		raise ValueError("Either `folder` or `path` must be specified")

	to_read = f'{folder}/*.nq' if folder else path

	# Lines the CSV parser rejects as malformed are skipped (`on_bad_lines='skip'`).
	df = dd.read_csv(
		to_read,
		sep='>',
		names=["subject", "predicate", "object", "meta", "."],
		on_bad_lines='skip',
	)

	if not preprocess:
		return df[df.columns]

	def _resolve_meta(row):
		# When the 4th field is only the terminating ".", take the value from
		# the object field instead (the piece after its " <").
		if row.meta.strip() != '.':
			return basic_preprocess(row.meta)
		return sep_col(row.object)

	df["subject"] = df["subject"].apply(basic_preprocess, meta=('subject', 'str'))
	df["predicate"] = df["predicate"].apply(basic_preprocess, meta=('predicate', 'str'))
	# `meta` reads the raw object text, so it is derived before `object` is cleaned.
	df["meta"] = df[["object", "meta"]].apply(_resolve_meta, meta=('meta', 'str'), axis=1)
	df["object"] = df["object"].apply(process_object, meta=('object', 'str'))

	return df[["subject", "predicate", "object", "meta"]]

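## 0. Filtering out wrongly encoded data
Malformed lines are dropped at read time: `read_nq` passes `on_bad_lines='skip'` to `dd.read_csv`, so no separate filtering step is needed here.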

## 1. Extract sem:hasSubEvent triples
Relevant eventkg file: `relations_base.nq`

In [None]:
# Load relations_base.nq through `read_nq` (dask DataFrame with cleaned subject/predicate/object/meta columns).
df_relations_base = read_nq(path=os.path.join(EVENTKG_FOLDER, 'relations_base.nq'))

In [None]:
# Dump the parsed relations as CSV.
# NOTE(review): the output is CSV even though the target is named `.nq`, and no
# visible cell reads it back — confirm this export is still needed.
df_relations_base.to_csv("relations_base.nq")

In [None]:
df_relations_base.head(5)

In [None]:
# Keep only the sem:hasSubEvent triples (subject = parent event, object = sub-event).
df_sem = df_relations_base[df_relations_base.predicate == "http://semanticweb.cs.vu.nl/2009/11/sem/hasSubEvent"]

In [None]:
# Persist the filtered triples to parquet; later cells reload this file with pandas.
df_sem.to_parquet('hasSubEvent.parquet')

## 2. Distribution of event, sem:hasSubEvent
prefix sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>

In [None]:
# Reload the hasSubEvent triples as a pandas DataFrame.
df_sem_subevent = pd.read_parquet("hasSubEvent.parquet")

In [None]:
df_sem_subevent.head(5)

In [None]:
# Count the sub-events (`object`) attached to each (predicate, subject) pair, i.e. per parent event.
grouped = df_sem_subevent.groupby(['predicate', 'subject']).object.count()
grouped

In [None]:
# Save the per-event sub-event counts to CSV.
grouped.to_csv("event_has_sub_event_grouped.csv")

In [None]:
# Re-load the counts from CSV so `predicate` and `subject` come back as ordinary columns.
# NOTE: `grouped` is rebound here from the groupby result to this flat DataFrame.
grouped = pd.read_csv("event_has_sub_event_grouped.csv")
grouped.object.describe()

In [None]:
# `object` holds the number of sub-events per event (the groupby count), so the
# axis is relabelled and the figure titled to stand on its own.
fig = px.histogram(
    grouped,
    x='object',
    nbins=100,
    title='Distribution of the number of sub-events per event',
    labels={'object': 'Number of sub-events'},
)
fig.show()

## 3. Linking EventKG events to DBpedia
Relevant eventkg file: `events.nq`

In [None]:
# Load events.nq through `read_nq` (dask DataFrame with cleaned subject/predicate/object/meta columns).
df_events = read_nq(path=os.path.join(EVENTKG_FOLDER, 'events.nq'))

In [None]:
# Events that have at least one sub-event (the subjects of the hasSubEvent counts).
coarser_events = grouped.subject.values
# Keep only triples about those events whose object is a DBpedia URI.
df_events_filtered = df_events[(df_events.subject.isin(coarser_events)) & \
                               (df_events.object.str.startswith("http://dbpedia.org/"))]

In [None]:
# Write the filtered events to parquet so they can be reloaded with pandas.
df_events_filtered.to_parquet("coarser_events_eventkg.parquet")

In [None]:
# Reload the filtered events with pandas. From here on `df_events_filtered` is a
# pandas DataFrame, no longer the dask frame built from `df_events`.
df_events_filtered = pd.read_parquet("coarser_events_eventkg.parquet")
print(df_events_filtered.shape)
df_events_filtered.head(5)

## 4. Thresholding for selection/visualisation

In [None]:
grouped[['subject', 'predicate', 'object']]

In [None]:
# Attach the DBpedia object(s) found for each event to its sub-event count.
# Left join: events without a DBpedia row are kept and get "" in `dbpedia`.
# NOTE(review): an event with several DBpedia rows will appear once per row — confirm this is intended.
grouped_with_dbpedia = pd.merge(
    left=grouped[['subject', 'predicate', 'object']],
    right=df_events_filtered[['subject', 'object']].rename(columns={"object": "dbpedia"}),
    on='subject', how='left'
).fillna("")
grouped_with_dbpedia.head(10)

In [None]:
# Save the table ordered by decreasing sub-event count.
grouped_with_dbpedia.sort_values(by='object', ascending=False).to_csv('info_sub_events_events.csv')

In [None]:
# Show the 20 largest events among those with more than 300 sub-events.
# NOTE(review): 300 is a hard-coded threshold — consider naming it as a constant.
grouped_with_dbpedia[grouped_with_dbpedia.object > 300].sort_values(by='object', ascending=False).head(20)

## 5. Extracting ground truth
Find every `?e` such that `(event, sem:hasSubEvent, ?e)` holds.

### One example

In [None]:
def get_eventkg_uri(df_mapped, dbpedia_event):
    """Return the EventKG URI mapped to a DBpedia resource.

    Parameters
    ----------
    df_mapped : pandas.DataFrame
        Table with at least a `subject` column (EventKG URI) and a `dbpedia`
        column (DBpedia URI).
    dbpedia_event : str
        DBpedia URI to look up.

    Returns
    -------
    The `subject` value of the first row whose `dbpedia` equals `dbpedia_event`.

    Raises
    ------
    IndexError
        If no row matches `dbpedia_event`.
    """
    matches = df_mapped[df_mapped.dbpedia == dbpedia_event].subject.values
    if len(matches) == 0:
        # Same exception type as indexing an empty array, but with a message
        # that names the resource that could not be found.
        raise IndexError(f"No EventKG event is mapped to DBpedia resource {dbpedia_event!r}")
    return matches[0]

# Worked example: resolve the EventKG URI mapped to DBpedia's Cold_War resource.
eventkg_uri_ex = get_eventkg_uri(df_mapped=grouped_with_dbpedia,
                                 dbpedia_event='http://dbpedia.org/resource/Cold_War')
eventkg_uri_ex

In [None]:
# Load the hasSubEvent triples and preview those whose parent is the example event.
# NOTE(review): `df` is a generic name that later cells rebind to unrelated data.
df = pd.read_parquet("hasSubEvent.parquet")
df[df.subject == eventkg_uri_ex].head(3)

In [None]:
# Distinct sub-events of the example event.
sub_events_ex = df[df.subject == eventkg_uri_ex].object.unique()

# Triples in events.nq about those sub-events whose object is a DBpedia URI (dask selection).
df_sub_events_filtered = df_events[(df_events.subject.isin(sub_events_ex)) & \
                               (df_events.object.str.startswith("http://dbpedia.org/"))]

In [None]:
# Evaluate the dask selection into an in-memory result.
df_sub_events_filtered = df_sub_events_filtered.compute()

In [None]:
df_sub_events_filtered

### All

In [None]:
# Every URI that appears as a sub-event, and all events.nq triples whose subject is one of them.
all_sub_events = df.object.unique()
df_all_sub_events_filtered = df_events[df_events.subject.isin(all_sub_events)]

In [None]:
# Evaluate the dask selection into an in-memory result.
df_all_sub_events_filtered = df_all_sub_events_filtered.compute()

In [None]:
# Save all sub-event triples to CSV.
df_all_sub_events_filtered.to_csv("all_sub_events.csv")

In [None]:
# Keep only the owl:sameAs triples of the sub-events (links to equivalent resources).
kg_eq = df_all_sub_events_filtered[df_all_sub_events_filtered.predicate == "http://www.w3.org/2002/07/owl#sameAs"]
kg_eq.head(10)

In [None]:
# `type_uri` is the scheme + host of each linked URI, i.e. the first three
# "/"-separated parts (e.g. "http://dbpedia.org").
# `assign` returns a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
kg_eq = kg_eq.assign(type_uri=kg_eq.object.apply(lambda x: '/'.join(x.split('/')[:3])))
kg_eq.head(5)

In [None]:
# For each (parent event, sub-event) pair, attach the sub-event's sameAs targets.
# The right-hand frame is re-keyed so its `subject` (the sub-event) joins on the
# left frame's `object`; pairs without any sameAs link get "" after the left join.
grouped_with_dbpedia_sub_event = pd.merge(
    left=df_sem_subevent[['subject', 'predicate', 'object']],
    right=kg_eq[['subject', 'object', 'type_uri']].rename(
        columns={"subject": "object", "object": "kg_object"}),
    on='object', how='left'
).fillna("")
grouped_with_dbpedia_sub_event[['subject', 'object', 'kg_object', 'type_uri']].head(5)

In [None]:
grouped_with_dbpedia_sub_event.groupby(['subject', 'type_uri']).agg({'object': 'count'})

In [None]:
# Count rows per (parent event, linked-KG prefix), then move both index levels
# back into columns: the first `reset_index(level=0)` pops `subject`, the second
# pops `type_uri`.
test = grouped_with_dbpedia_sub_event \
    .groupby(['subject', 'type_uri']) \
        .agg({'object': 'count'}) \
            .reset_index(level=0).reset_index(level=0)
test

In [None]:
# Keep only the counts whose links point to the "http://dbpedia.org" host, largest first.
TYPE_URI = "http://dbpedia.org"
only_en_dbpedia = test[test.type_uri == TYPE_URI].sort_values(by='object', ascending=False)
only_en_dbpedia

In [None]:
# `df_events_filtered` may already be a pandas DataFrame (it is re-read from
# parquet earlier in the notebook), and pandas frames have no `.compute()`.
# Only materialise when it is still a dask collection.
df_events_filtered_pd = (
    df_events_filtered.compute()
    if hasattr(df_events_filtered, "compute")
    else df_events_filtered
)

In [None]:
# Attach each parent event's own DBpedia object(s) to its DBpedia-linked
# sub-event count; parents without one get "" after the left join.
test2 = pd.merge(
    left=only_en_dbpedia[['subject', 'object']],
    right=df_events_filtered_pd[['subject', 'object']].rename(columns={"object": "dbpedia"}),
    on='subject', how='left'
).fillna("")
test2.head(10)

In [None]:
test2.head(1).subject.values

In [None]:
# Rows where the parent event has a DBpedia link.
test2[test2.dbpedia != '']

In [None]:
# Spot check: select every events.nq triple about a single EventKG event.
one_event = ['http://eventKG.l3s.uni-hannover.de/resource/event_58557']
df_one_event = df_events[(df_events.subject.isin(one_event))]

In [None]:
# Evaluate the dask selection into an in-memory result.
df_one_event = df_one_event.compute()

In [None]:
df_one_event

In [None]:
# Same event looked up in the DBpedia-filtered table.
# NOTE(review): the URI literal repeats the one stored in `one_event`.
df_events_filtered_pd[df_events_filtered_pd.subject == 'http://eventKG.l3s.uni-hannover.de/resource/event_58557']

In [None]:
import pandas as pd
import plotly.express as px

# Histogram (in percent) of the `nbSubEvent` column.
# NOTE(review): "dbpedia-sub-events.csv" is not written by any visible cell of
# this notebook — document where it comes from. `df` is also rebound here.
df = pd.read_csv("dbpedia-sub-events.csv")
fig = px.histogram(df, x='nbSubEvent', histnorm='percent')
fig.show()


In [None]:
import plotly.graph_objects as go

import numpy as np

# Cumulative distribution (in percent) of the number of sub-events.
x = df.nbSubEvent.values
fig = go.Figure(data=[go.Histogram(x=x, cumulative_enabled=True, histnorm='percent')])

fig.show()

In [1]:
from src.hdt_interface import HDTInterface
import pandas as pd

# Project-local interface from src.hdt_interface (presumably queries an HDT dump — confirm).
interface = HDTInterface()
# Request parameters: triples whose subject is the 2016 Summer Olympics resource.
params = {"subject": "http://dbpedia.org/resource/2016_Summer_Olympics"}

# Predicates that later cells exclude from the fetched triples (`~isin(PREDICATE)`)
# and pass as `predicate=` to the interface.
PREDICATE = ["http://dbpedia.org/ontology/wikiPageWikiLink",
                    "http://dbpedia.org/ontology/wikiPageRedirects",
                    "http://dbpedia.org/ontology/wikiPageDisambiguates",
                    "http://www.w3.org/2000/01/rdf-schema#seeAlso",
                    "http://xmlns.com/foaf/0.1/depiction",
                    "http://xmlns.com/foaf/0.1/isPrimaryTopicOf",
                    "http://dbpedia.org/ontology/thumbnail",
                    "http://dbpedia.org/ontology/wikiPageExternalLink",
                    "http://dbpedia.org/ontology/wikiPageID",
                    "http://dbpedia.org/ontology/wikiPageLength",
                    "http://dbpedia.org/ontology/wikiPageRevisionID",
                    "http://dbpedia.org/property/wikiPageUsesTemplate",
                    "http://www.w3.org/2002/07/owl#sameAs",
                    "http://www.w3.org/ns/prov#wasDerivedFrom",
                    "http://dbpedia.org/ontology/wikiPageWikiLinkText",
                    "http://dbpedia.org/ontology/wikiPageOutDegree",
                    "http://dbpedia.org/ontology/abstract",
                    "http://www.w3.org/2000/01/rdf-schema#comment",
                    "http://www.w3.org/2000/01/rdf-schema#label"]

In [2]:
triples = interface.run_request(params=params, filter_pred=[], filter_keep=False)


def filter_f(x):
    """Accept DBpedia URIs and any value that is neither an "http…" URI nor a quoted literal."""
    if x.startswith("http://dbpedia.org/"):
        return True
    return not x.startswith(("http", '"'))


# A triple is kept only when both its object (index 2) and its subject (index 0) pass the filter.
triples = [elt for elt in triples if filter_f(elt[2]) and filter_f(elt[0])]

In [3]:
def triple_to_df(triples):
    """Build a DataFrame with `subject`, `predicate` and `object` columns.

    Each element of `triples` is indexed at positions 0, 1 and 2 to fill the
    three columns, in that order.
    """
    column_names = ('subject', 'predicate', 'object')
    columns = {
        name: [triple[position] for triple in triples]
        for position, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

In [4]:
# Tabulate the fetched triples, drop the rows whose predicate is in PREDICATE,
# and list the predicates that remain.
df = triple_to_df(triples=triples)
df = df[~df.predicate.isin(PREDICATE)]
df.predicate.unique()

array(['http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
       'http://dbpedia.org/property/cauldron',
       'http://dbpedia.org/property/stadium',
       'http://dbpedia.org/property/summerNext',
       'http://dbpedia.org/property/summerPrev',
       'http://dbpedia.org/property/winterNext',
       'http://dbpedia.org/property/winterPrev',
       'http://purl.org/linguistics/gold/hypernym',
       'http://purl.org/dc/terms/subject'], dtype=object)

In [5]:
# Save the remaining triples for manual inspection.
df.to_csv("analysis.csv")

In [21]:
# Call the interface on one node; it returns three results unpacked as
# `ingoing`, `outgoing` and `spec` (used as DataFrames in later cells).
ingoing, outgoing, spec = interface(
    node="http://dbpedia.org/resource/Érick_Barrondo",
    predicate=PREDICATE)

0it [00:00, ?it/s]
100%|██████████| 8/8 [00:00<00:00, 40.46it/s]


In [18]:
# NOTE(review): `filtering` and `dates` are created in a cell that appears
# further down in this notebook; run that cell first on a fresh kernel.
# Copy the selected rows so the cast below does not write into a slice of
# `spec` (the source of pandas' SettingWithCopyWarning).
date_df = spec[spec.predicate.isin(filtering.temporal)].copy()
date_df["object"] = date_df["object"].astype(str)

filtering.get_to_discard_regex(ingoing, outgoing, dates)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df.object = date_df.object.astype(str)


['http://dbpedia.org/resource/2010_Commonwealth_Games',
 "http://dbpedia.org/resource/Shooting_at_the_2014_Asian_Games_–_Men's_10_metre_air_rifle",
 "http://dbpedia.org/resource/Shooting_at_the_2014_Asian_Games_–_Men's_10_metre_air_rifle_team",
 "http://dbpedia.org/resource/Shooting_at_the_2008_Summer_Olympics_–_Men's_10_metre_air_rifle",
 "http://dbpedia.org/resource/Shooting_at_the_2014_Commonwealth_Games_–_Men's_10_metre_air_rifle",
 "http://dbpedia.org/resource/Shooting_at_the_2010_Asian_Games_–_Men's_10_metre_air_rifle_team"]

In [15]:
import re
def regex_helper(val, default_return_val):
    """Return the first run of four consecutive digits found in `val`.

    When `val` contains no such run, `default_return_val` is returned unchanged.
    """
    found = re.search("\\d{4}", val)
    if found is None:
        return default_return_val
    return found.group(0)

In [16]:
# Add a `regex_helper` column: the first 4-digit run in each subject URI, or "2016" when none is found.
# NOTE(review): "2016" is hard-coded — presumably the year of the event under study; confirm.
ingoing['regex_helper']= ingoing["subject"].apply(lambda x: regex_helper(x, "2016"))
ingoing

Unnamed: 0,subject,predicate,object,type_df,regex_helper
0,http://dbpedia.org/resource/Sushil_Kumar,http://dbpedia.org/property/after,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2016
1,http://dbpedia.org/resource/2010_Commonwealth_...,http://dbpedia.org/property/athlete'sOath,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2010
2,http://dbpedia.org/resource/A_Shot_at_History,http://dbpedia.org/property/author,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2016
3,http://dbpedia.org/resource/Manpreet_Singh_(fi...,http://dbpedia.org/property/before,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2016
4,http://dbpedia.org/resource/Mary_Kom,http://dbpedia.org/property/before,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2016
5,http://dbpedia.org/resource/Shooting_at_the_20...,http://dbpedia.org/property/bronze,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2014
6,http://dbpedia.org/resource/Shooting_at_the_20...,http://dbpedia.org/property/bronze,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2014
7,http://dbpedia.org/resource/India_at_the_2016_...,http://dbpedia.org/property/flagbearer,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2016
8,http://dbpedia.org/resource/Shooting_at_the_20...,http://dbpedia.org/property/gold,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2008
9,http://dbpedia.org/resource/Shooting_at_the_20...,http://dbpedia.org/property/gold,http://dbpedia.org/resource/Abhinav_Bindra,ingoing,2014


In [22]:
ingoing.subject.values

array([], dtype=float64)

In [23]:
outgoing.object.values

array(['http://dbpedia.org/resource/Juan_Ignacio_Maegli',
       'http://dbpedia.org/resource/Alta_Verapaz_Department',
       'http://dbpedia.org/resource/Guatemala',
       'http://dbpedia.org/resource/San_Cristóbal_Verapaz',
       'http://dbpedia.org/resource/2016_Summer_Olympics',
       'http://dbpedia.org/ontology/Athlete',
       'http://dbpedia.org/ontology/Agent',
       'http://dbpedia.org/ontology/Person'], dtype=object)

In [19]:
from src.filtering import Filtering

# Build the project-local filter with the what/where/when options set to 1.
filtering = Filtering(args={"what": 1, "where": 1, "when": 1})
# Two date strings covering the year 2016 (presumably the start and end of the window — confirm).
dates = ["2016-01-01", "2016-12-31"]

# Apply the filter to the node's ingoing / outgoing / spec tables; the result is shown in the next cell.
to_discard = filtering(ingoing, outgoing, spec, dates)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df.object = date_df.object.astype(str)


In [20]:
to_discard

['http://dbpedia.org/resource/Zirakpur',
 'http://dbpedia.org/resource/India',
 'http://dbpedia.org/resource/Punjab,_India',
 'http://dbpedia.org/resource/Uttarakhand',
 'http://dbpedia.org/resource/Dehradun',
 "http://dbpedia.org/resource/Shooting_at_the_2008_Summer_Olympics_–_Men's_10_metre_air_rifle",
 "http://dbpedia.org/resource/Shooting_at_the_2014_Asian_Games_–_Men's_10_metre_air_rifle",
 'http://dbpedia.org/resource/2010_Commonwealth_Games',
 "http://dbpedia.org/resource/Shooting_at_the_2010_Asian_Games_–_Men's_10_metre_air_rifle_team",
 "http://dbpedia.org/resource/Shooting_at_the_2014_Asian_Games_–_Men's_10_metre_air_rifle_team",
 "http://dbpedia.org/resource/Shooting_at_the_2014_Commonwealth_Games_–_Men's_10_metre_air_rifle"]

In [None]:
spec.subject.values

In [None]:
to_discard

In [None]:
# Save `spec` for manual inspection.
# NOTE(review): an earlier cell writes a different frame to the same "analysis.csv"
# path, so this overwrites it — confirm that is intended.
spec.to_csv("analysis.csv")

In [None]:
# All temporal predicates known to the filter: point dates, start dates and end dates.
temporal = filtering.dates + filtering.start_dates + filtering.end_dates
# Copy the selected rows so the cast below does not write into a slice of
# `spec` (the source of pandas' SettingWithCopyWarning).
date_df = spec[spec.predicate.isin(temporal)].copy()
date_df["object"] = date_df["object"].astype(str)

In [None]:
# Rows whose `regex_helper` value is greater than the year part of the end date (`dates[1][:4]`).
# NOTE(review): `dates[1][:4]` is a string, so this is a string comparison if
# `regex_helper` holds strings. No visible cell adds `regex_helper` to `spec` — confirm where it is set.
spec[spec.regex_helper > dates[1][:4]]

In [None]:
# NOTE(review): an earlier cell calls this method with three positional arguments
# (ingoing, outgoing, dates); confirm which signature src.filtering currently exposes.
filtering.get_to_discard_regex(df_pd=spec, dates=dates)