# Wikidataparsing

In [None]:
from wikidataparsing.wikidataparsing import DARES

from model.model import SyntacticIndex, SemanticIndex

from utils.utils import balanceRelationDataset, loadCorpus, prepare_corpus
from collections import Counter
import json 

import pandas as pd

%load_ext autoreload
%autoreload 2

In [None]:
# Download the spaCy pipelines for English and French (large and transformer variants).
# NOTE(review): presumably only the pipeline passed to DARES as `nlp_model` is required — confirm.
!python -m spacy download en_core_web_lg
!python -m spacy download fr_core_news_lg
!python -m spacy download en_core_web_trf
!python -m spacy download fr_dep_news_trf


# Build the DARES dataset

In [None]:
# Language code of the corpus to build: 'en' or 'fr'.
lg = 'en'
# lg = 'fr'

# spaCy pipeline to use for each supported language, so that switching `lg`
# also switches the model (the English model used to be hard-coded).
# Transformer alternatives: 'en_core_web_trf' / 'fr_dep_news_trf'.
nlp_models = {
    'en': 'en_core_web_lg',
    'fr': 'fr_core_news_lg',
}

# Number of cores passed to the collection / processing steps below.
n_core = 6

wp = DARES(lg=lg, nlp_model=nlp_models[lg])

In [None]:
# Single entity type: collect relations attached to humans (Q5).
dict_rel = [
    {
        # Item type / Property to search for in the WhatLinksHere pages (e.g. Q5, 'human')
        "type": "Q5",
        # free-form label for that Item / Property (e.g. 'human')
        "name": "human",
        # relations to collect from Wikidata / Wikipedia:
        # key = Wikidata Property identifier (PXX), value = the label you choose for it
        "props": {
            "P19": "placeOfBirth",
            "P569": "dateOfBirth",
            "P509": "causeOfDeath",
            "P570": "dateOfDeath",
            "P119": "placeOfBurial",
            "P26": "spouse",
            "P106": "occupation",
            "P69": "educatedAt",
        },
    },
]

projectname = "Q5"

wp.initiate_project(projectname, dict_rel)



In [None]:
# Example of collecting data for several entity types at once (here, person and location).
# Each Property ID appears exactly once per "props" mapping: a repeated key in a
# dict literal is silently collapsed by Python (the last value wins).
dict_rel = [
    {
        "type": 'Q5',
        "name": "person",
        "props": {
            "P19": 'placeOfBirth',
            "P569": "dateOfBirth",
            "P509": 'causeOfDeath',
            "P570": "dateOfDeath",
            "P119": "placeOfBurial",
            "P26": "spouse",
            "P106": "occupation",
            "P69": "educatedAt",
        },
    },
    {
        "type": "Property:P625",
        "name": "location",
        'props': {
            "P571": 'inception',
            "P17": "country",
            "P1082": "population",
            "P1376": "capitalOf",
            "P276": "location",
            "P36": "capital",
            "P35": "headOfState",
            # NOTE(review): label is spelled "headOfGoverment" (sic); left unchanged
            # because it is runtime data — confirm before renaming.
            "P6": "headOfGoverment",
            "P47": "sharesBordersWith",
            "P463": "memberOf",
            "P206": "nextInBodyWater",
        },
    },
]

projectname = 'test3'

wp.initiate_project(projectname, dict_rel)



In [None]:
# total number of pages to collect
limit = 1
# limit = 10

# number of steps between two saves of the collected entity ids to disk
save_step = 20

# number of ids to collect per page
m_size = 10

# m_size = 50

# collect the list of Wikidata entities for the entity types declared in dict_rel
wp.collect_Wikidata_links(dict_rel, limit, m_size, save_step, n_core)

In [None]:
# NOTE(review): presumably selects where the text of each entity is taken from — confirm
source_doc = 'wikipedia'

# similarity threshold for the distant supervision step
score_cutoff = 95

# NOTE(review): presumably controls whether sentences labelled "Other" are collected — confirm
# getOther = False
getOther = True
# NOTE(review): meaning of this flag is not visible from this notebook — TODO confirm
maxsizesent = True

# process the collected entities with the settings above
wp.processListEntities(source_doc=source_doc, score_cutoff=score_cutoff, getOther=getOther, maxsizesent=maxsizesent, n_core=n_core)


# Shortest Dependency Path

In [None]:

# remove sentences where there is no entity match
removeNoMatch = True

# NOTE: no list of Property IDs to keep is passed here; per the original note, every Property is kept

# extract the shortest dependency paths; the result is kept in `corpus`
corpus = wp.extract_sdp(removeNoMatch=removeNoMatch, n_core=n_core)


In [None]:
# display the extracted corpus
corpus

In [None]:
# Distribution of the relation labels in the dataset, most frequent first.
list_prop = [
    relation['prop']
    for document in corpus
    for sentence in document['content']
    for relation in sentence['props']
]
Counter(list_prop).most_common()

In [None]:
from glob import glob 


# Use this cell if you want to define the type of the target entities involved in each Relation.
# Maps a Wikidata Property ID to the type assigned to the target entity of that relation.
_TARGET_TYPE_BY_PROP = {
    'P19': 'Property:P625',    # placeOfBirth
    'P569': 'time',            # dateOfBirth
    'P570': 'time',            # dateOfDeath
    'P26': 'Q5',               # spouse
    'P106': 'Misc',            # occupation
    'P69': 'Property:P625',    # educatedAt
    'P17': 'Property:P625',    # country
    'P571': 'time',            # inception
    'P1376': 'Property:P625',  # capitalOf
    'P36': 'Property:P625',    # capital
    'P6': 'Q5',                # headOfGoverment
    'P47': 'Property:P625',    # sharesBordersWith
    'P463': 'Property:P625',   # memberOf
    'P206': 'Property:P625',   # nextInBodyWater
}


def replaceEntType(dict_prop:dict) -> dict:
    """Set ``dict_prop['target_type']`` according to ``dict_prop['prop']``.

    The dictionary is modified in place and also returned. When the Property
    ID has no entry in the table above, the dictionary is returned untouched.

    Raises:
        KeyError: if ``dict_prop`` has no ``'prop'`` key.
    """
    prop_id = dict_prop['prop']

    # Only string IDs can match; the guard keeps unhashable values from raising.
    if isinstance(prop_id, str) and prop_id in _TARGET_TYPE_BY_PROP:
        dict_prop['target_type'] = _TARGET_TYPE_BY_PROP[prop_id]

    return dict_prop

# Rewrite every corpus JSON file in place, adding a 'target_type' to each relation.
# `project` was never defined in this notebook, so the loop raised a NameError.
# NOTE(review): assumes the corpus lives under wp.project_path (the path given to
# loadCorpus further down) — confirm.
project = wp.project_path

for filepath in glob(f"{project}/corpus/**/*.json", recursive=True):
    print(filepath)
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    try:
        for sent in data['content']:
            for prop in sent['props']:
                # replaceEntType mutates the dict in place
                replaceEntType(prop)
        # Serialize before reopening the file, so a failure cannot leave a
        # truncated file on disk.
        serialized = json.dumps(data, indent=4)
    except (KeyError, TypeError) as err:
        # Best effort: report files that do not have the expected structure and move on.
        print(f"{filepath}: skipped ({err!r})")
        continue

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(serialized)

# Build the Indices

In [None]:
# set clean = True if you want to remove the sentences annotated as Other
clean = True
# load the corpus stored under the project path
corpus = loadCorpus(wp.project_path, clean=clean)
# display the corpus size and its first element
len(corpus), corpus[0]

In [None]:
# Run this cell to convert the corpus into the format used for building the indices.
# It also allows splitting the corpus into train, dev and test sets.
params = {
    # corpus to process
    "corpus": corpus,
    # size of the train set; 1 means the whole corpus
    "train_size": 1,
    # size of the dev set
    "dev_size": 0,
    # remove Other labels
    "clean": True,
    # "maxsize": 200000
}
data = prepare_corpus(**params)

# Report the split sizes. When a dev / test split is absent (missing key or
# empty value), fall back to the train set only. Only these expected failures
# are caught, so real errors are no longer hidden.
try:
    print(len(data['X_train']), len(data['X_dev']), len(data['X_test']))
except (KeyError, TypeError):
    print(len(data['X_train']), len(data['y_train']))

# Label distribution of the train set, rendered as a LaTeX table.
c = Counter(data['y_train'])
print(pd.Series(dict(c.most_common())).to_frame().to_latex())

In [None]:
# Run this cell to build the Syntactic Index from the training graphs.
syntactic_index_params = {
    # data to use for building the index
    "list_graphs": data['X_train'],
    # surface form of the predicates
    "anchor_textvalue": ['lemma', 'pos'],
    # which graph to use as lexico-syntactic pattern
    "graphkey": 'sdpgraph',
    # key to use for the relation label
    "propkey": 'prop',
    # minimum support for each relation per pattern
    "support": 0,
    # relation definitions declared earlier in the notebook
    "dict_rel": dict_rel,
    # NOTE(review): presumably the directory where the index is saved — confirm
    "savepath": wp.project_path
}
syntacticIndex = SyntacticIndex()
syntacticIndex.trainSyntacticIndex(**syntactic_index_params)
# display the resulting index
syntacticIndex.syntacticIndex

In [None]:
# Run this cell to build the Semantic Index from the training graphs.
semantic_index_params = {
    # data to use for building the index
    "list_graphs": data['X_train'],
    # reuse the 'anchor_textvalue' setting stored on the Syntactic Index
    "textvalue":  syntacticIndex.syntacticIndexParams['anchor_textvalue'],
    # relation definitions declared earlier in the notebook
    "dict_rel": dict_rel,
    # NOTE(review): presumably drops proper nouns (PROPN) when building the index — confirm
    "removePROPN": True,
    # NOTE(review): presumably the directory where the index is saved — confirm
    "savepath": wp.project_path

}
base_semanticIndex = SemanticIndex()
base_semanticIndex.trainSemanticIndex(**semantic_index_params)
# display the resulting index
base_semanticIndex.semanticIndex