# Wikidataparsing

In [12]:
from src.dares import DARES

from src.model import ELIJERE

from src.utils import loadCorpus, prepare_corpus
from collections import Counter
# import json 

# import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
!python -m spacy download en_core_web_lg
!python -m spacy download fr_core_news_lg
!python -m spacy download en_core_web_trf
!python -m spacy download fr_dep_news_trf


In [2]:
config_project = {
    "project_name": "Q5",
    "lg": "en",
    "spacy_model": "en_core_web_lg",
    "n_core": 6,
    "dares_parameters": {
        "item_limit": 1,
        "item_save_step": 20,
        "items_per_pages": 10,
        "source_doc": 'wikipedia',
        "score_cutoff": 95,
        "getOther": True,
        "maxsizesent": True,
        "removeNoMatch": True
    },
    "elijere_parameters": {
        "anchor_textvalue": ['lemma', 'pos'],
        "support": 0,
        "removePROPN": True
    },
    "entities":[
        {
            # specifiy by what Item type / Property must search in the WhatLinksHere pages (e.g. Q5 ('human'))
            "type": 'Q5',
            # you can provide a label for the Item / Property (e.g. 'human')
            "label": "human",
            # indicate the set of relations you want to collect from Wikidata / Wikipedia
            "props":{
                # PXX are the identifier of a Property on Wikidata
                # you define the label (e.g. placeOfBirth)
                "P19": {
                    "label": 'placeOfBirth',
                    "source": "Person",
                    "target": "Location"
                },
                "P119": {
                    "label": 'placeOfBurial',
                    "source": "Person",
                    "target": "Location"
                },
                "P569": {
                    "label": 'dateOfBirth',
                    "source": "Person",
                    "target": "Time"
                },
                "P570": {
                    "label": 'dateOfDeath',
                    "source": "Person",
                    "target": "Time"
                },
                "P509": {
                    "label": 'causeOfDeath',
                    "source": "Person",
                    "target": "Misc"
                },
                "P26": {
                    "label": 'spouse',
                    "source": "Person",
                    "target": "Person"
                },
                "P106": {
                    "label": 'occupation',
                    "source": "Person",
                    "target": "Misc"
                },
                "P69": {
                    "label": 'educatedAt',
                    "source": "Person",
                    "target": "Location"
                }
            }      
        }]
}




In [None]:
# below, example when collecting data for multiple entity types (here, person and location)
config_project = {
    [
        {
            "type": 'Q5',
            "name": "person",
            "props":{
                "P19": 'placeOfBirth',
                "P569": "dateOfBirth",
                "P509": 'causeOfDeath',
                "P570": "dateOfDeath",
                "P119": "placeOfBurial",
                "P26": "spouse",
                "P106": "occupation",
                "P69": "educatedAt",
                "P509": "causeOfDeath"
            }      
        },
        {   
            "type": "Property:P625",
            "name": "location",
            'props':{
                "P571": 'inception',
                "P17": "country",
                "P1082": "population",
                "P1376": "capitalOf",
                "P276": "location",
                "P36":"capital",
                "P35": "headOfState",
                "P6": "headOfGoverment",
                "P1082": "population",
                "P47": "sharesBordersWith",
                "P463": "memberOf",
                "P206": "nextInBodyWater"
            }
        }
    ]
}
projectname = f'test3'

wp.initiate_project(projectname, dict_rel)



# Build the DARES dataset

In [49]:
dares_config = config_project.copy()
del dares_config['elijere_parameters']
wp = DARES(**dares_config)


In [51]:
wp.collect_Wikidata_links()

Processing Q5 type...
Starting from first url
Done processing Q5 type
Processing Q5 done
projects/Q5


In [53]:
wp.processListEntities()

Step 1/6
Collecting Entity data...
Processing Q5 type...
Processing Q5 done
Entity data collected
Step 2/6
Collecting content...
Content collected
Step 3/6
Collecting Entity labels...
Entity labels collected
Step 4/6
Collecting properties...
Properties collected
Step 5/6
Collecting sentences...
Sentences collected
Step 6/6
Collecting Other sentences...
Other sentences collected


In [None]:
# extracting shortest dependency path
corpus = wp.extract_sdp()


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


In [55]:
# checks the distribution of relation in dataset
list_prop = [z['prop'] for x in corpus for y in x['content'] for z in y['props']]
# list_prop
Counter(list_prop).most_common()

[('dateOfBirth', 4), ('placeOfBirth', 3), ('dateOfDeath', 2), ('spouse', 1)]

# Build the Indices

In [4]:
# clean = True if you want to removes sentences annotated as Other
clean = True
corpus = loadCorpus(f"projects/{config_project['project_name']}", clean=clean)
len(corpus), corpus[0]

The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


(1,
 {'id': 'Q23',
  'content': [{'sent': "Based on his private papers and on accounts from his contemporaries, Washington slowly developed a cautious sympathy toward abolitionism that ended with his will freeing his long-time valet Billy Lee, and freeing the rest of his personally owned slaves outright upon Martha's death. On January 1, 1801, one year after George Washington's death, Martha Washington signed an order to free his slaves.",
    'sent_i': 16,
    'props': [{'prop': 'spouse',
      'sent': "Based on his private papers and on accounts from his contemporaries, Washington slowly developed a cautious sympathy toward abolitionism that ended with his will freeing his long-time valet Billy Lee, and freeing the rest of his personally owned slaves outright upon Martha's death. On January 1, 1801, one year after George Washington's death, Martha Washington signed an order to free his slaves.",
      'source': 'George Washington',
      'source_type': 'Person',
      'target': 'Mart

In [11]:
# Run this cell to convert the corpus in a format for building the indices
# also, allows to divide the corpus into a train, dev and validation test
params = {
    # corpus to process
    "corpus": corpus,
    # size of the train set. 1 means the whole corpus
    "train_size": 1,
    # size of the dev set
    "dev_size": 0,
    # removes Other labels
    "clean": True
    # "maxsize": 200000
}
data = prepare_corpus(**params)

try:
    print('Train size :',len(data['X_train']), 'Dev size :', len(data['X_dev']), 'Test size :',len(data['X_test']))
except:
    print('Train size :',len(data['X_train']))

c = Counter([y for y in data['y_train']])
print('Classes distribution : ', dict(c.most_common()))


Train size : 10
Classes distribution :  {'dateOfBirth': 4, 'placeOfBirth': 3, 'dateOfDeath': 2, 'spouse': 1}


In [15]:
elijere = ELIJERE()

elijere_parameters = config_project['elijere_parameters']
elijere_parameters['data'] = data
elijere_parameters['savepath'] = f"projects/{config_project['project_name']}"

elijere_parameters

{'anchor_textvalue': ['lemma', 'pos'],
 'support': 0,
 'removePROPN': True,
 'data': {'X_train': [{'sourceNode': [58, 59],
    'targetNode': [63, 64],
    'sourceNodeRoot': 59,
    'targetNodeRoot': 64,
    'sdpgraph': <networkx.classes.digraph.DiGraph at 0x16fa02480>,
    'prop': 'spouse',
    'source': 'George Washington',
    'target': 'Martha Washington',
    'sent': "Based on his private papers and on accounts from his contemporaries, Washington slowly developed a cautious sympathy toward abolitionism that ended with his will freeing his long-time valet Billy Lee, and freeing the rest of his personally owned slaves outright upon Martha's death. On January 1, 1801, one year after George Washington's death, Martha Washington signed an order to free his slaves.",
    'sent_graph': <networkx.classes.digraph.DiGraph at 0x16fa01880>,
    'source_type': 'Person',
    'target_type': 'Person'},
   {'sourceNode': [0, 1],
    'targetNode': [3, 4, 5, 6],
    'sourceNodeRoot': 1,
    'targetNo

In [17]:
elijere.fit(**elijere_parameters)

Building Syntactic Index...
Building Syntactic Index done !
Building Lexical Index...
Building Lexical Index done !


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


In [21]:
elijere.classifier.semanticIndex

CONCEPT-INDEX,dateOfBirth,dateOfDeath,placeOfBirth,spouse
sign_VERB,0.0,0.0,0.0,1.0
after_ADP,0.0,0.0,0.0,1.0
death_NOUN,0.0,0.0,0.0,1.0
1732_NUM,1.0,0.0,0.0,0.0
1799_NUM,0.0,1.0,0.0,0.0
bear_VERB,0.58696,0.0,0.809616,0.0
at_ADP,0.0,0.0,1.0,0.0
in_ADP,0.0,0.0,1.0,0.0
on_ADP,1.0,0.0,0.0,0.0


In [38]:
elijere.extractFacts("Nicolas Gutehrkle was born on February 17, 1995 in Strasbourg")

[{'fact': {'pred': 'dateOfBirth',
   'score': np.float64(0.8100023529280639),
   'rule': 'semantic',
   'anchor': 3,
   'anchortext': 'bear_VERB',
   'candidate': {'nodes': [1, 3, 4, 5, 6],
    'labels': [{'name': 'dateOfBirth', 'support': 1}],
    'graph': <networkx.classes.digraph.DiGraph at 0x3d5db6690>,
    'source_types': ['Person'],
    'source_nodes': [1],
    'target_types': ['Time'],
    'target_nodes': [6],
    'ner_rules': {'dateOfBirth': {'source_type': 'Person',
      'target_type': 'Time'}}}},
  'ner': [{'pred': 'Person',
    'root_node': 1,
    'start': 1,
    'end': 2,
    'char_start': 8,
    'char_end': 17},
   {'pred': 'Time',
    'root_node': 6,
    'start': 6,
    'end': 7,
    'char_start': 39,
    'char_end': 41}]},
 {'fact': {'pred': 'dateOfBirth',
   'score': np.float64(0.7397289709521384),
   'rule': 'semantic',
   'anchor': 3,
   'anchortext': 'bear_VERB',
   'candidate': {'nodes': [1, 10, 3, 9],
    'labels': [{'name': 'dateOfBirth', 'support': 1}],
    'gra