# Wikidata parsing

In [2]:
from src.dares import DARES

from src.model import SyntacticIndex, SemanticIndex

from src.utils import balanceRelationDataset, loadCorpus, prepare_corpus
from collections import Counter
import json 

import pandas as pd

%load_ext autoreload
%autoreload 2

In [None]:
!python -m spacy download en_core_web_lg
!python -m spacy download fr_core_news_lg
!python -m spacy download en_core_web_trf
!python -m spacy download fr_dep_news_trf


# Build the DARES dataset

In [3]:
# language code to select the language of the corpus ('en' or 'fr')
lg = 'en'
# lg = 'fr'

# spaCy pipeline used for each language code.
# Transformer alternatives: 'en_core_web_trf' (en) and 'fr_dep_news_trf' (fr).
nlp_models = {
    'en': 'en_core_web_lg',
    'fr': 'fr_core_news_lg',
}

# specifies number of cores to use
n_core = 6

# Pick the pipeline that matches `lg` so the model always agrees with the
# corpus language; an unsupported code fails fast with a KeyError.
wp = DARES(lg=lg, nlp_model=nlp_models[lg])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Relations to collect, grouped by the entity type they start from.
dict_rel = [
    {
        # Item type / Property to search for in the WhatLinksHere pages (e.g. Q5 ('human'))
        "type": 'Q5',
        # you can provide a label for the Item / Property (e.g. 'human')
        "label": "human",
        # the set of relations you want to collect from Wikidata / Wikipedia
        "props": {
            # PXX are the identifiers of Properties on Wikidata;
            # for each one you define the label (e.g. placeOfBirth) and the
            # types of the source and target entities
            "P19": {
                "label": 'placeOfBirth',
                "source": "Person",
                "target": "Location"
            },
            "P119": {
                "label": 'placeOfBurial',
                "source": "Person",
                "target": "Location"
            },
            "P569": {
                "label": 'dateOfBirth',
                "source": "Person",
                "target": "Time"
            },
            "P570": {
                "label": 'dateOfDeath',
                "source": "Person",
                "target": "Time"
            },
            "P509": {
                "label": 'causeOfDeath',
                "source": "Person",
                "target": "Misc"
            },
            "P26": {
                "label": 'spouse',
                "source": "Person",
                "target": "Person"
            },
            "P106": {
                "label": 'occupation',
                "source": "Person",
                "target": "Misc"
            },
            "P69": {
                "label": 'educatedAt',
                "source": "Person",
                "target": "Location"
            }
        }
    }]


# name of the project to initiate; the collection log below reports `projects/Q5`
projectname = 'Q5'

wp.initiate_project(projectname, dict_rel)



In [None]:
# Example: collecting data for multiple entity types (here, person and location).
# NOTE: running this cell rebinds `dict_rel` (read by the link-collection cell
# further down) and `projectname`, and initiates a second project — skip it
# when working on the single-type project.
# NOTE(review): this example uses the "name" key and plain-string props, while
# the single-type definition uses "label" and {label, source, target} dicts —
# confirm which schema `initiate_project` expects.
dict_rel = [
    {
        "type": 'Q5',
        "name": "person",
        "props": {
            # each Property ID appears once: a repeated key in a dict literal
            # is silently overwritten by the later entry
            "P19": 'placeOfBirth',
            "P569": "dateOfBirth",
            "P509": 'causeOfDeath',
            "P570": "dateOfDeath",
            "P119": "placeOfBurial",
            "P26": "spouse",
            "P106": "occupation",
            "P69": "educatedAt",
        }
    },
    {
        "type": "Property:P625",
        "name": "location",
        'props': {
            "P571": 'inception',
            "P17": "country",
            "P1082": "population",
            "P1376": "capitalOf",
            "P276": "location",
            "P36": "capital",
            "P35": "headOfState",
            "P6": "headOfGovernment",
            "P47": "sharesBordersWith",
            "P463": "memberOf",
            "P206": "nextInBodyWater"
        }
    }
]

projectname = 'test3'

wp.initiate_project(projectname, dict_rel)



In [6]:
# Crawl settings for gathering the Wikidata entity ids.
m_size = 10      # number of ids to collect per page (e.g. 50 for a larger run)
limit = 1        # total number of pages to collect (e.g. 10 for a larger run)
save_step = 20   # step interval at which collected entity ids are saved to disk

# gather the list of Wikidata entities
wp.collect_Wikidata_links(
    dict_rel,
    limit,
    m_size,
    save_step,
    n_core,
)

Processing Q5 type...
Starting from first url
Done processing Q5 type
Processing Q5 done
projects/Q5


In [7]:
# Settings for processing the collected entities.
source_doc = 'wikipedia'
score_cutoff = 95    # similarity threshold for the distant supervision step
getOther = True      # alternative: False
maxsizesent = True

wp.processListEntities(
    source_doc=source_doc,
    score_cutoff=score_cutoff,
    getOther=getOther,
    maxsizesent=maxsizesent,
    n_core=n_core,
)


Step 1/6
Collecting Entity data...
Processing Q5 type...
Processing Q5 done
Entity data collected
Step 2/6
Collecting content...
Content collected
Step 3/6
Collecting Entity labels...
Entity labels collected
Step 4/6
Collecting properties...
Properties collected
Step 5/6
Collecting sentences...
Sentences collected
Step 6/6
Collecting Other sentences...
Other sentences collected


# Shortest Dependency Path

In [13]:

# drop the sentences in which there is no entity match
removeNoMatch = True

# no Property filter is passed to this call
corpus = wp.extract_sdp(
    n_core=n_core,
    removeNoMatch=removeNoMatch,
)


In [14]:
# Distribution of relation labels in the dataset: one entry per annotated
# relation, gathered from the `props` of every `content` item in the corpus.
list_prop = [
    relation['prop']
    for entry in corpus
    for sentence in entry['content']
    for relation in sentence['props']
]
Counter(list_prop).most_common()

[('dateOfBirth', 4), ('placeOfBirth', 3), ('dateOfDeath', 2), ('spouse', 1)]

# Build the Indices

In [15]:
# Reload the corpus from the project directory.
# Set clean to True to remove the sentences annotated as Other.
clean = True
corpus = loadCorpus(
    wp.project_path,
    clean=clean,
)
# show the corpus size together with its first entry
(len(corpus), corpus[0])

The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


(1,
 {'id': 'Q23',
  'content': [{'sent': "Based on his private papers and on accounts from his contemporaries, Washington slowly developed a cautious sympathy toward abolitionism that ended with his will freeing his long-time valet Billy Lee, and freeing the rest of his personally owned slaves outright upon Martha's death. On January 1, 1801, one year after George Washington's death, Martha Washington signed an order to free his slaves.",
    'sent_i': 16,
    'props': [{'prop': 'spouse',
      'sent': "Based on his private papers and on accounts from his contemporaries, Washington slowly developed a cautious sympathy toward abolitionism that ended with his will freeing his long-time valet Billy Lee, and freeing the rest of his personally owned slaves outright upon Martha's death. On January 1, 1801, one year after George Washington's death, Martha Washington signed an order to free his slaves.",
      'source': 'George Washington',
      'source_type': 'Person',
      'target': 'Mart

In [16]:
# Run this cell to convert the corpus into the format used for building the indices.
# It also allows dividing the corpus into train, dev and test sets.
params = {
    # corpus to process
    "corpus": corpus,
    # size of the train set. 1 means the whole corpus
    "train_size": 1,
    # size of the dev set
    "dev_size": 0,
    # removes Other labels
    "clean": True
    # "maxsize": 200000
}
data = prepare_corpus(**params)

# Report the split sizes. When the dev / test splits are unavailable (missing
# key or non-sized value), fall back to the sizes of the training inputs and
# labels. Only those two failures are handled, so any other error surfaces
# instead of being silently swallowed.
try:
    print(len(data['X_train']), len(data['X_dev']), len(data['X_test']))
except (KeyError, TypeError):
    print(len(data['X_train']), len(data['y_train']))

# distribution of the relation labels in the training set
c = Counter(data['y_train'])
dict(c.most_common())

10 10


{'dateOfBirth': 4, 'placeOfBirth': 3, 'dateOfDeath': 2, 'spouse': 1}

In [18]:
# Build the Syntactic Index from the training data.
syntactic_index_params = dict(
    # data to use for building the index
    list_graphs=data['X_train'],
    # surface form of the predicates
    anchor_textvalue=['lemma', 'pos'],
    # which graph to use as lexico-syntactic pattern
    graphkey='sdpgraph',
    # key to use for relation label
    propkey='prop',
    # minimum support for each relation per pattern
    support=0,
    # dict_rel=dict_rel,
    savepath=wp.project_path,
)

syntacticIndex = SyntacticIndex()
syntacticIndex.trainSyntacticIndex(**syntactic_index_params)

# display the trained index
syntacticIndex.syntacticIndex

{'bear_VERB': [{'graph': <networkx.classes.digraph.DiGraph at 0x357b75370>,
   'size': 3,
   'props': [{'name': 'dateOfBirth', 'support': 1}],
   'ambiguous': 0,
   'source_types': ['Person'],
   'source_nodes': [1],
   'target_types': ['Time'],
   'target_nodes': [5],
   'ner_rules': {'dateOfBirth': {'source_type': 'Person',
     'target_type': 'Time'}},
   'i': 1},
  {'graph': <networkx.classes.digraph.DiGraph at 0x357b74bc0>,
   'size': 4,
   'props': [{'name': 'dateOfBirth', 'support': 1}],
   'ambiguous': 0,
   'source_types': ['Person'],
   'source_nodes': [1],
   'target_types': ['Time'],
   'target_nodes': [8],
   'ner_rules': {'dateOfBirth': {'source_type': 'Person',
     'target_type': 'Time'}},
   'i': 2},
  {'graph': <networkx.classes.digraph.DiGraph at 0x3578bb320>,
   'size': 5,
   'props': [{'name': 'placeOfBirth', 'support': 3}],
   'ambiguous': 0,
   'source_types': ['Person'],
   'source_nodes': [1],
   'target_types': ['Location'],
   'target_nodes': [15],
   'ner_ru

In [19]:
# Build the Semantic Index from the same training data.
semantic_index_params = dict(
    list_graphs=data['X_train'],
    # reuse the text value the Syntactic Index was trained with
    textvalue=syntacticIndex.syntacticIndexParams['anchor_textvalue'],
    # dict_rel=dict_rel,
    removePROPN=True,
    savepath=wp.project_path,
)

base_semanticIndex = SemanticIndex()
base_semanticIndex.trainSemanticIndex(**semantic_index_params)

# display the trained index
base_semanticIndex.semanticIndex

CONCEPT-INDEX,dateOfBirth,dateOfDeath,placeOfBirth,spouse
sign_VERB,0.0,0.0,0.0,1.0
after_ADP,0.0,0.0,0.0,1.0
death_NOUN,0.0,0.0,0.0,1.0
1732_NUM,1.0,0.0,0.0,0.0
1799_NUM,0.0,1.0,0.0,0.0
bear_VERB,0.58696,0.0,0.809616,0.0
at_ADP,0.0,0.0,1.0,0.0
in_ADP,0.0,0.0,1.0,0.0
on_ADP,1.0,0.0,0.0,0.0
