**Wikidata matching.**

Match Wikibase and Wikidata data to find title/authority links.

In [1]:

import pathlib, pandas, pydash, datetime, json, numpy
from IPython.display import clear_output
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

"""
Import data from Wikibase export and format into a suitable state for matching.
"""

def glean(row, col, element):
    """
    Pull a single nested value out of one dataframe row.

    The object stored at `row[col]` is handed to `pydash.get` with `element`
    as the path, and whatever that lookup yields is returned unchanged.
    """
    container = row[col]
    found = pydash.get(container, element)
    return found

def glean_multi(row, col, element):
    """
    Collect one nested value from every entry stored under a claim.

    Note that `col` is not a dataframe column here: it is the path looked up
    inside `row['claims']`. Each entry found there is queried with
    `pydash.get(entry, element)`.

    Returns a list with one result per entry, or None when the lookup of
    `col` in `row['claims']` yields None.
    """
    # Look the claim up once and reuse the result for the iteration.
    statements = pydash.get(row['claims'], col)
    if statements is None:
        return None
    return [pydash.get(statement, element) for statement in statements]

# The Wikibase export is read from a `data` directory that sits beside the
# current working directory.
data_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '1_wikibase_instance'
source_data = pandas.read_json(data_path / 'complete_export_cleaned.json', orient='records')
# `struct` is used below to translate readable names (e.g. 'director') into
# the keys/ids found in the export.
with open(data_path / 'prebuilt.json') as prebuilt:
    struct = json.load(prebuilt)

# Flatten the nested label/claim structures into plain columns.
source_data['simple_label'] = source_data.apply(glean, col='labels', element='en.value', axis=1)
source_data['instance'] = source_data.apply(glean, col='claims', element='P1.0.mainsnak.datavalue.value.id', axis=1)
source_data['country'] = source_data.apply(glean_multi, col=struct['country of origin'], element='mainsnak.datavalue.value.id', axis=1)
source_data['director'] = source_data.apply(glean_multi, col=struct['director'], element='mainsnak.datavalue.value.id', axis=1)
source_data['year'] = source_data.apply(glean_multi, col=struct['year'], element='mainsnak.datavalue.value.time', axis=1)
source_data['title'] = source_data.apply(glean_multi, col=struct['title'], element='mainsnak.datavalue.value.text', axis=1)

# One row per director/year/country combination.
for x in ['director', 'year', 'country']:
    source_data = source_data.explode(x)

# Lookup from entity id to English label, built in a single pass. Rows that
# were repeated by the explode above simply overwrite their own entry.
translator = dict(zip(source_data['id'], source_data['simple_label']))

# Explicit copy so the column assignments below act on an independent frame
# rather than on a slice of `source_data`.
is_film = source_data.instance.isin([struct['cinematographic work']])
film_data = source_data.loc[is_film, ['id', 'simple_label', 'director', 'year', 'country']].copy()
# Drop the first character of the time string and keep the next four.
film_data['year'] = film_data['year'].str[1:5]
# Keep the raw ids before `replace` swaps them for labels. Beware the naming:
# `director_label` ends up holding the director's id and `director` the
# readable name, whereas `country_id` holds the id and `country` the name.
film_data['director_label'] = film_data['director']
film_data = film_data.replace({'director':translator})
film_data['country_id'] = film_data['country']
film_data = film_data.replace({'country':translator})

print(len(film_data))
film_data.head()


510


Unnamed: 0,id,simple_label,director,year,country,director_label,country_id
8,Q3,Soldiers Of The Cross,Herbert Booth,1900,Australia,Q581,Q2
8,Q3,Soldiers Of The Cross,Joseph Perry,1900,Australia,Q604,Q2
9,Q4,The Story Of The Kelly Gang,Charles Tait,1906,Australia,Q527,Q2
10,Q5,Eureka Stockade,Arthur Cornwell,1907,Australia,Q503,Q2
10,Q5,Eureka Stockade,George Cornwell,1907,Australia,Q568,Q2


In [2]:

"""
Import Wikidata data for matching.
"""

# The Wikidata extract is read from the `data` directory beside the current
# working directory.
data_path = pathlib.Path.cwd().resolve().parent.joinpath('data', '2_wikidata_extract')
wiki = pandas.read_csv(data_path.joinpath('wikidata.csv'), low_memory=False)

# Row count first, then a preview of the frame as the cell's output.
print(len(wiki))
wiki.head()


6932393


Unnamed: 0,item,title,country,year,prop,label,link
0,Q59811424,Buffalo Running,United Kingdom,1883,P57,Eadweard Muybridge,Q190568
1,Q59811424,Buffalo Running,United States of America,1883,P57,Eadweard Muybridge,Q190568
2,Q11766965,Man Walking Around a Corner,United Kingdom,1887,P57,Louis Le Prince,Q421675
3,Q11766965,Man Walking Around a Corner,United Kingdom,1887,P344,Louis Le Prince,Q421675
4,Q267176,Roundhay Garden Scene,England,1888,P57,Louis Le Prince,Q421675


In [5]:


"""
Match data between the two datasets. This is done by identifying shared filmography "fingerprints".
"""

def levenshtein(other_data, column, score, source_value):

    """
    This is the string matching function,
    which finds all matches between datasets which are over the threshold score.

    Parameters:
        other_data: dataframe to filter.
        column: name of the column in `other_data` holding the strings to compare.
        score: minimum scorer value (inclusive) for a string to be kept.
        source_value: iterable of query strings.

    Returns:
        The rows of `other_data` whose `column` value matched at least one query.
    """

    # The list of choices does not depend on the query, so build it once
    # instead of once per query.
    choices = list(other_data[column].unique())

    gloss = set()
    for val in source_value:
        # NOTE(review): limit=10000 caps the results per query, so matches
        # beyond the 10000 best-scoring ones are dropped even if they reach
        # `score` - confirm this cap is intended.
        produce = process.extract(val, choices, limit=10000, scorer=fuzz.token_sort_ratio)
        gloss.update(x for x, n in produce if n >= score)
    return(other_data.loc[other_data[column].isin(gloss)])

# Column order shared by every per-director candidate file.
candidate_columns = ['dir_id', 'dir_lab', 'film_id', 'film_lab', 'cand_id', 'cand_lab',
                     'cand_score', 'cand_film_id', 'cand_film_lab', 'score']

film_data = film_data[['director_label', 'director', 'id', 'simple_label', 'year', 'country']]
film_data = film_data.sort_values(by=['director'])
film_director_list = [x for x in list(film_data.director_label.unique()) if x is not None]

# Create the output directory once, up front, instead of re-checking it on
# every iteration.
data_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '3_matching' / 'authority'
data_path.mkdir(parents=True, exist_ok=True)

# Defined before the loop so the summary at the end still works when there
# are no directors to process.
candidate_dataframe = pandas.DataFrame(columns=candidate_columns)

commencer = datetime.datetime.now()
for n, x in enumerate(film_director_list):
    time_to_finish = ((((datetime.datetime.now()-commencer)/(n+1))*(len(film_director_list)))+commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'Processing: {n+1} of {len(film_director_list)}; eta {time_to_finish}.')
    clear_output(wait=True)

    data = film_data.loc[film_data.director_label.isin([x])]
    director_films = list(zip(list(data.id), list(data.simple_label), list(data.year), list(data.country)))
    # The column naming is swapped in `film_data`: `director_label` holds the
    # director's id and `director` holds the readable name.
    director_id = list(data.director_label)[0]
    director_label = list(data.director)[0]
    candidates = levenshtein(wiki, 'label', 60, [director_label])

    # Rows are gathered in a plain list and turned into a dataframe once per
    # director, rather than enlarging a dataframe one row at a time.
    rows = []
    for c in list(candidates.link.unique()): # these are the candidates one by one
        # The candidate's rows are the same for every film, so select and
        # prepare them once. The copy keeps the assignment below off a slice
        # of `candidates`.
        this_candidate = candidates.loc[candidates.link.isin([c])].copy()
        this_candidate['year'] = this_candidate['year'].astype('str')
        candidate_id = list(this_candidate.link)[0]
        candidate_label = list(this_candidate.label)[0]
        candidate_name_score = fuzz.token_sort_ratio(director_label, candidate_label)
        for film_id, film_title, film_year, film_country in director_films:
            # NOTE(review): range() excludes its end value, so this window is
            # the film year and the year before it - confirm that year+1 was
            # not also meant to be included.
            year_window = [str(y) for y in range(int(film_year)-1, int(film_year)+1)]
            res = this_candidate.loc[this_candidate.year.isin(year_window)].copy()
            if len(res):
                # Score every candidate title in the window and keep the best one.
                res['title_score'] = [fuzz.token_sort_ratio(title, film_title) for title in res['title']]
                res = res.sort_values(by='title_score', ascending=False)
                rows.append([director_id, director_label, film_id, film_title,
                             candidate_id, candidate_label, candidate_name_score,
                             list(res['item'])[0], list(res['title'])[0], list(res['title_score'])[0]])
            else:
                # No film by this candidate in the year window: record a zero score.
                rows.append([director_id, director_label, film_id, film_title,
                             candidate_id, candidate_label, candidate_name_score,
                             None, None, 0])

    candidate_dataframe = pandas.DataFrame(rows, columns=candidate_columns)
    candidate_dataframe.to_csv(data_path / f'{x}.csv', index=False)

# Summary for the last director processed.
print(len(candidate_dataframe))
candidate_dataframe.head()


108


Unnamed: 0,dir_id,dir_lab,film_id,film_lab,cand_id,cand_lab,cand_score,cand_film_id,cand_film_lab,score
0,Q704,Yves Allegret,Q352,The Restless And The Damned,Q3133457,Herbert Heyes,62,,,0
1,Q704,Yves Allegret,Q352,The Restless And The Damned,Q12314711,Gyda Aller,61,,,0
2,Q704,Yves Allegret,Q352,The Restless And The Damned,Q17620547,Alberto Traversa,62,,,0
3,Q704,Yves Allegret,Q352,The Restless And The Damned,Q1973276,Albert Ray,61,,,0
4,Q704,Yves Allegret,Q352,The Restless And The Damned,Q91886,Albert Mayer,64,,,0


In [9]:

"""
Once all possible matches are returned they are filtered down to trusted results,
based on rating of film title match against number of film titles.
"""

data_path = pathlib.Path.cwd().resolve().parents[0] / 'data' / '3_matching' / 'authority'
# Only the per-director CSV files are wanted, so anything else in the
# directory is ignored. Directory iteration order is arbitrary; sorting makes
# the processing order (and therefore the row order of `matches`) reproducible.
authority_files = sorted(data_path.glob('*.csv'))

# Accumulator for the accepted rows of every authority file.
matches = pandas.DataFrame()

def georg_function(row, data):
    """
    Rate one candidate row as trusted ('yes') or not ('no').

    All rows of `data` sharing this row's `cand_id` are gathered and their
    `score` values averaged. The bar depends on how many such rows exist:

        1 row   -> average and row['cand_score'] must both be >= 90
        2 rows  -> both must be >= 85
        3 rows  -> both must be >= 80
        other   -> only the average is checked, against 80
    """
    same_candidate = data.loc[data.cand_id.isin([row['cand_id']])]
    film_scores = list(same_candidate.score)
    average = numpy.mean(film_scores)

    # Shared threshold for the average and the name score, keyed by row count.
    strict_thresholds = {1: 90, 2: 85, 3: 80}
    threshold = strict_thresholds.get(len(film_scores))

    if threshold is None:
        trusted = average >= 80
    else:
        trusted = average >= threshold and row['cand_score'] >= threshold

    return 'yes' if trusted else 'no'
        
# Accepted rows are gathered per file and concatenated once at the end,
# instead of re-copying `matches` on every iteration.
frames = [matches]
for x in authority_files:
    auth = pandas.read_csv(x)
    if auth.empty:
        # A header-only file has nothing to rate, and a row-wise apply on an
        # empty frame does not produce a column that can be assigned.
        continue
    auth['correct'] = auth.apply(georg_function, data=auth, axis=1)
    auth = auth.loc[auth.correct.isin(['yes'])]
    # Each file covers one director, so at most one candidate may be accepted.
    if len(auth.cand_id.unique()) > 1:
        raise Exception('multiples found here')
    frames.append(auth)

# Reversed so the row order is the same as prepending each file's rows in turn.
matches = pandas.concat(frames[::-1])

print(len(matches))
matches.head()


409


Unnamed: 0,dir_id,dir_lab,film_id,film_lab,cand_id,cand_lab,cand_score,cand_film_id,cand_film_lab,score,correct
261,Q690,Tom Cowan,Q403,The Office Picnic,Q7815438,Tom Cowan,100,Q7754723,The Office Picnic,100,yes
262,Q690,Tom Cowan,Q431,Promised Woman,Q7815438,Tom Cowan,100,Q7249802,Promised Woman,100,yes
263,Q690,Tom Cowan,Q481,Journey Among Women,Q7815438,Tom Cowan,100,Q6296331,Journey Among Women,100,yes
10,Q684,Tal Ordell,Q240,The Kid Stakes,Q7678712,Tal Ordell,100,Q7744230,The Kid Stakes,100,yes
750,Q653,Peter Weir,Q488,The Last Wave,Q55424,Peter Weir,100,Q1218841,The Last Wave,100,yes


In [20]:

"""
Matching wikibase/wikidata links reduced and exported.
"""

# Director links: one wikibase/wikidata id pair per accepted director.
director_links = (
    matches[['dir_id', 'cand_id']]
    .rename(columns={'dir_id':'wikibase', 'cand_id':'wikidata'})
    .drop_duplicates()
)

# Film links: only rows whose title score is above 75 are kept.
title_matching = (
    matches.copy()
    .loc[lambda frame: frame.score > 75, ['film_id', 'cand_film_id']]
    .rename(columns={'film_id':'wikibase', 'cand_film_id':'wikidata'})
    .drop_duplicates()
)

accepted_results = pandas.concat([director_links, title_matching])

# Every identifier may appear only once on either side of the mapping.
total_links = len(accepted_results)
ids_are_unique = all(
    len(accepted_results[side].unique()) == total_links
    for side in ('wikibase', 'wikidata')
)
if not ids_are_unique:
    raise Exception('Identifier matching must return one-for-one relationships.')

data_path = pathlib.Path.cwd().resolve().parent.joinpath('data', '3_matching')
accepted_results.to_csv(data_path.joinpath('wikibase_wikidata_links.csv'), index=False)
print(len(accepted_results))
accepted_results.head()


532


Unnamed: 0,wikibase,wikidata
261,Q690,Q7815438
10,Q684,Q7678712
750,Q653,Q55424
347,Q647,Q7172711
124,Q519,Q449220
