In [1]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
# Ignore every warning whose category is FutureWarning from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: every path, column name and service URL used by the
# pipeline cells is defined here.
# NOTE(review): HOME_DIR is an absolute path on one developer's machine; point
# it at your own sample_files directory before running.
# ---------------------------------------------------------------------------
HOME_DIR = '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files'
table_path = f'{HOME_DIR}/faast/brazilian-ports-2021-03-24-corporativo - Planilha1_brazilian-ports-2021-03-24-corporativo - Planilha1.csv'
# File name of the input table (last path component).
f_name = os.path.basename(table_path)
# File name without its extension. Do NOT use f_name.strip(".csv") for this:
# str.strip removes any run of the characters '.', 'c', 's', 'v' from BOTH ends
# (e.g. "cars.csv" -> "ar"), it does not remove a suffix.
f_stem = os.path.splitext(f_name)[0]
wikify_column_name = "subject"
final_score_column = "siamese_prediction"

# Canonicalized table and generated candidates.
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields requested during candidate generation.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
# Scratch folder for the per-query auxiliary files (wiped and recreated below).
temp_dir = f'{HOME_DIR}/temp/temp'

# Feature file plus the model artifacts passed to the `tl` model commands.
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

# Final outputs.
output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
colorized_kg_links = f'{HOME_DIR}/temp/{f_stem}_colorized.xlsx'

# Consolidated auxiliary files, one per entry of aux_field.
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

In [3]:
# Recreate temp_dir from scratch so auxiliary files from a previous run are not picked up.
# The expansion is quoted so a path containing whitespace is passed to rm/mkdir
# as ONE argument instead of being word-split into several (dangerous with rm -rf).
!rm -rf "$temp_dir"
!mkdir -p "$temp_dir"

In [4]:
# Feature column names, in the order they are joined into the `--features`
# argument of `tl predict-using-model` in the Model Prediction section.
features = [
    'pagerank',
    'retrieval_score',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    'singleton',
    'num_char',
    'num_tokens',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    'context_score',
]

### Canonicalize

In [5]:
# Run `tl canonicalize` on the input table for the column named by
# wikify_column_name (with --add-context) and redirect stdout to canonical_file_path.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" \
> "$canonical_file_path"

In [6]:
# Preview: load only the first five rows of the canonical file.
pd.read_csv(filepath_or_buffer=canonical_file_path, nrows=5)

Unnamed: 0,column,row,label,context
0,0,0,Barcarena,SGUF|PA|acessos
1,0,1,São Francisco do Sul,SGUF|SC|acessos
2,0,2,São Luís,SGUF|MA|acessos
3,0,3,Navegantes,SGUF|SC|acessos
4,0,4,Osório,SGUF|RS|acessos


### Candidate Generation

In [7]:
# Candidate generation as one `tl` pipeline (stages are separated by a bare `/`):
#   1. clean: column `label` -> new column `label_clean`, reading canonical_file_path
#   2. get-fuzzy-augmented-matches on `label_clean`
#   3. get-exact-matches on `label_clean`
# Both match stages request the auxiliary fields listed in aux_field and write
# them under temp_dir; the final stdout is redirected to candidate_file_path.
# NOTE(review): the Elasticsearch URL and index are hard-coded here and appear to
# duplicate index_url from the config cell -- confirm they are meant to stay in sync.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

In [8]:
# Maps each auxiliary field to the column name it gets in the consolidated TSV.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
    'context': 'context'
}

# For every auxiliary field, merge all matching TSVs found in temp_dir into a
# single file under {HOME_DIR}/temp, keeping one row per qnode.
for field in aux_field.split(','):
    # Sort the matches: glob order is arbitrary, and drop_duplicates keeps the
    # FIRST occurrence, so an unsorted list makes the surviving row nondeterministic.
    aux_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not aux_paths:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise FileNotFoundError(
            f"No auxiliary files matching '{temp_dir}/*{field}.tsv'; "
            "did the candidate generation cell run successfully?"
        )
    aux_df = (
        pd.concat([pd.read_csv(aux_path, sep='\t') for aux_path in aux_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{HOME_DIR}/temp/{field}.tsv', sep='\t', index=False)

In [9]:
# Preview: load only the first six rows of the candidate file.
pd.read_csv(filepath_or_buffer=candidate_file_path, nrows=6)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",7.578764e-08,24.232975
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q3006790,Cuilapa-Barbarena,,fuzzy-augmented,mountain in Guatemala,4.542504e-09,21.678867
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Oeiras)|Barcarena (Portugal),fuzzy-augmented,civil parish in Oeiras,5.668506e-08,20.110527
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,fuzzy-augmented,human settlement in Portugal,1.049783e-08,20.016058
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q3977874,Baccalena,,fuzzy-augmented,genus of molluscs,6.834578e-09,17.74442
5,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q22027289,Rio Barcarena,,fuzzy-augmented,river in Brazil,3.539613e-09,17.02375


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [10]:
# Feature pipeline over the candidate file (stages separated by a bare `/`):
# align-page-rank, then string-similarity columns monge_elkan, monge_elkan_aliases,
# jaro_winkler, levenshtein and des_cont_jaccard, then normalize-scores on
# des_cont_jaccard, smallest-qnode-number, mosaic-features (num-char / num-tokens)
# and create-singleton-feature. Stdout goes to aligned_pagerank_candidate_file_path.
# Path expansions are quoted so a path containing whitespace stays one argument.
!tl align-page-rank "$candidate_file_path" \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
/ string-similarity -i --method jaro_winkler -o jaro_winkler \
/ string-similarity -i --method levenshtein -o levenshtein \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
/ mosaic-features -c kg_labels --num-char --num-tokens \
/ create-singleton-feature -o singleton \
> "$aligned_pagerank_candidate_file_path"

In [11]:
# Load the full feature file, then show the first rows of the columns that
# matter for the voting classifier (plus method and raw pagerank).
voting_preview_columns = [
    'method',
    'pagerank',
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
features_df[voting_preview_columns].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,7.578764e-08,7.578764e-08,0,1.0,0.0
1,exact-match,3.539613e-09,3.539613e-09,0,1.0,0.0
2,exact-match,1.049783e-08,1.049783e-08,0,1.0,0.0
3,exact-match,5.668506e-08,5.668506e-08,0,1.0,0.0
4,fuzzy-augmented,7.578764e-08,0.0,0,1.0,0.0


##### Generate model-voted candidate results

In [12]:
# Run `tl vote-by-classifier` on the feature file with the model at model_file_path,
# a probability threshold of 0.995 and the four listed feature columns; stdout
# is redirected to model_voted_candidate_file_path.
# Path expansions are quoted so a path containing whitespace stays one argument.
!tl vote-by-classifier "$aligned_pagerank_candidate_file_path" \
--prob-threshold 0.995 \
--model "$model_file_path" \
--features "aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized" \
> "$model_voted_candidate_file_path"

In [13]:
# Load the model-voted candidates and display the first five rows.
model_voted_df = pd.read_csv(filepath_or_buffer=model_voted_candidate_file_path)
model_voted_df.head(5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,monge_elkan_aliases,jaro_winkler,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,exact-match,"municipality of the state of Pará, Brazil",...,0.0,1.0,1.0,0.0,0.0,0,9,1,0,0
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q194721,Barcarena,,exact-match,Wikimedia disambiguation page,...,0.0,1.0,1.0,0.0,0.0,0,9,1,0,0
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,exact-match,human settlement in Portugal,...,0.0,1.0,1.0,0.0,0.0,0,9,1,0,0
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),exact-match,civil parish in Oeiras,...,0.868519,1.0,1.0,0.0,0.0,0,9,1,0,0
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",...,0.0,1.0,1.0,0.0,0.0,0,9,1,0,0


##### Generate graph-embedding-score using centroid-of-lof and lof-strategy

In [14]:
# Run `tl score-using-embedding` on the model-voted candidates with the
# centroid-of-lof column-vector strategy and the ems-mv lof strategy, writing the
# score to column `lof-graph-embedding-score`. Embeddings come from
# graph_embedding_complex_file, with index_url passed as --embedding-url.
# Stdout is redirected to graph_embedding_file_path.
# Expansions are quoted so a value containing whitespace stays one argument.
!tl score-using-embedding "$model_voted_candidate_file_path" \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file "$graph_embedding_complex_file" \
--embedding-url "$index_url" \
> "$graph_embedding_file_path"

Qnodes to lookup: 15307
Qnodes from file: 15069
Qnodes from server: 0
Outlier removal generates 148 lof-voted candidates


In [15]:
# Load the embedding-scored candidates; head() shows the first five rows by default.
score_df = pd.read_csv(filepath_or_buffer=graph_embedding_file_path)
score_df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,exact-match,"municipality of the state of Pará, Brazil",...,1.0,0.0,0.0,0,9,1,0,0,-1,0.705301
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q194721,Barcarena,,exact-match,Wikimedia disambiguation page,...,1.0,0.0,0.0,0,9,1,0,0,-1,0.394649
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,exact-match,human settlement in Portugal,...,1.0,0.0,0.0,0,9,1,0,0,-1,0.49701
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),exact-match,civil parish in Oeiras,...,1.0,0.0,0.0,0,9,1,0,0,-1,0.60197
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",...,1.0,0.0,0.0,0,9,1,0,0,-1,0.705301


In [16]:
# Show the 20 candidates with the highest lof-graph-embedding-score,
# restricted to the identifying and voting-related columns.
embedding_view_columns = [
    'kg_id',
    'kg_labels',
    'kg_descriptions',
    'method',
    'singleton',
    'vote_by_classifier',
    'is_lof',
    'lof-graph-embedding-score',
]
ranked_by_embedding = score_df.sort_values(by='lof-graph-embedding-score', ascending=False)
ranked_by_embedding[embedding_view_columns].head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
14500,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
34698,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
34938,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
5335,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
19478,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
24491,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
41902,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
42136,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
42256,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188
42502,Q5776,Betlemme|Bethléem|Belén|Belém|Bethlehem,city in the State of Palestine,fuzzy-augmented,0,1,1,0.845188


##### Generate lof reciprocal rank feature

In [17]:
# Run `tl generate-reciprocal-rank` over column `lof-graph-embedding-score` of the
# embedding-scored file, writing the result to column `lof-reciprocal-rank`;
# stdout is redirected to lof_reciprocal_rank_file_path.
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score \
-o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

In [18]:
# Preview: load only the first five rows of the reciprocal-rank file.
pd.read_csv(filepath_or_buffer=lof_reciprocal_rank_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,exact-match,"municipality of the state of Pará, Brazil",...,0.0,0.0,0,9,1,0,0,-1,0.705301,1.0
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q194721,Barcarena,,exact-match,Wikimedia disambiguation page,...,0.0,0.0,0,9,1,0,0,-1,0.394649,0.013699
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,exact-match,human settlement in Portugal,...,0.0,0.0,0,9,1,0,0,-1,0.49701,0.021277
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),exact-match,civil parish in Oeiras,...,0.0,0.0,0,9,1,0,0,-1,0.60197,0.045455
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",...,0.0,0.0,0,9,1,0,0,-1,0.705301,0.5


##### Generate lof tfidf feature

In [19]:
# Two chained `tl compute-tf-idf` stages (separated by a bare `/`), both using
# `is_lof` as the singleton column:
#   1. feature `class_count` from class_count_file -> lof_class_count_tf_idf_score
#   2. feature `property_count` from property_count_file -> lof_property_count_tf_idf_score
# The input is the reciprocal-rank file; stdout is redirected to lof_feature_file.
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" \
--feature-name class_count \
--singleton-column is_lof \
-o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" \
--feature-name property_count \
--singleton-column is_lof \
-o lof_property_count_tf_idf_score \
> "$lof_feature_file"

In [20]:
# Preview: keep the first five rows of the lof feature file in `d` and display them.
d = pd.read_csv(filepath_or_buffer=lof_feature_file, nrows=5)
d

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,exact-match,"municipality of the state of Pará, Brazil",...,0,9,1,0,0,-1,0.705301,1.0,0.946161,0.632029
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q194721,Barcarena,,exact-match,Wikimedia disambiguation page,...,0,9,1,0,0,-1,0.394649,0.013699,0.036156,0.000129
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,exact-match,human settlement in Portugal,...,0,9,1,0,0,-1,0.49701,0.021277,0.204595,0.099514
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),exact-match,civil parish in Oeiras,...,0,9,1,0,0,-1,0.60197,0.045455,0.301452,0.312674
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",...,0,9,1,0,0,-1,0.705301,0.5,0.946161,0.632029


##### Add context score

In [23]:
# Run `tl context-match` on the lof feature file against context_file, writing the
# score to column `context_score` (with --debug enabled); stdout is redirected to
# context_score_file.
# Path expansions are quoted so a path containing whitespace stays one argument.
!tl context-match "$lof_feature_file" \
    --context-file "$context_file"  \
    -o context_score \
    --debug \
> "$context_score_file"

In [24]:
# Preview: load only the first ten rows of the context-score file.
pd.read_csv(filepath_or_buffer=context_score_file, nrows=10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score,context_properties,context_similarity,context_score
0,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,exact-match,"municipality of the state of Pará, Brazil",...,0,0,-1,0.705301,1.0,0.946161,0.632029,|P131|,0.0|1.0|0.0,0.1795
1,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q194721,Barcarena,,exact-match,Wikimedia disambiguation page,...,0,0,-1,0.394649,0.013699,0.036156,0.000129,,0.0,0.0
2,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,exact-match,human settlement in Portugal,...,0,0,-1,0.49701,0.021277,0.204595,0.099514,||,0.0|0.0|0.0,0.0
3,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),exact-match,civil parish in Oeiras,...,0,0,-1,0.60197,0.045455,0.301452,0.312674,||,0.0|0.0|0.0,0.0
4,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q2009595,Barcarena,,fuzzy-augmented,"municipality of the state of Pará, Brazil",...,0,0,-1,0.705301,0.5,0.946161,0.632029,|P131|,0.0|1.0|0.0,0.1795
5,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q3006790,Cuilapa-Barbarena,,fuzzy-augmented,mountain in Guatemala,...,0,0,-1,0.505933,0.022727,0.041338,0.05465,||,0.0|0.0|0.0,0.0
6,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q807903,Barcarena,Barcarena (Oeiras)|Barcarena (Portugal),fuzzy-augmented,civil parish in Oeiras,...,0,0,-1,0.60197,0.043478,0.301452,0.312674,||,0.0|0.0|0.0,0.0
7,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q31910854,Barcarena,,fuzzy-augmented,human settlement in Portugal,...,0,0,-1,0.49701,0.020833,0.204595,0.099514,||,0.0|0.0|0.0,0.0
8,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q3977874,Baccalena,,fuzzy-augmented,genus of molluscs,...,0,0,-1,0.362767,0.012195,0.010678,0.037238,||,0.0|0.0|0.0,0.0
9,0,0,Barcarena,SGUF|PA|acessos,Barcarena,Q22027289,Rio Barcarena,,fuzzy-augmented,river in Brazil,...,0,0,-1,0.67107,0.333333,0.051048,0.071431,|P131|,0.0|1.0|0.0,0.1795


### Model Prediction

In [25]:
# Join the feature names into the comma-separated value `--features` receives.
features_str = ",".join(features)
# Run `tl predict-using-model` on the context-score file with the ranking model and
# normalization factor from the config cell; predictions are written to column
# `siamese_prediction` and stdout is redirected to output_model_pred_file.
# Every expansion is quoted so a value containing whitespace stays one argument.
!tl predict-using-model -o siamese_prediction \
--ranking-model "$ranking_model_file_path" \
--features "$features_str" \
--normalization-factor "$min_max_scaler_path" "$context_score_file" > "$output_model_pred_file"

### Get Top 5 links

In [26]:
# Run `tl get-kg-links` on the prediction file using final_score_column as the score
# column, with -k 5 and --k-rows; stdout is redirected to top5_links.
# Expansions are quoted so a value containing whitespace stays one argument.
!tl get-kg-links -c "$final_score_column" -k 5 --k-rows "$output_model_pred_file" > "$top5_links"

In [27]:
# Lift the pandas row-display limit, then show the identifying columns and the
# model score for the first ten rows of the top-5 links file.
pd.set_option('display.max_rows', None)
link_columns = [
    'column',
    'row',
    'label',
    'context',
    'kg_id',
    'kg_labels',
    'kg_aliases',
    'kg_descriptions',
    'siamese_prediction',
]
final_output = pd.read_csv(top5_links, nrows=10)
final_output.loc[:, link_columns]

Unnamed: 0,column,row,label,context,kg_id,kg_labels,kg_aliases,kg_descriptions,siamese_prediction
0,0,0,Barcarena,SGUF|PA|acessos,Q2009595,Barcarena,,"municipality of the state of Pará, Brazil",6.147359e-07
1,0,0,Barcarena,SGUF|PA|acessos,Q807903,Barcarena,Barcarena (Portugal)|Barcarena (Oeiras),civil parish in Oeiras,2.104932e-12
2,0,0,Barcarena,SGUF|PA|acessos,Q22027289,Rio Barcarena,,river in Brazil,6.185879e-23
3,0,0,Barcarena,SGUF|PA|acessos,Q31910854,Barcarena,,human settlement in Portugal,5.511385e-26
4,0,0,Barcarena,SGUF|PA|acessos,Q271819,La Barberina|Barbara Campanini,La Barberina|Barbarina|Barbara Campanini|Campa...,Italian ballerina,1.109671e-26
5,0,1,São Francisco do Sul,SGUF|SC|acessos,Q986536,São Francisco do Sul|San Francisco del Sur,São Francisco do Sul|Sao Francisco do Sul,"municipality in Santa Catarina, Brazil",1.0
6,0,1,São Francisco do Sul,SGUF|SC|acessos,Q22063790,São Francisco do Sul,Sao Francisco do Sul,human settlement in Brazil,7.17725e-12
7,0,1,São Francisco do Sul,SGUF|SC|acessos,Q22035152,São Francisco do Sul,Sao Francisco do Sul,,3.229673e-14
8,0,1,São Francisco do Sul,SGUF|SC|acessos,Q22035149,São Francisco de Paula,Sao Francisco de Paula,municipality in the Brazilian state of Rio Gra...,2.065728e-15
9,0,1,São Francisco do Sul,SGUF|SC|acessos,Q1765057,São Bento do Sul,Sao Bento do Sul,human settlement in Brazil,6.34379e-16


### Colorized KG Links file

In [33]:
# Run `tl add-color` on the top-5 links file, scoring by final_score_column with
# -k 5, and write the workbook to colorized_kg_links.
# $top5_links is quoted like the other arguments so a path containing whitespace
# stays one argument.
!tl add-color -c "$final_score_column" -k 5 "$top5_links" --output "$colorized_kg_links"

In [35]:
# Open the colorized workbook with the shell's `open` command.
# NOTE(review): `open` looks like the macOS launcher -- confirm this cell is only
# meant to run on macOS.
!open "$colorized_kg_links"