In [1]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# --- Pipeline configuration -------------------------------------------------
# Every path below is derived from HOME_DIR; intermediate artefacts of the
# `tl` pipeline are written under `{HOME_DIR}/temp`.
HOME_DIR = '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files'

# Input table, the column to wikify, and the name of the final score column.
table_path = f'{HOME_DIR}/faast/Tab25 - Tabela 25_Tab25 - Tabela 25.csv'
f_name = table_path.split("/")[-1]
wikify_column_name = "subject"
final_score_column = "siamese_prediction"

# Canonicalization / candidate generation outputs.
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields requested during candidate generation.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
# Scratch directory that receives the raw auxiliary TSV files.
temp_dir= f'{HOME_DIR}/temp/temp'

# Feature file plus the models used for voting (`--model`), ranking
# (`--ranking-model`) and feature normalization (`--normalization-factor`).
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

# lof-related feature files.
lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

# Final outputs.
output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
# Use splitext to drop the extension: str.strip(".csv") removes any of the
# characters '.', 'c', 's', 'v' from BOTH ends (e.g. "scores.csv" -> "ore").
colorized_kg_links = f'{HOME_DIR}/temp/{os.path.splitext(f_name)[0]}_colorized.xlsx'

# De-duplicated auxiliary files, one per entry of `aux_field`.
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

In [3]:
!rm -rf $temp_dir
!mkdir -p $temp_dir

In [4]:
# Feature columns, in the order they are later joined with commas and passed
# to `tl predict-using-model --features`.
features = [
    'pagerank',
    'retrieval_score',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    'singleton',
    'num_char',
    'num_tokens',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    'context_score',
]

In [5]:
# Sanity check: print how many feature columns are configured.
print(len(features))

15


In [6]:
# Confirm the input table is present at `table_path` before running the pipeline.
!ls "$table_path"

/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/faast/Tab25 - Tabela 25_Tab25 - Tabela 25.csv


### Canonicalize

In [7]:
# Run `tl canonicalize` on the column named by `wikify_column_name`, with
# `--add-context`, and write the result to `canonical_file_path`.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" \
> "$canonical_file_path"

canonicalize Time: 0.019055843353271484s


In [8]:
# Preview the first five rows of the canonicalized table.
pd.read_csv(canonical_file_path, nrows=5)

Unnamed: 0,column,row,label,context
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos
1,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos
2,0,2,Paraná,Área\nplantada\n(ha)|10732006|acessos
3,0,3,Rio Grande do Sul,Área\nplantada\n(ha)|8934034|acessos
4,0,4,São Paulo,Área\nplantada\n(ha)|8631378|acessos


### Candidate Generation

In [9]:
# Candidate generation pipeline:
#   1. `clean` writes a cleaned copy of `label` into `label_clean`;
#   2. `get-fuzzy-augmented-matches` and `get-exact-matches` query the
#      `wikidatadwd-augmented` index at http://ckg07:9200 for `label_clean`,
#      requesting the auxiliary fields in `aux_field` and dumping them into
#      `temp_dir`.
# The combined output is written to `candidate_file_path`.
# NOTE(review): the URL and index name are hard-coded here and duplicate the
# information in `index_url` — keep them in sync.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

clean Time: 0.004969120025634766s
get-fuzzy-augmented-matches Time: 6.552438974380493s
get-exact-matches Time: 0.6618070602416992s


In [10]:
# Merge the auxiliary TSV files dumped into `temp_dir` into one de-duplicated
# file per auxiliary field, written to `{HOME_DIR}/temp/{field}.tsv`.
# Maps each auxiliary field to the column name used in the merged file.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
    'context': 'context'
}
for field in aux_field.split(','):
    # glob returns paths in arbitrary (filesystem) order; sort them so the row
    # kept by drop_duplicates (the first occurrence) is reproducible.
    aux_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not aux_paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f'No auxiliary files matching {temp_dir}/*{field}.tsv; '
            'did the candidate generation step run successfully?'
        )
    aux_df = (
        pd.concat([pd.read_csv(aux_path, sep='\t') for aux_path in aux_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{HOME_DIR}/temp/{field}.tsv', sep='\t', index=False)

In [11]:
# Preview the first six candidate rows produced by candidate generation.
pd.read_csv(candidate_file_path, nrows=6)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899371,Brasil|Île de Brasil|Ilha Brasil|Hy Brazil|Bra...,Brasil (isla mitica)|Bracile|Brazir|Île du Bré...,fuzzy-augmented,mythical island,3.966036e-09,18.423153
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q3162038,Janko Brašić,"Janko Brasić|Brašić, J. |Janko Brasic|J. Brašić",fuzzy-augmented,Serbian artist (1906-1994),2.842326e-09,17.85708
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q38803995,Fabrício Lima Brasil,"Fabricio Brasil|Brasil, F. L. |F. Brasil|Fabrí...",fuzzy-augmented,researcher,2.842326e-09,17.685045
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q61880296,Jose Nilton Brasil,"Jose N. Brasil|Fernando Brasil|Brasil, J. N. |...",fuzzy-augmented,Brazilian politician,2.949439e-09,17.68013
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q61892643,Paulo Sergio Brasil dos Santos,Paulo S. Brasil dos Santos|Paulo Sergio Brasil...,fuzzy-augmented,Brazilian politician,2.958745e-09,17.549759
5,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q76691853,Christiano Pereira Brasil,"C. P. Brasil|C. Pereira Brasil|Brasil, C. P. |...",fuzzy-augmented,Brazilian politician,2.842326e-09,17.312145


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [12]:
!tl align-page-rank $candidate_file_path \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
/ string-similarity -i --method jaro_winkler -o jaro_winkler \
/ string-similarity -i --method levenshtein -o levenshtein \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
/ mosaic-features -c kg_labels --num-char --num-tokens \
/ create-singleton-feature -o singleton \
> $aligned_pagerank_candidate_file_path

align-page-rank Time: 0.22733116149902344s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.548395872116089s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.382241010665894s
string-similarity-['jaro_winkler'] Time: 0.4321939945220947s
string-similarity-['levenshtein'] Time: 4.071407794952393s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06338810920715332s
normalize-scores-des_cont_jaccard Time: 0.01743602752685547s
smallest-qnode-number Time: 0.1722428798675537s
mosaic-features Time: 0.010573148727416992s
creat-singleton-feature Time: 0.12150192260742188s


In [13]:
# Load the feature file and show the columns relevant to the voting classifier.
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
features_df[[
    'method',
    'pagerank',
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]].head(5)

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,2.842326e-09,2.842326e-09,0,0.512195,0.0
1,exact-match,0.0,0.0,0,0.512195,0.0
2,exact-match,8.110844e-09,8.110844e-09,0,0.512195,0.0
3,exact-match,2.842326e-09,2.842326e-09,0,0.512195,0.0
4,exact-match,4.750551e-09,4.750551e-09,0,0.512195,0.0


##### Generate model-voted candidates result

In [14]:
!tl vote-by-classifier $aligned_pagerank_candidate_file_path \
--prob-threshold 0.995 \
--model $model_file_path \
--features "aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized" \
> $model_voted_candidate_file_path

vote-by-classifier Time: 1.8542311191558838s


In [15]:
# Load the classifier-voted candidates and show the first five rows.
model_voted_df = pd.read_csv(model_voted_candidate_file_path)
model_voted_df.head(5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,monge_elkan_aliases,jaro_winkler,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,0.369834,0.0,0.130435,0.0,0.0,0,6,1,0,0
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,0.0,0.0,0.130435,0.0,0.0,0,6,1,0,0
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,0.0,0.0,0.130435,0.0,0.0,0,6,1,0,0
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,0.0,0.0,0.130435,0.0,0.0,0,6,1,0,0
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,0.0,0.0,0.130435,0.0,0.0,0,6,1,0,0


##### Generate graph-embedding-score using centroid-of-lof and lof-strategy

In [18]:
!tl score-using-embedding $model_voted_candidate_file_path \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file $graph_embedding_complex_file \
--embedding-url $index_url \
> $graph_embedding_file_path

Qnodes to lookup: 2862
Qnodes from file: 2836
Qnodes from server: 0
No pseudo GT available, using all exact matches as high precision
_centroid_of_lof: Missing 5 of 307
Outlier removal generates 181 lof-voted candidates
score-using-embedding Time: 1.5239858627319336s


In [19]:
# Load the embedding-scored candidates and show the first five rows.
score_df = pd.read_csv(graph_embedding_file_path)
score_df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,0.130435,0.0,0.0,0,6,1,0,0,-1,0.690211
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,0.130435,0.0,0.0,0,6,1,0,0,-1,0.0
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,0.130435,0.0,0.0,0,6,1,0,0,-1,0.616807
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,0.130435,0.0,0.0,0,6,1,0,0,-1,0.823957
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,0.130435,0.0,0.0,0,6,1,0,0,-1,0.590354


In [20]:
# Show the 20 candidates with the highest lof graph-embedding score,
# restricted to the columns needed to inspect them.
score_df[[
    'kg_id',
    'kg_labels',
    'kg_descriptions',
    'method',
    'singleton',
    'vote_by_classifier',
    'is_lof',
    'lof-graph-embedding-score',
]].sort_values(by='lof-graph-embedding-score', ascending=False).head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
1096,Q14210939,Sertão (Pernambuco),,fuzzy-augmented,0,0,-1,0.919986
1809,Q5589752,Governorate General of Rio de Janeiro,,fuzzy-augmented,0,0,-1,0.917529
493,Q6472458,Lagoa do Mato (distrito de Itatira)|Lagoa do M...,,fuzzy-augmented,0,0,-1,0.915594
1051,Q7973821,Saneamento em Pernambuco|Water supply and sani...,,fuzzy-augmented,0,0,-1,0.907733
2005,Q53658012,Acre,constituency of the Chamber of Deputies of Brazil,exact-match,0,0,1,0.893119
2090,Q53658012,Acre,constituency of the Chamber of Deputies of Brazil,fuzzy-augmented,0,0,-1,0.893119
2882,Q53657913,Bahia,constituency of the Chamber of Deputies of Brazil,exact-match,0,0,1,0.889193
2971,Q53657913,Bahia,constituency of the Chamber of Deputies of Brazil,fuzzy-augmented,0,0,-1,0.889193
996,Q53657928,Pernambuco,constituency of the Chamber of Deputies of Brazil,exact-match,0,0,1,0.888362
1040,Q53657928,Pernambuco,constituency of the Chamber of Deputies of Brazil,fuzzy-augmented,0,0,-1,0.888362


##### Generate lof reciprocal rank feature

In [21]:
# Derive the `lof-reciprocal-rank` column from `lof-graph-embedding-score`
# and write the result to `lof_reciprocal_rank_file_path`.
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score \
-o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

generate-reciprocal-rank-lof-graph-embedding-score Time: 0.17420387268066406s


In [22]:
# Preview the first five rows, now including `lof-reciprocal-rank`.
pd.read_csv(lof_reciprocal_rank_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,0.0,0.0,0,6,1,0,0,-1,0.690211,0.02381
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,0.0,0.0,0,6,1,0,0,-1,0.0,0.008475
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,0.0,0.0,0,6,1,0,0,-1,0.616807,0.009709
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,0.0,0.0,0,6,1,0,0,-1,0.823957,1.0
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,0.0,0.0,0,6,1,0,0,-1,0.590354,0.009009


##### Generate lof tfidf feature

In [23]:
# Two chained `compute-tf-idf` stages, both using `is_lof` as the singleton
# column:
#   - class_count    -> `lof_class_count_tf_idf_score`
#   - property_count -> `lof_property_count_tf_idf_score`
# The combined output is written to `lof_feature_file`.
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" \
--feature-name class_count \
--singleton-column is_lof \
-o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" \
--feature-name property_count \
--singleton-column is_lof \
-o lof_property_count_tf_idf_score \
> "$lof_feature_file"

compute-tf-idf-class_count Time: 0.2207801342010498s
compute-tf-idf-property_count Time: 0.8968029022216797s


In [24]:
# Preview the first five rows of the lof feature file (tf-idf scores added).
d = pd.read_csv(lof_feature_file, nrows=5)
d

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,1,0,0,-1,0.690211,0.02381,0.105605,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.014212,P646:0.007|P361:0.006|P136:0.002|P577:0.000|P1...
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,1,0,0,-1,0.0,0.008475,0.0,,0.000916,P2671:0.001
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,1,0,0,-1,0.616807,0.009709,0.170382,Q16686448:0.007|Q4167410:0.004|Q15633587:0.004...,0.001009,P1889:0.003|P460:0.001|P31:-0.002
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,1,0,0,-1,0.823957,1.0,0.109207,Q618123:0.012|Q16686448:0.007|Q27096213:0.007|...,0.022503,P131:0.009|P17:0.007|P625:0.005|P973:0.003|P14...
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,1,0,0,-1,0.590354,0.009009,0.042707,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.000822,P136:0.002|P577:0.000|P175:0.000|P155:0.000|P1...


##### Add context score

In [25]:
!tl context-match $lof_feature_file \
    --context-file $context_file  \
    -o context_score \
    --debug \
> $context_score_file

context-match Time: 23.37723708152771s


In [26]:
# Preview the first ten rows, now including the context-match columns.
pd.read_csv(context_score_file, nrows=10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count,context_properties,context_similarity,context_score
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,-1,0.690211,0.02381,0.105605,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.014212,P646:0.007|P361:0.006|P136:0.002|P577:0.000|P1...,||,0.0|0.0|0.0,0.0
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,-1,0.0,0.008475,0.0,,0.000916,P2671:0.001,||,0.0|0.0|0.0,0.0
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,-1,0.616807,0.009709,0.170382,Q16686448:0.007|Q4167410:0.004|Q15633587:0.004...,0.001009,P1889:0.003|P460:0.001|P31:-0.002,||,0.0|0.0|0.0,0.0
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,-1,0.823957,1.0,0.109207,Q618123:0.012|Q16686448:0.007|Q27096213:0.007|...,0.022503,P131:0.009|P17:0.007|P625:0.005|P973:0.003|P14...,||,0.0|0.0|0.0,0.0
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,-1,0.590354,0.009009,0.042707,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.000822,P136:0.002|P577:0.000|P175:0.000|P155:0.000|P1...,||,0.0|0.0|0.0,0.0
5,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899371,Hy Brazil|Isla Brasil|Brasilinsel|Ilha Brasil|...,Brasil (isla mítica)|Brazir|Ile de Brasil|Hy B...,exact-match,mythical island,...,-1,0.679959,0.019608,7.5e-05,Q7184903:0.003|Q488383:0.001|Q129264:0.000|Q18...,0.01433,P646:0.007|P227:0.006|P1889:0.003|P206:0.000|P...,||,0.0|0.0|0.0,0.0
6,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q3284268,Brasil (álbum)|Brasil,Brasil (album),exact-match,Ratos de Porão album,...,-1,0.679621,0.018868,0.042707,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.002575,P136:0.002|P495:0.002|P577:0.000|P175:0.000|P1...,||,0.0|0.0|0.0,0.0
7,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q36962840,Brasil,,exact-match,family name,...,1,0.615094,0.009524,0.027079,Q7184903:0.003|Q1786828:0.002|Q5371079:0.002|Q...,0.012906,P373:0.007|P1705:0.003|P1889:0.003|P3879:0.001...,||,0.0|0.0|0.0,0.0
8,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q45312653,palo campeche|bois de campêche|logwood,brasil|bois de campeche,exact-match,dye,...,-1,0.625324,0.010204,0.074377,Q16686448:0.007|Q58416391:0.006|Q337060:0.004|...,0.020928,P1417:0.010|P1343:0.008|P279:0.003|P527:0.002|...,||,0.0|0.0|0.0,0.0
9,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q66789217,brazilwood dye,brasil|brasilwood dye|brazil|brazilwood|brasil...,exact-match,red dye extracted from the wood of trees of th...,...,-1,0.638706,0.010753,0.074377,Q16686448:0.007|Q58416391:0.006|Q337060:0.004|...,0.002836,P279:0.003|P527:0.002|P1014:0.000|P1582:0.000|...,||,0.0|0.0|0.0,0.0


### Model Prediction

In [27]:
features_str = ",".join(features)
!tl predict-using-model -o siamese_prediction \
--ranking-model $ranking_model_file_path \
--features $features_str \
--normalization-factor $min_max_scaler_path $context_score_file > $output_model_pred_file

predict-using-model Time: 1.2382500171661377s


In [28]:
# Display the exact feature string that was passed to `--features`.
features_str

'pagerank,retrieval_score,monge_elkan,monge_elkan_aliases,des_cont_jaccard,jaro_winkler,levenshtein,singleton,num_char,num_tokens,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score,lof-graph-embedding-score,lof-reciprocal-rank,context_score'

### Get Top 5 links

In [29]:
!tl get-kg-links -c $final_score_column -k 5 --k-rows $output_model_pred_file > $top5_links

get-kg-links-siamese_prediction Time: 0.2170569896697998s


In [30]:
# Remove the row-display limit. This pandas option is global and stays in
# effect for any later cell.
pd.set_option('display.max_rows', None)
# Preview the first ten rows of the top-5 links file.
final_output = pd.read_csv(top5_links, nrows=10)
# Select the score column via `final_score_column` instead of a hard-coded
# name so this preview follows the configured column.
final_output[['column', 'row', 'label', 'context', 'kg_id', 'kg_labels', 'kg_aliases',
             'kg_descriptions', final_score_column]]

Unnamed: 0,column,row,label,context,kg_id,kg_labels,kg_aliases,kg_descriptions,siamese_prediction
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q155,Brasilien|Brasile|Brésil|Brasil|Brazil,Federativa República do Brasil|BRA|Brezil|Bres...,country in South America,2.36291e-22
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q104882163,Brasil,,,3.754191e-36
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q17388323,Edgar Brasil,"E. Brasil|Brasil, E.",Brazilian cinematographer,4.2228469999999994e-38
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q5035713,capital|Capital of Brazil,,,8.141020000000001e-39
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q89406886,Thiago Gonçalves Brasil,"Brasil, T. G. |Thiago Goncalves Brasil|T. Gonç...",researcher,0.0
5,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q42824,Mato Grosso,Mato-Grosso|Estado de Mato Grosso|Matorral Gue...,federated state of Brazil,3.4371929999999996e-19
6,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q43319,Mato Grosso do Sul,Estado de Mato Grosso do Sul|Estado de Mato Gr...,federated state of Brazil,5.750948999999999e-26
7,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q53657975,Mato Grosso,,constituency of the Federal Senate of Brazil,8.92739e-32
8,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q1261323,Cercomacra melanaria|Chororó-do-pantanal|Mato ...,Cercomacra melanaria|Hormiguero de Matto Groso...,species of bird,3.8521640000000005e-33
9,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q1265128,Andorinhão-da-amazônia|Chaetura viridipennis|M...,Chaetura viridipennis|Chaetura chapmani viridi...,species of bird,1.726228e-33


### Colorized KG Links file

In [None]:
!tl add-color -c "$final_score_column" -k 5 $top5_links --output "$colorized_kg_links"

In [None]:
# Open the colorized workbook with the system's default application.
# NOTE(review): `open` is presumably the macOS launcher; this cell will likely
# fail on other platforms — confirm before sharing the notebook.
!open "$colorized_kg_links"