In [1]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# ---- Input table and column names ----
HOME_DIR = '/tmp'
table_path = f'{HOME_DIR}/coinv_rand_new_aff2.csv'
f_name = os.path.basename(table_path)
# File name without its extension, used to name derived outputs.
# NOTE: str.strip(".csv") removes any of the characters '.', 'c', 's', 'v'
# from BOTH ends (it would turn "coinv_..." into "oinv_..."), so the
# extension is split off explicitly instead.
f_stem = os.path.splitext(f_name)[0]
wikify_column_name = "0"                    # column of the input table to link
final_score_column = "siamese_prediction"   # column holding the final ranking score

# ---- Candidate generation ----
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields requested during candidate generation.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
# Scratch folder for per-query auxiliary files (wiped at the start of a run).
temp_dir = f'{HOME_DIR}/temp/temp'

# ---- Feature generation, voting classifier and ranking model ----
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

# ---- Final outputs ----
output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
colorized_kg_links = f'{HOME_DIR}/temp/{f_stem}_colorized.xlsx'

# ---- Merged auxiliary files (one TSV per field in aux_field) and index ----
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

In [3]:
!rm -rf $temp_dir
!mkdir -p $temp_dir

In [4]:
# Feature columns, in the order they are joined and passed to the ranking
# model through --features.
features = [
    # retrieval / popularity
    'pagerank',
    'retrieval_score',
    # string similarity between the cell label and the candidate
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    # candidate-set and label shape
    'singleton',
    'num_char',
    'num_tokens',
    # lof-based features
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    # context match
    'context_score',
]

In [5]:
# Sanity check: report how many feature columns are configured.
print(len(features))

15


In [6]:
# Sanity check: confirm the input table exists before running the pipeline.
!ls "$table_path"

/tmp/coinv_rand_new_aff2.csv


### Canonicalize

In [7]:
# Canonicalize the input table on the column named by wikify_column_name and
# write the result to canonical_file_path.
# NOTE(review): --add-context presumably fills the `context` column seen in the
# preview from the other cells of each row — confirm against the tl docs.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" \
> "$canonical_file_path"

canonicalize Time: 0.006151914596557617s


In [8]:
# Preview the first 5 rows of the canonicalized table.
pd.read_csv(canonical_file_path, nrows = 5)

Unnamed: 0,column,row,label,context,filename,column-id
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
1,0,1,RAYMOND J. MACDONALD,UNIVERSITY OF UTAH|LEWIS C MURTAUGH,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
2,0,2,CAROLYN J. ANDERSON,UNIVERSITY OF PITTSBURGH AT PITTSBURGH|JELENA ...,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
3,0,3,NATHANIEL A. HATHAWAY,"EPIGENOS BIOSCIENCE, INC.|DMITRI KIREEV",coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
4,0,4,CARL D LANGEFELD,WAKE FOREST UNIVERSITY HEALTH SCIENCES|DONALD ...,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0


### Candidate Generation

In [9]:
# Candidate generation: a single tl pipeline whose stages are separated by `/`
# (clean -> get-fuzzy-augmented-matches -> get-exact-matches, as listed in the
# timing output). The cleaned label is written to `label_clean`, which both
# retrieval stages query. Each retrieval stage also writes the requested
# auxiliary fields into temp_dir. The candidate table goes to candidate_file_path.
# NOTE(review): the Elasticsearch URL and index are hard-coded here and repeat
# the host/index already present in index_url — keep them in sync.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

clean Time: 0.005613088607788086s
get-fuzzy-augmented-matches Time: 5.528326034545898s
get-exact-matches Time: 1.7042810916900635s


In [10]:
# Maps each auxiliary field name (as listed in aux_field) to the column name
# used in the merged per-field TSV.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
    'context': 'context'
}

# For every auxiliary field, merge all per-stage files found in temp_dir into a
# single de-duplicated TSV at {HOME_DIR}/temp/{field}.tsv.
for field in aux_field.split(','):
    # glob returns paths in arbitrary order; sorting makes the concat order, and
    # therefore which duplicate qnode row drop_duplicates keeps, reproducible.
    aux_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not aux_paths:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f'No auxiliary files matching "*{field}.tsv" found in {temp_dir}; '
            'did candidate generation run with --auxiliary-folder set?'
        )
    aux_df = (
        pd.concat([pd.read_csv(aux_path, sep='\t') for aux_path in aux_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{HOME_DIR}/temp/{field}.tsv', sep='\t', index=False)

In [11]:
# Preview the first 6 candidate rows produced by candidate generation.
pd.read_csv(candidate_file_path, nrows=6)

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"Smoot, R. |R. Smoot",fuzzy-augmented,college basketball player (2019–2019) Hampton,2.842326e-09,20.439472
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"Smoot, D. |D. Smoot",fuzzy-augmented,FBI agent; conservative political activist,2.842326e-09,20.439472
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"F. Smoot|Smoot, F.",fuzzy-augmented,"All-American college football player, professi...",2.842326e-09,20.439472
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Joseph P. Smoot|J. Smoot|Smoot, J. P. |J. P. S...",fuzzy-augmented,researcher ORCID ID = 0000-0002-5064-8070,2.842326e-09,20.010927
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q36550494,Edith L. Smoot,"E. L. Smoot|Smoot|Smoot, E. L. |Edith L. Smoot",fuzzy-augmented,,2.842326e-09,19.942667
5,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924777,Jared Smoot,"Smoot, J. |J. Smoot",fuzzy-augmented,college basketball player (2012–2013) Navy,2.842326e-09,19.846024


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [12]:
!tl align-page-rank $candidate_file_path \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
/ string-similarity -i --method jaro_winkler -o jaro_winkler \
/ string-similarity -i --method levenshtein -o levenshtein \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
/ mosaic-features -c kg_labels --num-char --num-tokens \
/ create-singleton-feature -o singleton \
> $aligned_pagerank_candidate_file_path

align-page-rank Time: 0.24790287017822266s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.9394710063934326s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.717921018600464s
string-similarity-['jaro_winkler'] Time: 0.44293713569641113s
string-similarity-['levenshtein'] Time: 2.168013095855713s
string-similarity-['jaccard:tokenizer=word'] Time: 0.10189223289489746s
normalize-scores-des_cont_jaccard Time: 0.04284214973449707s
smallest-qnode-number Time: 0.24477171897888184s
mosaic-features Time: 0.01777482032775879s
create-singleton-feature Time: 0.22430634498596191s


In [16]:
# Load the feature table and preview the columns relevant to the voting
# classifier (its four input features plus `method` and the raw pagerank).
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
voting_preview_columns = [
    'method',
    'pagerank',
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]
features_df[voting_preview_columns].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,0.0,0.0,0,0.0,0.0
1,fuzzy-augmented,2.842326e-09,0.0,0,0.5,0.0
2,fuzzy-augmented,2.842326e-09,0.0,0,0.5,0.0
3,fuzzy-augmented,2.842326e-09,0.0,0,0.75,0.0
4,fuzzy-augmented,2.842326e-09,0.0,0,0.736111,0.0


##### Generate model-voted candidates result

In [17]:
!tl vote-by-classifier $aligned_pagerank_candidate_file_path \
--prob-threshold 0.995 \
--model $model_file_path \
--features "aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized" \
> $model_voted_candidate_file_path

vote-by-classifier Time: 0.9107117652893066s


In [18]:
# Load the model-voted candidate table and show its first rows.
model_voted_df = pd.read_csv(model_voted_candidate_file_path)
model_voted_df.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,monge_elkan_aliases,jaro_winkler,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,,,,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"Smoot, R. |R. Smoot",...,0.5,0.733333,0.5,0.0,0.0,0,14,2,0,0
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"Smoot, D. |D. Smoot",...,0.5,0.755556,0.6,0.0,0.0,0,9,2,0,0
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"F. Smoot|Smoot, F.",...,0.5,0.8,0.6,0.0,0.0,0,10,2,0,0
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Joseph P. Smoot|J. Smoot|Smoot, J. P. |J. P. S...",...,0.613426,0.713492,0.583333,0.0,0.0,0,12,2,0,0


##### Generate lof-graph-embedding-score using the centroid-of-lof column-vector strategy and the ems-mv lof-strategy

In [19]:
!tl score-using-embedding $model_voted_candidate_file_path \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file $graph_embedding_complex_file \
--embedding-url $index_url \
> $graph_embedding_file_path

Qnodes to lookup: 5281
Qnodes from file: 5264
Qnodes from server: 0
Outlier removal generates 13 lof-voted candidates
score-using-embedding Time: 2.0438449382781982s


In [20]:
# Load the embedding-scored candidate table and show its first 5 rows.
score_df = pd.read_csv(graph_embedding_file_path)
score_df.head(5)

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,,,,...,0.0,0.0,0.0,0,0,0,0,0,-1,0.0
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"Smoot, R. |R. Smoot",...,0.5,0.0,0.0,0,14,2,0,0,-1,0.721259
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"Smoot, D. |D. Smoot",...,0.6,0.0,0.0,0,9,2,0,0,-1,0.689004
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"F. Smoot|Smoot, F.",...,0.6,0.0,0.0,0,10,2,0,0,-1,0.668961
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Joseph P. Smoot|J. Smoot|Smoot, J. P. |J. P. S...",...,0.583333,0.0,0.0,0,12,2,0,0,-1,0.724349


In [21]:
# Show the 20 candidates with the highest lof-graph-embedding-score, restricted
# to the identifying columns and the voting / lof flags.
ranked_by_embedding = score_df.sort_values(by='lof-graph-embedding-score', ascending=False)
embedding_preview_columns = [
    'kg_id',
    'kg_labels',
    'kg_descriptions',
    'method',
    'singleton',
    'vote_by_classifier',
    'is_lof',
    'lof-graph-embedding-score',
]
ranked_by_embedding[embedding_preview_columns].head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
5030,Q89287596,Hyunjung Lee,researcher,fuzzy-augmented,0,0,-1,0.972943
3077,Q89869774,Lin Cheng,researcher (ORCID 0000-0001-5731-8405),fuzzy-augmented,0,0,-1,0.969543
5299,Q90043729,Timothy J Satchwell,researcher,fuzzy-augmented,0,0,-1,0.968866
4170,Q90076127,Elaine S Barry,researcher,fuzzy-augmented,0,0,-1,0.968765
4578,Q57890879,Xiaorong Zhou,"researcher, ORCID id # 0000-0001-9943-3997",fuzzy-augmented,0,0,-1,0.968411
3101,Q95941333,Lin Cheng,researcher ORCID 0000-0003-3326-4590,fuzzy-augmented,0,0,-1,0.968114
4786,Q89414425,Retief Wessels,researcher,fuzzy-augmented,0,0,-1,0.968035
4581,Q92529444,Xiaorong Luo,researcher (ORCID 0000-0001-5973-3258),fuzzy-augmented,0,0,-1,0.967789
3113,Q98647455,Cheng Lin,researcher ORCID 0000-0003-1152-3512,fuzzy-augmented,0,0,-1,0.967326
3029,Q98647455,Cheng Lin,researcher ORCID 0000-0003-1152-3512,exact-match,0,0,-1,0.967326


##### Generate lof reciprocal rank feature

In [22]:
# Derive the lof-reciprocal-rank column from lof-graph-embedding-score and write
# the augmented table to lof_reciprocal_rank_file_path.
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score \
-o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

generate-reciprocal-rank-lof-graph-embedding-score Time: 0.11087489128112793s


In [23]:
# Preview the first 5 rows, now including the lof-reciprocal-rank column.
pd.read_csv(lof_reciprocal_rank_file_path, nrows=5)

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,,,,...,0.0,0.0,0,0,0,0,0,-1,0.0,0.009615
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"Smoot, R. |R. Smoot",...,0.0,0.0,0,14,2,0,0,-1,0.721259,0.034483
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"Smoot, D. |D. Smoot",...,0.0,0.0,0,9,2,0,0,-1,0.689004,0.021739
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"F. Smoot|Smoot, F.",...,0.0,0.0,0,10,2,0,0,-1,0.668961,0.016129
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Joseph P. Smoot|J. Smoot|Smoot, J. P. |J. P. S...",...,0.0,0.0,0,12,2,0,0,-1,0.724349,0.037037


##### Generate lof tfidf feature

In [26]:
# Two chained compute-tf-idf passes (separated by `/`): the first uses the
# class_count feature file, the second the property_count feature file. Both
# take is_lof as the --singleton-column and write their scores to
# lof_class_count_tf_idf_score and lof_property_count_tf_idf_score respectively.
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" \
--feature-name class_count \
--singleton-column is_lof \
-o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" \
--feature-name property_count \
--singleton-column is_lof \
-o lof_property_count_tf_idf_score \
> "$lof_feature_file"

compute-tf-idf-class_count Time: 0.4749891757965088s
compute-tf-idf-property_count Time: 1.0780651569366455s


In [25]:
# Preview the first 5 rows of the lof feature file (tf-idf columns included).
d = pd.read_csv(lof_feature_file, nrows=5)
d

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,,,,...,0,0,0,-1,0.0,0.009615,0.0,,0.0,
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"Smoot, R. |R. Smoot",...,2,0,0,-1,0.721259,0.034483,0.0,Q103940464:0.000|Q154954:0.000|Q159344:0.000|Q...,0.0,P106:0.000|P118:0.000|P2094:0.000|P21:0.000|P3...
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"Smoot, D. |D. Smoot",...,2,0,0,-1,0.689004,0.021739,0.0,Q103940464:0.000|Q1190554:0.000|Q12737077:0.00...,0.0,P1006:0.000|P106:0.000|P1207:0.000|P1938:0.000...
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"F. Smoot|Smoot, F.",...,2,0,0,-1,0.668961,0.016129,0.0,Q103940464:0.000|Q14128148:0.000|Q154954:0.000...,0.0,P106:0.000|P1532:0.000|P18:0.000|P19:0.000|P20...
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Joseph P. Smoot|J. Smoot|Smoot, J. P. |J. P. S...",...,2,0,0,-1,0.724349,0.037037,0.0,Q103940464:0.000|Q154954:0.000|Q159344:0.000|Q...,0.0,P106:0.000|P108:0.000|P1153:0.000|P2038:0.000|...


##### Add context score

In [25]:
!tl context-match $lof_feature_file \
    --context-file $context_file  \
    -o context_score \
    --debug \
> $context_score_file

context-match Time: 23.37723708152771s


In [26]:
# Preview the first 10 rows of the context-scored candidate table.
pd.read_csv(context_score_file, nrows=10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count,context_properties,context_similarity,context_score
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q4957796,Brasil,Brasil (música de Cazuza)|Brasil (canção de Ca...,exact-match,1988 song performed by Cazuza,...,-1,0.690211,0.02381,0.105605,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.014212,P646:0.007|P361:0.006|P136:0.002|P577:0.000|P1...,||,0.0|0.0|0.0,0.0
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q104882163,Brasil,,exact-match,,...,-1,0.0,0.008475,0.0,,0.000916,P2671:0.001,||,0.0|0.0|0.0,0.0
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q2122741,Brasil,,exact-match,Wikimedia disambiguation page,...,-1,0.616807,0.009709,0.170382,Q16686448:0.007|Q4167410:0.004|Q15633587:0.004...,0.001009,P1889:0.003|P460:0.001|P31:-0.002,||,0.0|0.0|0.0,0.0
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q67210045,Brasil,,exact-match,Monument in the city of Rio de Janeiro,...,-1,0.823957,1.0,0.109207,Q618123:0.012|Q16686448:0.007|Q27096213:0.007|...,0.022503,P131:0.009|P17:0.007|P625:0.005|P973:0.003|P14...,||,0.0|0.0|0.0,0.0
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899086,Brasil,,exact-match,The Manhattan Transfer album,...,-1,0.590354,0.009009,0.042707,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.000822,P136:0.002|P577:0.000|P175:0.000|P155:0.000|P1...,||,0.0|0.0|0.0,0.0
5,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q899371,Hy Brazil|Isla Brasil|Brasilinsel|Ilha Brasil|...,Brasil (isla mítica)|Brazir|Ile de Brasil|Hy B...,exact-match,mythical island,...,-1,0.679959,0.019608,7.5e-05,Q7184903:0.003|Q488383:0.001|Q129264:0.000|Q18...,0.01433,P646:0.007|P227:0.006|P1889:0.003|P206:0.000|P...,||,0.0|0.0|0.0,0.0
6,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q3284268,Brasil (álbum)|Brasil,Brasil (album),exact-match,Ratos de Porão album,...,-1,0.679621,0.018868,0.042707,Q16686448:0.007|Q337060:0.004|Q28877:0.004|Q83...,0.002575,P136:0.002|P495:0.002|P577:0.000|P175:0.000|P1...,||,0.0|0.0|0.0,0.0
7,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q36962840,Brasil,,exact-match,family name,...,1,0.615094,0.009524,0.027079,Q7184903:0.003|Q1786828:0.002|Q5371079:0.002|Q...,0.012906,P373:0.007|P1705:0.003|P1889:0.003|P3879:0.001...,||,0.0|0.0|0.0,0.0
8,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q45312653,palo campeche|bois de campêche|logwood,brasil|bois de campeche,exact-match,dye,...,-1,0.625324,0.010204,0.074377,Q16686448:0.007|Q58416391:0.006|Q337060:0.004|...,0.020928,P1417:0.010|P1343:0.008|P279:0.003|P527:0.002|...,||,0.0|0.0|0.0,0.0
9,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Brasil,Q66789217,brazilwood dye,brasil|brasilwood dye|brazil|brazilwood|brasil...,exact-match,red dye extracted from the wood of trees of th...,...,-1,0.638706,0.010753,0.074377,Q16686448:0.007|Q58416391:0.006|Q337060:0.004|...,0.002836,P279:0.003|P527:0.002|P1014:0.000|P1582:0.000|...,||,0.0|0.0|0.0,0.0


### Model Prediction

In [27]:
features_str = ",".join(features)
!tl predict-using-model -o siamese_prediction \
--ranking-model $ranking_model_file_path \
--features $features_str \
--normalization-factor $min_max_scaler_path $context_score_file > $output_model_pred_file

predict-using-model Time: 1.2382500171661377s


In [28]:
# Display the exact feature string that was passed to predict-using-model.
features_str

'pagerank,retrieval_score,monge_elkan,monge_elkan_aliases,des_cont_jaccard,jaro_winkler,levenshtein,singleton,num_char,num_tokens,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score,lof-graph-embedding-score,lof-reciprocal-rank,context_score'

### Get Top 5 links

In [29]:
!tl get-kg-links -c $final_score_column -k 5 --k-rows $output_model_pred_file > $top5_links

get-kg-links-siamese_prediction Time: 0.2170569896697998s


In [30]:
# Lift pandas' row-display cap (this stays in effect for the rest of the
# session), then show the linking-relevant columns of the first 10 result rows.
pd.set_option('display.max_rows', None)
final_output = pd.read_csv(top5_links, nrows=10)
link_columns = [
    'column',
    'row',
    'label',
    'context',
    'kg_id',
    'kg_labels',
    'kg_aliases',
    'kg_descriptions',
    'siamese_prediction',
]
final_output.loc[:, link_columns]

Unnamed: 0,column,row,label,context,kg_id,kg_labels,kg_aliases,kg_descriptions,siamese_prediction
0,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q155,Brasilien|Brasile|Brésil|Brasil|Brazil,Federativa República do Brasil|BRA|Brezil|Bres...,country in South America,2.36291e-22
1,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q104882163,Brasil,,,3.754191e-36
2,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q17388323,Edgar Brasil,"E. Brasil|Brasil, E.",Brazilian cinematographer,4.2228469999999994e-38
3,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q5035713,capital|Capital of Brazil,,,8.141020000000001e-39
4,0,0,Brasil,Área\nplantada\n(ha)|77337268|acessos,Q89406886,Thiago Gonçalves Brasil,"Brasil, T. G. |Thiago Goncalves Brasil|T. Gonç...",researcher,0.0
5,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q42824,Mato Grosso,Mato-Grosso|Estado de Mato Grosso|Matorral Gue...,federated state of Brazil,3.4371929999999996e-19
6,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q43319,Mato Grosso do Sul,Estado de Mato Grosso do Sul|Estado de Mato Gr...,federated state of Brazil,5.750948999999999e-26
7,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q53657975,Mato Grosso,,constituency of the Federal Senate of Brazil,8.92739e-32
8,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q1261323,Cercomacra melanaria|Chororó-do-pantanal|Mato ...,Cercomacra melanaria|Hormiguero de Matto Groso...,species of bird,3.8521640000000005e-33
9,0,1,Mato Grosso,Área\nplantada\n(ha)|14551044|acessos,Q1265128,Andorinhão-da-amazônia|Chaetura viridipennis|M...,Chaetura viridipennis|Chaetura chapmani viridi...,species of bird,1.726228e-33


### Colorized KG Links file

In [None]:
!tl add-color -c "$final_score_column" -k 5 $top5_links --output "$colorized_kg_links"

In [None]:
# Open the colorized spreadsheet in the default application.
# NOTE(review): `open` is the macOS launcher; on Linux this would typically be
# `xdg-open` — adjust for the platform the notebook runs on.
!open "$colorized_kg_links"