In [2052]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2053]:
# ---------------------------------------------------------------------------
# Configuration: every path, column name and threshold used by the pipeline.
# NOTE(review): the absolute /Users/... paths below are machine-specific;
# adjust HOME_DIR (and the custom context / ground-truth paths) before running.
# ---------------------------------------------------------------------------
HOME_DIR = '/Users/grantxie/test/coi14'
table_path = f'{HOME_DIR}/coinv_rand_new_aff2.csv'
f_name = table_path.split("/")[-1]
wikify_column_name = "0"
final_score_column = "siamese_prediction"

# Intermediate files produced by canonicalization / candidate generation.
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
aux_field = 'graph_embedding_complex,class_count,property_count,context'
temp_dir = f'{HOME_DIR}/temp/temp'

# Model artefacts (paths are relative to the notebook's working directory).
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
# os.path.splitext removes only the extension. The previous
# f_name.strip(".csv") stripped any leading/trailing '.', 'c', 's', 'v'
# characters, turning "coinv_rand_new_aff2.csv" into "oinv_rand_new_aff2".
colorized_kg_links = f'{HOME_DIR}/temp/{os.path.splitext(f_name)[0]}_colorized.xlsx'

# Merged auxiliary-field files (one per entry of aux_field) and the index URL.
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

# Thresholds and evaluation inputs/outputs.
string_threshold = 0.9
siamese_threshold = 0.9
custom_context_file = '/Users/grantxie/test/coi3/coauthors.context.tsv.gz'
gt = '/Users/grantxie/Downloads/groundtruth_new.csv'
selection_save_path = 'test_selection.csv'
labeled_path = 'test_eva.csv'

In [2054]:
# Ordered list of feature column names; later joined with commas and handed to
# `tl predict-using-model --features`.
features = [
    'pagerank',
    'retrieval_score',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    'singleton',
    'num_char',
    'num_tokens',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    'context_score',
]

In [2055]:
# Sanity check: how many feature columns are configured in `features`.
print(len(features))

15


In [2056]:
# List the input table through the shell to confirm the configured path.
!ls "$table_path"

/Users/grantxie/test/coi14/coinv_rand_new_aff2.csv


### Canonicalize

In [2057]:
# Run `tl canonicalize` on column `wikify_column_name` of the input table with
# --add-context; stdout is redirected to `canonical_file_path`.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" \
> "$canonical_file_path"

canonicalize Time: 0.002680063247680664s


In [2058]:
# Preview the first 5 rows of the canonicalized file.
pd.read_csv(canonical_file_path, nrows = 5)

Unnamed: 0,column,row,label,context,filename,column-id
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
1,0,1,RAYMOND J. MACDONALD,UNIVERSITY OF UTAH|LEWIS C MURTAUGH,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
2,0,2,CAROLYN J. ANDERSON,UNIVERSITY OF PITTSBURGH AT PITTSBURGH|JELENA ...,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
3,0,3,NATHANIEL A. HATHAWAY,"EPIGENOS BIOSCIENCE, INC.|DMITRI KIREEV",coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0
4,0,4,CARL D LANGEFELD,WAKE FOREST UNIVERSITY HEALTH SCIENCES|DONALD ...,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0


In [2059]:
# Display the path the canonical file was written to.
canonical_file_path

'/Users/grantxie/test/coi14/temp/canonical.csv'

### Candidate Generation

In [2060]:
# Candidate generation pipeline (stages separated by "/"):
#   1. clean: column `label` -> `label_clean`
#   2. get-fuzzy-augmented-matches on `label_clean`
#   3. get-exact-matches on `label_clean`
# Both match stages request the auxiliary fields in `aux_field` and write them
# under `temp_dir`. Combined output is redirected to `candidate_file_path`.
# NOTE(review): the URL and index are hard-coded here even though `index_url`
# exists in the config cell -- keep the two in sync.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

clean Time: 0.00244903564453125s
get-fuzzy-augmented-matches Time: 4.517951965332031s
get-exact-matches Time: 0.2938730716705322s


In [2061]:
# Display the path the candidate file was written to.
candidate_file_path

'/Users/grantxie/test/coi14/temp/candidates.csv'

In [2062]:
# Load the generated candidates and show the first rows.
cand = pd.read_csv(candidate_file_path)
cand.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"R. Smoot|Smoot, R.",fuzzy-augmented,college basketball player (2019–2019) Hampton,2.842326e-09,20.439472
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"D. Smoot|Smoot, D.",fuzzy-augmented,FBI agent; conservative political activist,2.842326e-09,20.439472
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"Smoot, F. |F. Smoot",fuzzy-augmented,"All-American college football player, professi...",2.842326e-09,20.439472
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Smoot, J. |J. Smoot|Smoot, J. P. |Joseph P. Sm...",fuzzy-augmented,researcher ORCID ID = 0000-0002-5064-8070,2.842326e-09,20.010927
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q36550494,Edith L. Smoot,"Edith L. Smoot|E. L. Smoot|Smoot|Smoot, E. L.",fuzzy-augmented,,2.842326e-09,19.942667


In [2063]:
# Column name each auxiliary field is stored under in its merged TSV.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
    'context': 'context',
}

# For every auxiliary field, merge all per-stage TSVs found in `temp_dir`
# into a single de-duplicated file at {HOME_DIR}/temp/{field}.tsv.
for field in aux_field.split(','):
    # sorted() makes the concatenation order -- and therefore which duplicate
    # qnode row survives drop_duplicates -- independent of filesystem order.
    part_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not part_paths:
        # pd.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" error.
        raise FileNotFoundError(
            f'No auxiliary files matching "*{field}.tsv" found in {temp_dir}'
        )
    aux_df = (
        pd.concat([pd.read_csv(part_path, sep='\t') for part_path in part_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{HOME_DIR}/temp/{field}.tsv', sep='\t', index=False)

In [2064]:
# Preview the first 6 candidate rows.
pd.read_csv(candidate_file_path, nrows=6)

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"R. Smoot|Smoot, R.",fuzzy-augmented,college basketball player (2019–2019) Hampton,2.842326e-09,20.439472
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"D. Smoot|Smoot, D.",fuzzy-augmented,FBI agent; conservative political activist,2.842326e-09,20.439472
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"Smoot, F. |F. Smoot",fuzzy-augmented,"All-American college football player, professi...",2.842326e-09,20.439472
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Smoot, J. |J. Smoot|Smoot, J. P. |Joseph P. Sm...",fuzzy-augmented,researcher ORCID ID = 0000-0002-5064-8070,2.842326e-09,20.010927
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q36550494,Edith L. Smoot,"Edith L. Smoot|E. L. Smoot|Smoot|Smoot, E. L.",fuzzy-augmented,,2.842326e-09,19.942667
5,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924777,Jared Smoot,"Smoot, J. |J. Smoot",fuzzy-augmented,college basketball player (2012–2013) Navy,2.842326e-09,19.846024


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [2065]:
# Feature pipeline over the candidate file (stages separated by "/"):
#   align-page-rank
#   string-similarity x5 -> monge_elkan, monge_elkan_aliases, jaro_winkler,
#                           levenshtein, des_cont_jaccard
#   normalize-scores on des_cont_jaccard, smallest-qnode-number
#   mosaic-features on kg_labels (--num-char --num-tokens)
#   create-singleton-feature -> singleton
# Output is redirected to `aligned_pagerank_candidate_file_path`.
!tl align-page-rank $candidate_file_path \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
/ string-similarity -i --method jaro_winkler -o jaro_winkler \
/ string-similarity -i --method levenshtein -o levenshtein \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
/ mosaic-features -c kg_labels --num-char --num-tokens \
/ create-singleton-feature -o singleton \
> $aligned_pagerank_candidate_file_path

align-page-rank Time: 0.17309117317199707s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.9182159900665283s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.5664570331573486s
string-similarity-['jaro_winkler'] Time: 0.21261310577392578s
string-similarity-['levenshtein'] Time: 0.9800057411193848s
string-similarity-['jaccard:tokenizer=word'] Time: 0.049462080001831055s
normalize-scores-des_cont_jaccard Time: 0.017277956008911133s
smallest-qnode-number Time: 0.16054296493530273s
mosaic-features Time: 0.007483959197998047s
create-singleton-feature Time: 0.10145902633666992s


In [2066]:
# Load the feature file and inspect the columns consumed by the voting
# classifier (plus `method` and the raw `pagerank`).
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
features_df.loc[:, ['method', 'pagerank', 'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,0.0,0.0,0,0.0,0.0
1,fuzzy-augmented,2.842326e-09,0.0,0,0.5,0.0
2,fuzzy-augmented,2.842326e-09,0.0,0,0.5,0.0
3,fuzzy-augmented,2.842326e-09,0.0,0,0.75,0.0
4,fuzzy-augmented,2.842326e-09,0.0,0,0.736111,0.0


##### Generate model-voted candidates result

In [2067]:
# Run `tl vote-by-classifier` with the model at `model_file_path`, a
# probability threshold of 0.995 and the four listed features; output goes to
# `model_voted_candidate_file_path`.
!tl vote-by-classifier $aligned_pagerank_candidate_file_path \
--prob-threshold 0.995 \
--model $model_file_path \
--features "aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized" \
> $model_voted_candidate_file_path

vote-by-classifier Time: 0.41292500495910645s


In [2068]:
# Load the classifier-voted candidates and show the first rows.
model_voted_df = pd.read_csv(model_voted_candidate_file_path)
model_voted_df.head()

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,monge_elkan_aliases,jaro_winkler,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier
0,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,,,,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q100924778,Ricquall Smoot,"R. Smoot|Smoot, R.",...,0.5,0.733333,0.5,0.0,0.0,0,14,2,0,0
2,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5214410,Dan Smoot,"D. Smoot|Smoot, D.",...,0.5,0.755556,0.6,0.0,0.0,0,9,2,0,0
3,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q5496310,Fred Smoot,"Smoot, F. |F. Smoot",...,0.5,0.8,0.6,0.0,0.0,0,10,2,0,0
4,0,0,KYLE SMOOT,UNIVERSITY OF KENTUCKY|MARK ANTHONY LOVELL,coinv_rand_new_aff2.csv,coinv_rand_new_aff2.csv-0,KYLE SMOOT,Q59831732,Joseph Smoot,"Smoot, J. |J. Smoot|Smoot, J. P. |Joseph P. Sm...",...,0.613426,0.713492,0.583333,0.0,0.0,0,12,2,0,0


##### Generate graph-embedding-score using centroid-of-lof and lof-strategy

In [2069]:
# Run `tl score-using-embedding` with the centroid-of-lof column-vector
# strategy and the ems-mv lof strategy. Embeddings come from
# `graph_embedding_complex_file`, with `index_url` also supplied; the score is
# written to column lof-graph-embedding-score in `graph_embedding_file_path`.
!tl score-using-embedding $model_voted_candidate_file_path \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file $graph_embedding_complex_file \
--embedding-url $index_url \
> $graph_embedding_file_path

Qnodes to lookup: 5281
Qnodes from file: 5264
Qnodes from server: 0
Outlier removal generates 13 lof-voted candidates
score-using-embedding Time: 0.8567259311676025s


##### Generate lof reciprocal rank feature

In [2070]:
# Run `tl generate-reciprocal-rank` on column lof-graph-embedding-score,
# writing column lof-reciprocal-rank to `lof_reciprocal_rank_file_path`.
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score \
-o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

generate-reciprocal-rank-lof-graph-embedding-score Time: 0.24054908752441406s


##### Generate lof tfidf feature

In [2071]:
# Two chained `tl compute-tf-idf` passes, both using `is_lof` as the
# singleton column:
#   class_count    -> lof_class_count_tf_idf_score
#   property_count -> lof_property_count_tf_idf_score
# Output is redirected to `lof_feature_file`.
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" \
--feature-name class_count \
--singleton-column is_lof \
-o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" \
--feature-name property_count \
--singleton-column is_lof \
-o lof_property_count_tf_idf_score \
> "$lof_feature_file"

compute-tf-idf-class_count Time: 0.23376798629760742s
compute-tf-idf-property_count Time: 0.5045177936553955s


##### Add context score

In [2072]:
# Run `tl context-match` on `lof_feature_file` using both the custom context
# file and `context_file`, with ";" as the string separator and
# `string_threshold` as the similarity threshold; output goes to
# `context_score_file`.
!tl context-match --custom-context-file $custom_context_file \
--context-file $context_file --string-separator ";" \
--similarity-string-threshold $string_threshold $lof_feature_file > $context_score_file

### Model Prediction

In [2073]:
# Join the feature names into the comma-separated string expected by
# --features, then run `tl predict-using-model` with the ranking model and
# normalization-factor file. Predictions are written to column
# siamese_prediction in `output_model_pred_file`.
features_str = ",".join(features)
!tl predict-using-model -o siamese_prediction \
--ranking-model $ranking_model_file_path \
--features $features_str \
--normalization-factor $min_max_scaler_path $context_score_file > $output_model_pred_file

predict-using-model Time: 0.562143087387085s


In [2074]:
# Display the exact --features string that was passed to the model.
features_str

'pagerank,retrieval_score,monge_elkan,monge_elkan_aliases,des_cont_jaccard,jaro_winkler,levenshtein,singleton,num_char,num_tokens,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score,lof-graph-embedding-score,lof-reciprocal-rank,context_score'

### Get Top 5 links

In [2075]:
# Run `tl get-kg-links` on `final_score_column` with -k 5 and --k-rows;
# output is redirected to `top5_links`.
!tl get-kg-links -c $final_score_column -k 5 --k-rows $output_model_pred_file > $top5_links

get-kg-links-siamese_prediction Time: 0.24179720878601074s


### Colorized KG Links file

In [2076]:
# Run `tl add-color` on the top-5 links file, writing the workbook to
# `colorized_kg_links`.
!tl add-color -c "$final_score_column" -k 5 $top5_links --output "$colorized_kg_links"

add-color Time: 0.09406304359436035s


In [2077]:
#!open "$colorized_kg_links"

In [2078]:
# Load the colorized top-5 links workbook for post-processing.
df = pd.read_excel(colorized_kg_links)

In [2079]:
# Column index of the links frame; its length sizes the NIL row template below.
ls = df.columns

In [2080]:
# Number of columns in the links frame.
len(ls)

38

In [2081]:
# Template for a placeholder "NIL" row: one blank slot per column in `ls`,
# with position 0 set to 0 and position 7 set to 'NIL'.
# NOTE(review): positions 0 and 7 presumably correspond to the `column` and
# `kg_id` columns -- confirm against df.columns.
arr = [''] * len(ls)
arr[0], arr[7] = 0, 'NIL'
arr[-1] = ''

In [2082]:
# Plain list-of-lists copy of the links frame, one inner list per row.
df_list = df.values.tolist()

In [2083]:
# Empty frame with the links frame's columns; filled with NIL placeholder rows later.
nil = pd.DataFrame(columns = df.columns)

In [2084]:
# Empty frame with the links frame's columns; filled with candidates plus NIL rows later.
new = pd.DataFrame(columns = df.columns)

In [2085]:
# Display the (still empty) NIL frame to confirm its columns.
nil

Unnamed: 0,column,row,label,context,filename,column-id,label_clean,kg_id,kg_labels,kg_aliases,...,lof-reciprocal-rank,lof_class_count_tf_idf_score,top5_class_count,lof_property_count_tf_idf_score,top5_property_count,context_property,context_similarity,context_score,siamese_prediction,rank


In [2086]:
# Build one placeholder NIL record per distinct table row. Each record is a
# copy of the `arr` template with the identifying fields of that row's first
# candidate written into positions 1..6.
#
# `nil` is rebuilt from scratch here, so re-running this cell no longer
# appends duplicate placeholder rows, and the shared `arr` template is copied
# rather than mutated.
id_columns = ['row', 'label', 'context', 'filename', 'column-id', 'label_clean']
first_candidates = df.drop_duplicates(subset='row', keep='first')

nil_records = []
for id_values in first_candidates[id_columns].itertuples(index=False, name=None):
    record = list(arr)
    # Positions 1..6 of the template hold row, label, context, filename,
    # column-id and label_clean, in that order.
    record[1:7] = id_values
    nil_records.append(record)

nil = pd.DataFrame(nil_records, columns=df.columns)

    
    
    

In [2087]:
# List-of-lists copy of the NIL placeholder rows, one per distinct table row.
nil_list = nil.values.tolist()

In [2088]:
# Append the matching NIL placeholder after each table row's candidates.
#
# Candidates are grouped by their `row` value instead of assuming exactly five
# per row: the previous `i % 5` / `int(i/5)` arithmetic attached NIL rows to
# the wrong table row (or raised IndexError) whenever a row had fewer than
# five candidates. `new` is rebuilt from scratch so re-running the cell does
# not duplicate rows.
nil_by_row = dict(zip(nil['row'], nil_list))

interleaved = []
for row_id, row_candidates in df.groupby('row', sort=False):
    interleaved.extend(row_candidates.values.tolist())
    interleaved.append(nil_by_row[row_id])

new = pd.DataFrame(interleaved, columns=df.columns)
        

In [2089]:
# Make both score columns numeric: blank strings (from the NIL placeholder
# rows) become 0 and everything else is cast to float.
#
# Whole-column assignment replaces the previous element-wise chained indexing
# (`new[col][i] = ...`), which raises SettingWithCopyWarning and silently does
# nothing under pandas Copy-on-Write.
for score_column in ('siamese_prediction', 'context_score'):
    raw_scores = new[score_column]
    new[score_column] = raw_scores.mask(raw_scores == '', 0).astype(float)

In [2090]:
#siamese_threshold = 1e-17
# Keep candidates whose siamese_prediction exceeds `siamese_threshold`, plus
# every placeholder row whose kg_id is 'NIL'.
select = new[(new['siamese_prediction'] > siamese_threshold) | (new['kg_id'] == 'NIL')]

In [2091]:
# Renumber rows 0..n-1 so positional lookups below line up with the labels.
select = select.reset_index(drop = True)

In [2092]:
# Empty frame with the links frame's columns; receives one selected entry per table row.
final = pd.DataFrame(columns = df.columns)

In [2093]:
# Keep only the first surviving entry for each table row: the first candidate
# above the threshold if there is one, otherwise that row's NIL placeholder.
#
# `final` is rebuilt in one step, so re-running this cell cannot append the
# same rows twice, and there is no quadratic list-membership scan.
final = (
    select
    .drop_duplicates(subset='row', keep='first')
    .reset_index(drop=True)
)

In [2094]:
# Write the per-row selections to `selection_save_path` without the index column.
final.to_csv(selection_save_path, index = False)

In [2095]:
# Disabled evaluation step: label the saved selections against the ground
# truth file `gt`, writing the result to `labeled_path`. Uncomment to run.
#!tl ground-truth-labeler -f $gt \
#< $selection_save_path \
#> $labeled_path


In [2096]:
# Disabled evaluation step: compute metrics on the labeled file. The variable
# defined in the config cell is `labeled_path`; `label_path` does not exist.
#!tl metrics $labeled_path -c evaluation_label