In [80]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm import tqdm

In [81]:
# ---------------------------------------------------------------------------
# Pipeline configuration: input table, intermediate files, models, thresholds.
# Every intermediate path lives under HOME_DIR/temp.
# ---------------------------------------------------------------------------
HOME_DIR = '/data/amandeep/nih-dataset/person'
table_path = f'{HOME_DIR}/tl_person_coinvestigator_with_qnihid.tsv'
f_name = os.path.basename(table_path)
wikify_column_name = "person_name"
final_score_column = "siamese_prediction"

canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields requested from the candidate-generation commands.
aux_field = 'graph_embedding_complex,class_count,property_count,context'
# Folder handed to the `tl` commands via --auxiliary-folder.
temp_dir = f'{HOME_DIR}/temp/temp'

aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
# Use splitext to drop the extension. The previous `f_name.strip(".csv")`
# removed any of the characters '.', 'c', 's', 'v' from both ends of the name,
# which turned "….tsv" into "….t" instead of removing the suffix.
colorized_kg_links = f'{HOME_DIR}/temp/{os.path.splitext(f_name)[0]}_colorized.xlsx'

graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

# Score thresholds; not referenced by the cells visible in this part of the notebook.
string_threshold = 0.9
siamese_threshold = 0.9


In [82]:
# Names of the feature columns, one per line so additions and removals diff cleanly.
features = [
    'pagerank',
    'retrieval_score',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    'singleton',
    'num_char',
    'num_tokens',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    'context_score',
]

In [83]:
# Show how many feature columns are listed in `features`.
print(len(features))

15


In [84]:
# Confirm that the input table exists at `table_path` before running the pipeline.
!ls "$table_path"

/data/amandeep/nih-dataset/person/tl_person_coinvestigator_with_qnihid.tsv


### Canonicalize

In [13]:
# Run `tl canonicalize` on `table_path` for the column named by
# `wikify_column_name`, redirecting its standard output to `canonical_file_path`.
# NOTE(review): `--tsv` presumably declares the input as tab-separated and
# `-s person_qnihid` presumably names an extra column to carry over — confirm
# against the `tl` CLI help.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" -s person_qnihid --tsv \
> "$canonical_file_path"

canonicalize Time: 0.3060629367828369s


In [14]:
# Preview the first five rows of the canonical file.
pd.read_csv(canonical_file_path, nrows = 5)

Unnamed: 0,column,row,label,context,filename,column-id
0,0,0,LINDLEY BARBEE,UNIVERSITY OF WASHINGTON|CHRISTINE MITRA KHOSR...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0
1,0,1,CHRISTINE MITRA KHOSROPOUR,UNIVERSITY OF WASHINGTON|LINDLEY BARBEE,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0
2,0,2,MICHAEL J GALE,UNIVERSITY OF WASHINGTON|CAROLYN B COYNE;HUGO ...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0
3,0,3,DANIEL M. RATNER,UNIVERSITY OF WASHINGTON|PATRICK S. STAYTON;SH...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0
4,0,4,SHAWN J. SKERRETT,UNIVERSITY OF WASHINGTON|COURTNEY CRANE;DANIEL...,tl_person_coinvestigator_with_qnihid.tsv,tl_person_coinvestigator_with_qnihid.tsv-0


In [15]:
# Long listing of the canonical file (size and modification time).
# NOTE(review): the recorded output shows a path under .../nih-dataset/temp/,
# whereas `canonical_file_path` is defined under HOME_DIR/temp
# (.../nih-dataset/person/temp/); the saved output appears to come from an
# earlier configuration — re-run to refresh.
!ls -l $canonical_file_path

-rw-r--r-- 1 amandeep isdstaff 9682890 Jul 27 16:34 /data/amandeep/nih-dataset/temp/canonical.csv


In [16]:
def split(f_path, output_path, n_splits=50):
    """Split a CSV file row-wise into ``n_splits`` files named ``split_<i>.csv``.

    Parameters
    ----------
    f_path : str
        Path of the CSV file to read.
    output_path : str
        Directory that receives the chunk files. It is created (including
        parents) when it does not exist yet.
    n_splits : int, optional
        Number of chunk files to write. Defaults to 50, the value that was
        previously hard-coded.

    Raises
    ------
    ValueError
        Raised by ``numpy.array_split`` when ``n_splits`` is not positive.

    Notes
    -----
    Chunk sizes follow ``numpy.array_split``: they differ by at most one row,
    with the larger chunks first. If the input has fewer rows than
    ``n_splits``, the trailing files contain only the header line.
    """
    df = pd.read_csv(f_path)
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_path, exist_ok=True)
    # Split row positions instead of the frame itself: handing a DataFrame to
    # np.array_split goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate. The resulting partition is the same.
    row_chunks = np.array_split(np.arange(len(df)), n_splits)
    for i, rows in enumerate(row_chunks):
        df.iloc[rows].to_csv(f'{output_path}/split_{i}.csv', index=False)

In [17]:
# Write the canonical file out as split_<i>.csv chunks in the nih-split directory.
split(
    f_path=canonical_file_path,
    output_path='/data/amandeep/nih-dataset/person/nih-split',
)

# Candidate Generation

In [None]:
# Single-shot candidate generation over the whole canonical file: `tl clean`,
# then `get-fuzzy-augmented-matches`, then `get-exact-matches`, with standard
# output redirected to `candidate_file_path`.
# This cell has no execution count in the saved notebook; the
# `candidate_generation` function defined below runs the same command chain
# once per split file.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

In [18]:
def candidate_generation(path, output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    """Generate candidates for every ``*.csv`` split in ``path``.

    For each split the ``tl`` command chain (clean, get-fuzzy-augmented-matches,
    get-exact-matches) is run and its standard output is written to
    ``output_path/<split name>``. Afterwards, for each field in the global
    ``aux_field``, all ``*<field>.tsv`` files found in the global ``temp_dir``
    are concatenated, de-duplicated on ``qnode`` and written to that field's
    directory.

    Parameters
    ----------
    path : str
        Directory containing the split CSV files.
    output_path : str
        Directory receiving one candidate file per split.
    class_count_path : str
        Directory receiving ``<split>_class_count.tsv``.
    prop_count_path : str
        Directory receiving ``<split>_prop_count.tsv``.
    context_path : str
        Directory receiving ``<split>_context.tsv``.
    graph_embedding : str
        Directory receiving ``<split>_graph_embedding_complex.tsv``; also used
        for any auxiliary field that is not one of the three above.

    Notes
    -----
    Relies on the module-level names ``aux_field`` and ``temp_dir``. All
    output directories (and ``temp_dir``) are created if missing.

    NOTE(review): every ``*<field>.tsv`` currently in ``temp_dir`` is read, so
    files left behind by an earlier split would be merged into later splits'
    auxiliary outputs unless ``tl`` overwrites them — confirm.
    """
    # Auxiliary field -> (target directory, filename suffix). Fields missing
    # from this table fall back to the graph-embedding target, as before.
    aux_targets = {
        'class_count': (class_count_path, 'class_count'),
        'property_count': (prop_count_path, 'prop_count'),
        'context': (context_path, 'context'),
    }
    fallback_target = (graph_embedding, 'graph_embedding_complex')

    # Shell redirection and DataFrame.to_csv both fail on a missing directory.
    for directory in (output_path, class_count_path, prop_count_path,
                      context_path, graph_embedding, temp_dir):
        os.makedirs(directory, exist_ok=True)

    file_list = glob.glob(path + '/*.csv')
    # Wrapping the list (not the enumerate object) lets tqdm show a total.
    for i, file in enumerate(tqdm(file_list)):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        output_file = f"{output_path}/{filename}"

        !tl clean -c label -o label_clean "$file" / \
        --url http://ckg07:9200 --index wikidatadwd-augmented \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        --url http://ckg07:9200 --index wikidatadwd-augmented \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" > "$output_file"

        for field in aux_field.split(','):
            aux_list = [
                pd.read_csv(f, sep='\t', dtype=object)
                for f in glob.glob(f'{temp_dir}/*{field}.tsv')
            ]
            if not aux_list:
                # pd.concat raises ValueError on an empty list; report and move on.
                print(f"{filename}: no auxiliary files for '{field}' in {temp_dir}, skipping")
                continue
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            target_dir, suffix = aux_targets.get(field, fallback_target)
            aux_df.to_csv(f"{target_dir}/{filename[:-4]}_{suffix}.tsv", sep='\t', index=False)

In [19]:
# `time` is referenced inside candidate_generation; imported here as in the original cell.
import time

# Directories used by the per-split candidate-generation and feature-generation steps.
input_path = '/data/amandeep/nih-dataset/person/nih-split'    # split_<i>.csv inputs
output_path = '/data/amandeep/nih-dataset/person/candidates'  # one candidate file per split

# Per-split auxiliary outputs.
class_count_path = '/data/amandeep/nih-dataset/person/temp/class_c'
prop_count_path = '/data/amandeep/nih-dataset/person/temp/prop_c'
# NOTE(review): this path is under .../temp/person/ while its siblings are
# under .../person/temp/ — confirm this is intentional.
context_path = '/data/amandeep/nih-dataset/temp/person/context'
graph_embedding = '/data/amandeep/nih-dataset/person/temp/ge'

In [20]:
# Run candidate generation over every split; keyword arguments make the
# directory-to-parameter mapping explicit.
candidate_generation(
    path=input_path,
    output_path=output_path,
    class_count_path=class_count_path,
    prop_count_path=prop_count_path,
    context_path=context_path,
    graph_embedding=graph_embedding,
)

0it [00:00, ?it/s]

split_0.csv: 1 of 50
clean Time: 0.052808523178100586s
get-fuzzy-augmented-matches Time: 201.6638102531433s
get-exact-matches Time: 11.67469573020935s


1it [04:06, 246.90s/it]

split_1.csv: 2 of 50
clean Time: 0.055995941162109375s
get-fuzzy-augmented-matches Time: 245.99158239364624s
get-exact-matches Time: 10.243574619293213s


2it [08:58, 273.25s/it]

split_2.csv: 3 of 50
clean Time: 0.08056139945983887s
get-fuzzy-augmented-matches Time: 153.50465774536133s
get-exact-matches Time: 4.976622104644775s


3it [12:00, 231.43s/it]

split_3.csv: 4 of 50
clean Time: 0.05420875549316406s
get-fuzzy-augmented-matches Time: 146.6575310230255s
get-exact-matches Time: 5.597810506820679s


4it [14:55, 209.30s/it]

split_4.csv: 5 of 50
clean Time: 0.045882225036621094s
get-fuzzy-augmented-matches Time: 142.1023669242859s
get-exact-matches Time: 4.729527950286865s


5it [17:48, 196.13s/it]

split_5.csv: 6 of 50
clean Time: 0.045220375061035156s
get-fuzzy-augmented-matches Time: 138.9335515499115s
get-exact-matches Time: 5.391014337539673s


6it [20:40, 188.04s/it]

split_6.csv: 7 of 50
clean Time: 0.06609463691711426s
get-fuzzy-augmented-matches Time: 139.2573163509369s
get-exact-matches Time: 5.221609592437744s


7it [23:30, 181.98s/it]

split_7.csv: 8 of 50
clean Time: 0.06023693084716797s
get-fuzzy-augmented-matches Time: 123.73327589035034s
get-exact-matches Time: 4.943582057952881s


8it [26:05, 173.47s/it]

split_8.csv: 9 of 50
clean Time: 0.059368133544921875s
get-fuzzy-augmented-matches Time: 130.0938262939453s
get-exact-matches Time: 4.992772340774536s


9it [28:49, 170.53s/it]

split_9.csv: 10 of 50
clean Time: 0.07809090614318848s
get-fuzzy-augmented-matches Time: 129.92380118370056s
get-exact-matches Time: 4.577636241912842s


10it [31:27, 166.74s/it]

split_10.csv: 11 of 50
clean Time: 0.08082199096679688s
get-fuzzy-augmented-matches Time: 126.60335421562195s
get-exact-matches Time: 4.9985737800598145s


11it [34:03, 163.22s/it]

split_11.csv: 12 of 50
clean Time: 0.08292341232299805s
get-fuzzy-augmented-matches Time: 114.72296047210693s
get-exact-matches Time: 5.029835224151611s


12it [36:36, 160.13s/it]

split_12.csv: 13 of 50
clean Time: 0.07935094833374023s
get-fuzzy-augmented-matches Time: 116.43931031227112s
get-exact-matches Time: 5.061426162719727s


13it [39:02, 155.99s/it]

split_13.csv: 14 of 50
clean Time: 0.08910036087036133s
get-fuzzy-augmented-matches Time: 117.67290234565735s
get-exact-matches Time: 4.8672215938568115s


14it [41:31, 153.83s/it]

split_14.csv: 15 of 50
clean Time: 0.06881499290466309s
get-fuzzy-augmented-matches Time: 106.11310577392578s
get-exact-matches Time: 5.041502952575684s


15it [43:46, 148.17s/it]

split_15.csv: 16 of 50
clean Time: 0.08726859092712402s
get-fuzzy-augmented-matches Time: 106.75482273101807s
get-exact-matches Time: 5.243393421173096s


16it [46:02, 144.49s/it]

split_16.csv: 17 of 50
clean Time: 0.08839201927185059s
get-fuzzy-augmented-matches Time: 110.12630462646484s
get-exact-matches Time: 4.857788562774658s


17it [48:21, 142.93s/it]

split_17.csv: 18 of 50
clean Time: 0.05005240440368652s
get-fuzzy-augmented-matches Time: 103.35693049430847s
get-exact-matches Time: 5.198227405548096s


18it [50:34, 139.76s/it]

split_18.csv: 19 of 50
clean Time: 0.07082033157348633s
get-fuzzy-augmented-matches Time: 103.63802337646484s
get-exact-matches Time: 5.6241419315338135s


19it [52:50, 138.78s/it]

split_19.csv: 20 of 50
clean Time: 0.08451080322265625s
get-fuzzy-augmented-matches Time: 96.76968216896057s
get-exact-matches Time: 4.820384979248047s


20it [54:54, 134.29s/it]

split_20.csv: 21 of 50
clean Time: 0.08559727668762207s
get-fuzzy-augmented-matches Time: 97.2390775680542s
get-exact-matches Time: 5.159435510635376s


21it [57:28, 140.26s/it]

split_21.csv: 22 of 50
clean Time: 0.0897369384765625s
get-fuzzy-augmented-matches Time: 99.63702964782715s
get-exact-matches Time: 4.949060440063477s


22it [59:36, 136.44s/it]

split_22.csv: 23 of 50
clean Time: 0.08876371383666992s
get-fuzzy-augmented-matches Time: 94.42974257469177s
get-exact-matches Time: 5.108072280883789s


23it [1:01:38, 132.18s/it]

split_23.csv: 24 of 50
clean Time: 0.08246970176696777s
get-fuzzy-augmented-matches Time: 91.13229846954346s
get-exact-matches Time: 4.578164577484131s


24it [1:03:36, 127.97s/it]

split_24.csv: 25 of 50
clean Time: 0.08789944648742676s
get-fuzzy-augmented-matches Time: 92.81205606460571s
get-exact-matches Time: 4.8444788455963135s


25it [1:05:36, 125.69s/it]

split_25.csv: 26 of 50
clean Time: 0.0859384536743164s
get-fuzzy-augmented-matches Time: 89.0306875705719s
get-exact-matches Time: 4.860267877578735s


26it [1:07:33, 123.07s/it]

split_26.csv: 27 of 50
clean Time: 0.0855414867401123s
get-fuzzy-augmented-matches Time: 95.18296718597412s
get-exact-matches Time: 5.263394355773926s


27it [1:09:37, 123.28s/it]

split_27.csv: 28 of 50
clean Time: 0.09132981300354004s
get-fuzzy-augmented-matches Time: 89.03500294685364s
get-exact-matches Time: 4.8453755378723145s


28it [1:11:37, 122.18s/it]

split_28.csv: 29 of 50
clean Time: 0.08446216583251953s
get-fuzzy-augmented-matches Time: 88.21044611930847s
get-exact-matches Time: 4.985000848770142s


29it [1:13:33, 120.37s/it]

split_29.csv: 30 of 50
clean Time: 0.0845794677734375s
get-fuzzy-augmented-matches Time: 85.90319609642029s
get-exact-matches Time: 4.922552108764648s


30it [1:15:26, 118.31s/it]

split_30.csv: 31 of 50
clean Time: 0.08876609802246094s
get-fuzzy-augmented-matches Time: 87.50543260574341s
get-exact-matches Time: 4.752007246017456s


31it [1:17:22, 117.39s/it]

split_31.csv: 32 of 50
clean Time: 0.08036041259765625s
get-fuzzy-augmented-matches Time: 82.71047830581665s
get-exact-matches Time: 4.891790151596069s


32it [1:19:12, 115.19s/it]

split_32.csv: 33 of 50
clean Time: 0.08256816864013672s
get-fuzzy-augmented-matches Time: 83.57973957061768s
get-exact-matches Time: 5.1534600257873535s


33it [1:21:04, 114.29s/it]

split_33.csv: 34 of 50
clean Time: 0.07982993125915527s
get-fuzzy-augmented-matches Time: 87.51987385749817s
get-exact-matches Time: 4.9763548374176025s


34it [1:23:00, 114.79s/it]

split_34.csv: 35 of 50
clean Time: 0.08474397659301758s
get-fuzzy-augmented-matches Time: 82.23816514015198s
get-exact-matches Time: 5.014142274856567s


35it [1:24:50, 113.43s/it]

split_35.csv: 36 of 50
clean Time: 0.06699204444885254s
get-fuzzy-augmented-matches Time: 87.57819437980652s
get-exact-matches Time: 4.659299612045288s


36it [1:26:47, 114.57s/it]

split_36.csv: 37 of 50
clean Time: 0.05597639083862305s
get-fuzzy-augmented-matches Time: 81.72805857658386s
get-exact-matches Time: 4.856358528137207s


37it [1:28:37, 113.15s/it]

split_37.csv: 38 of 50
clean Time: 0.09013175964355469s
get-fuzzy-augmented-matches Time: 80.77738332748413s
get-exact-matches Time: 5.075256109237671s


38it [1:30:26, 111.96s/it]

split_38.csv: 39 of 50
clean Time: 0.09770035743713379s
get-fuzzy-augmented-matches Time: 81.42277526855469s
get-exact-matches Time: 5.138622760772705s


39it [1:32:16, 111.21s/it]

split_39.csv: 40 of 50
clean Time: 0.08977174758911133s
get-fuzzy-augmented-matches Time: 85.02202439308167s
get-exact-matches Time: 5.402724266052246s


40it [1:34:09, 111.94s/it]

split_40.csv: 41 of 50
clean Time: 0.09025430679321289s
get-fuzzy-augmented-matches Time: 78.34915351867676s
get-exact-matches Time: 5.04120397567749s


41it [1:35:57, 110.55s/it]

split_41.csv: 42 of 50
clean Time: 0.08717727661132812s
get-fuzzy-augmented-matches Time: 85.51334547996521s
get-exact-matches Time: 5.366503000259399s


42it [1:37:52, 111.92s/it]

split_42.csv: 43 of 50
clean Time: 0.09060192108154297s
get-fuzzy-augmented-matches Time: 82.80622172355652s
get-exact-matches Time: 5.320855140686035s


43it [1:39:44, 112.08s/it]

split_43.csv: 44 of 50
clean Time: 0.04387927055358887s
get-fuzzy-augmented-matches Time: 81.10510611534119s
get-exact-matches Time: 5.16632604598999s


44it [1:41:33, 111.08s/it]

split_44.csv: 45 of 50
clean Time: 0.04443812370300293s
get-fuzzy-augmented-matches Time: 80.86636972427368s
get-exact-matches Time: 4.897298097610474s


45it [1:43:22, 110.33s/it]

split_45.csv: 46 of 50
clean Time: 0.08957457542419434s
get-fuzzy-augmented-matches Time: 76.38202738761902s
get-exact-matches Time: 4.927022218704224s


46it [1:45:06, 108.51s/it]

split_46.csv: 47 of 50
clean Time: 0.08445405960083008s
get-fuzzy-augmented-matches Time: 72.7830183506012s
get-exact-matches Time: 5.057124137878418s


47it [1:46:48, 106.53s/it]

split_47.csv: 48 of 50
clean Time: 0.0887291431427002s
get-fuzzy-augmented-matches Time: 71.14387273788452s
get-exact-matches Time: 4.78208327293396s


48it [1:48:27, 104.26s/it]

split_48.csv: 49 of 50
clean Time: 0.09058356285095215s
get-fuzzy-augmented-matches Time: 67.15201926231384s
get-exact-matches Time: 4.9366490840911865s


49it [1:50:02, 101.68s/it]

split_49.csv: 50 of 50
clean Time: 0.08889007568359375s
get-fuzzy-augmented-matches Time: 73.63883328437805s
get-exact-matches Time: 5.12647271156311s


50it [1:51:45, 134.11s/it]


In [21]:
# Directory receiving one feature file per candidate split.
features_path = '/data/amandeep/nih-dataset/person/features'

# Column names that feature_generation joins with commas and passes to
# `tl vote-by-classifier --features`.
classifier_features = [
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]

In [22]:
def feature_generation(candidate_dir, embedding_dir, class_count_dir, property_count_dir, context_path, output_path):
    """Run the ``tl`` feature pipeline on every ``*.csv`` in ``candidate_dir``.

    For each non-empty candidate file the chained ``tl`` commands below are
    run and their standard output is written to ``output_path/<file name>``.
    The per-split auxiliary files are located by name:
    ``<embedding_dir>/<split>_graph_embedding_complex.tsv``,
    ``<class_count_dir>/<split>_class_count.tsv`` and
    ``<property_count_dir>/<split>_prop_count.tsv``.

    Parameters
    ----------
    candidate_dir : str
        Directory containing the candidate CSV files.
    embedding_dir : str
        Directory containing the per-split graph-embedding TSV files.
    class_count_dir : str
        Directory containing the per-split class-count TSV files.
    property_count_dir : str
        Directory containing the per-split property-count TSV files.
    context_path : str
        Not used by the command chain; kept so existing callers keep working.
    output_path : str
        Directory receiving one feature file per candidate file; created if
        it does not exist.

    Notes
    -----
    Relies on the module-level names ``classifier_features`` and
    ``model_file_path``. Zero-byte candidate files are skipped.
    """
    # The shell redirection at the end of the pipeline needs the directory to exist.
    os.makedirs(output_path, exist_ok=True)
    # Loop-invariant: the same feature list is passed for every split.
    classifier_features_str = ",".join(classifier_features)

    file_list = glob.glob(candidate_dir + '/*.csv')
    # Wrapping the list (not the enumerate object) lets tqdm show a total.
    for i, file in enumerate(tqdm(file_list)):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        if os.path.getsize(file) == 0:
            continue
        embedding_file = f"{embedding_dir}/{filename[:-4]}_graph_embedding_complex.tsv"
        class_count_file = f"{class_count_dir}/{filename[:-4]}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename[:-4]}_prop_count.tsv"
        output_file = f"{output_path}/{filename}"
        !tl align-page-rank "$file" \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaro_winkler -o jaro_winkler \
            / string-similarity -i --method levenshtein -o levenshtein \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / normalize-scores -c des_cont_jaccard / smallest-qnode-number \
            / mosaic-features -c kg_labels --num-char --num-tokens \
            / create-singleton-feature -o singleton \
            / vote-by-classifier  \
            --prob-threshold 0.995 \
            --features "$classifier_features_str" \
            --model "$model_file_path" \
            / score-using-embedding \
            --column-vector-strategy centroid-of-lof \
            --lof-strategy ems-mv \
            -o lof-graph-embedding-score \
            --embedding-file "$embedding_file" \
            / generate-reciprocal-rank  \
            -c lof-graph-embedding-score \
            -o lof-reciprocal-rank \
            / compute-tf-idf  \
            --feature-file "$class_count_file" \
            --feature-name class_count \
            --singleton-column is_lof \
            -o lof_class_count_tf_idf_score \
            / compute-tf-idf \
            --feature-file "$property_count_file" \
            --feature-name property_count \
            --singleton-column is_lof \
            -o lof_property_count_tf_idf_score   > "$output_file"

In [23]:
# Build features for every candidate split; the candidates directory written
# by candidate_generation (`output_path`) is the input here.
feature_generation(
    candidate_dir=output_path,
    embedding_dir=graph_embedding,
    class_count_dir=class_count_path,
    property_count_dir=prop_count_path,
    context_path=context_path,
    output_path=features_path,
)

0it [00:00, ?it/s]

split_0.csv: 1 of 50
align-page-rank Time: 2.2507216930389404s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.784616470336914s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.4554123878479s
string-similarity-['jaro_winkler'] Time: 5.552843809127808s
string-similarity-['levenshtein'] Time: 20.565762042999268s
string-similarity-['jaccard:tokenizer=word'] Time: 1.417466640472412s
normalize-scores-des_cont_jaccard Time: 0.43269824981689453s
smallest-qnode-number Time: 3.8840510845184326s
mosaic-features Time: 0.19453907012939453s
create-singleton-feature Time: 2.8804738521575928s
vote-by-classifier Time: 5.029954195022583s
Qnodes to lookup: 102630
Qnodes from file: 102203
Outlier removal generates 298 lof-voted candidates
score-using-embedding Time: 195.41063594818115s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4363844394683838s
compute-tf-idf-class_count Time: 211.1447446346283s
compute-tf-idf-property_count Time: 217.32307767868042

1it [03:45, 225.67s/it]

split_1.csv: 2 of 50
align-page-rank Time: 2.032576322555542s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.90036940574646s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.6394567489624s
string-similarity-['jaro_winkler'] Time: 5.873132705688477s
string-similarity-['levenshtein'] Time: 20.538536310195923s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3488471508026123s
normalize-scores-des_cont_jaccard Time: 0.41242361068725586s
smallest-qnode-number Time: 3.68790340423584s
mosaic-features Time: 0.19437146186828613s
create-singleton-feature Time: 2.4444289207458496s
vote-by-classifier Time: 2.2058184146881104s
Qnodes to lookup: 102340
Qnodes from file: 101891
Outlier removal generates 287 lof-voted candidates
score-using-embedding Time: 180.6338300704956s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4756865501403809s
compute-tf-idf-class_count Time: 193.07104015350342s
compute-tf-idf-property_count Time: 198.64530539512634s


2it [07:12, 214.52s/it]

split_2.csv: 3 of 50
align-page-rank Time: 2.22701358795166s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.456555604934692s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.71581220626831s
string-similarity-['jaro_winkler'] Time: 5.7087318897247314s
string-similarity-['levenshtein'] Time: 20.67240023612976s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3114633560180664s
normalize-scores-des_cont_jaccard Time: 0.42096972465515137s
smallest-qnode-number Time: 4.62757134437561s
mosaic-features Time: 0.19523167610168457s
create-singleton-feature Time: 2.5996904373168945s
vote-by-classifier Time: 1.6644668579101562s
Qnodes to lookup: 102613
Qnodes from file: 102194
Outlier removal generates 249 lof-voted candidates
score-using-embedding Time: 180.33183813095093s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4701991081237793s
compute-tf-idf-class_count Time: 192.12218022346497s
compute-tf-idf-property_count Time: 198.3571171760559s

3it [10:40, 211.52s/it]

split_3.csv: 4 of 50
align-page-rank Time: 2.2031965255737305s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.343984127044678s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.38350510597229s
string-similarity-['jaro_winkler'] Time: 5.908080101013184s
string-similarity-['levenshtein'] Time: 20.354480743408203s
string-similarity-['jaccard:tokenizer=word'] Time: 1.344938039779663s
normalize-scores-des_cont_jaccard Time: 0.41622233390808105s
smallest-qnode-number Time: 3.857445001602173s
mosaic-features Time: 0.1926746368408203s
create-singleton-feature Time: 2.414954900741577s
vote-by-classifier Time: 1.7130506038665771s
Qnodes to lookup: 100988
Qnodes from file: 100527
Outlier removal generates 287 lof-voted candidates
score-using-embedding Time: 183.7381534576416s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4737792015075684s
compute-tf-idf-class_count Time: 196.60260915756226s
compute-tf-idf-property_count Time: 203.2562255859375s


4it [14:14, 212.57s/it]

split_4.csv: 5 of 50
align-page-rank Time: 1.994556188583374s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 24.610488176345825s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.85339164733887s
string-similarity-['jaro_winkler'] Time: 5.46380090713501s
string-similarity-['levenshtein'] Time: 20.25224232673645s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5659713745117188s
normalize-scores-des_cont_jaccard Time: 0.45339250564575195s
smallest-qnode-number Time: 3.773094892501831s
mosaic-features Time: 0.19551610946655273s
create-singleton-feature Time: 2.565678119659424s
vote-by-classifier Time: 1.5126636028289795s
Qnodes to lookup: 102792
Qnodes from file: 102326
Outlier removal generates 294 lof-voted candidates
score-using-embedding Time: 181.63341236114502s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4959089756011963s
compute-tf-idf-class_count Time: 196.23926043510437s
compute-tf-idf-property_count Time: 202.4990005493164s


5it [17:45, 212.02s/it]

split_5.csv: 6 of 50
align-page-rank Time: 2.033663034439087s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.851969957351685s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.3906729221344s
string-similarity-['jaro_winkler'] Time: 6.200682163238525s
string-similarity-['levenshtein'] Time: 20.29999351501465s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3344018459320068s
normalize-scores-des_cont_jaccard Time: 0.41785240173339844s
smallest-qnode-number Time: 3.767305612564087s
mosaic-features Time: 0.19395065307617188s
create-singleton-feature Time: 2.4961023330688477s
vote-by-classifier Time: 1.6686828136444092s
Qnodes to lookup: 103119
Qnodes from file: 102689
Outlier removal generates 278 lof-voted candidates
score-using-embedding Time: 181.4433434009552s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5211334228515625s
compute-tf-idf-class_count Time: 195.39475083351135s
compute-tf-idf-property_count Time: 201.27137565612793s

6it [21:15, 211.18s/it]

split_6.csv: 7 of 50
align-page-rank Time: 2.1430516242980957s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.731421947479248s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.47618341445923s
string-similarity-['jaro_winkler'] Time: 5.92342734336853s
string-similarity-['levenshtein'] Time: 20.26203227043152s
string-similarity-['jaccard:tokenizer=word'] Time: 1.655191421508789s
normalize-scores-des_cont_jaccard Time: 0.4190959930419922s
smallest-qnode-number Time: 4.029390811920166s
mosaic-features Time: 0.14474034309387207s
create-singleton-feature Time: 2.427368402481079s
vote-by-classifier Time: 2.500579833984375s
Qnodes to lookup: 103723
Qnodes from file: 103309
Outlier removal generates 272 lof-voted candidates
score-using-embedding Time: 180.68840551376343s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.7379443645477295s
compute-tf-idf-class_count Time: 194.08610892295837s
compute-tf-idf-property_count Time: 200.9192659854889s


7it [24:44, 210.63s/it]

split_7.csv: 8 of 50
align-page-rank Time: 2.105626106262207s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.36077308654785s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.48472952842712s
string-similarity-['jaro_winkler'] Time: 5.534584283828735s
string-similarity-['levenshtein'] Time: 20.356271505355835s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3474388122558594s
normalize-scores-des_cont_jaccard Time: 0.43161702156066895s
smallest-qnode-number Time: 3.781036853790283s
mosaic-features Time: 0.19462180137634277s
create-singleton-feature Time: 2.4883453845977783s
vote-by-classifier Time: 1.8283941745758057s
Qnodes to lookup: 101146
Qnodes from file: 100725
Outlier removal generates 261 lof-voted candidates
score-using-embedding Time: 176.52816081047058s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.6178312301635742s
compute-tf-idf-class_count Time: 191.77316975593567s
compute-tf-idf-property_count Time: 197.3679087162017

8it [28:10, 209.19s/it]

split_8.csv: 9 of 50
align-page-rank Time: 2.016780138015747s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 21.7373468875885s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.05490326881409s
string-similarity-['jaro_winkler'] Time: 6.0638415813446045s
string-similarity-['levenshtein'] Time: 21.464143991470337s
string-similarity-['jaccard:tokenizer=word'] Time: 1.360680103302002s
normalize-scores-des_cont_jaccard Time: 0.4648439884185791s
smallest-qnode-number Time: 3.8088459968566895s
mosaic-features Time: 0.19716835021972656s
create-singleton-feature Time: 2.6608433723449707s
vote-by-classifier Time: 1.753774642944336s
Qnodes to lookup: 103051
Qnodes from file: 102609
Outlier removal generates 278 lof-voted candidates
score-using-embedding Time: 178.85269021987915s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4472897052764893s
compute-tf-idf-class_count Time: 191.0550136566162s
compute-tf-idf-property_count Time: 197.50507140159607s


9it [31:36, 208.27s/it]

split_9.csv: 10 of 50
align-page-rank Time: 2.068415403366089s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.681389570236206s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.12107491493225s
string-similarity-['jaro_winkler'] Time: 5.466552257537842s
string-similarity-['levenshtein'] Time: 20.617379665374756s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4011104106903076s
normalize-scores-des_cont_jaccard Time: 0.41144514083862305s
smallest-qnode-number Time: 3.9711387157440186s
mosaic-features Time: 0.19746065139770508s
create-singleton-feature Time: 2.521019458770752s
vote-by-classifier Time: 1.5111377239227295s
Qnodes to lookup: 103451
Qnodes from file: 103005
Outlier removal generates 304 lof-voted candidates
score-using-embedding Time: 176.84415292739868s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5007295608520508s
compute-tf-idf-class_count Time: 189.86219477653503s
compute-tf-idf-property_count Time: 196.54647493362

10it [35:02, 207.48s/it]

split_10.csv: 11 of 50
align-page-rank Time: 2.0502610206604004s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.03008198738098s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.8776798248291s
string-similarity-['jaro_winkler'] Time: 5.9576966762542725s
string-similarity-['levenshtein'] Time: 20.276453733444214s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3487818241119385s
normalize-scores-des_cont_jaccard Time: 0.451979398727417s
smallest-qnode-number Time: 3.732436418533325s
mosaic-features Time: 0.19447803497314453s
create-singleton-feature Time: 2.506124258041382s
vote-by-classifier Time: 1.6566874980926514s
Qnodes to lookup: 103406
Qnodes from file: 102937
Outlier removal generates 269 lof-voted candidates
score-using-embedding Time: 179.10123872756958s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.486020803451538s
compute-tf-idf-class_count Time: 191.84287476539612s
compute-tf-idf-property_count Time: 198.0176100730896s

11it [38:29, 207.19s/it]

split_11.csv: 12 of 50
align-page-rank Time: 2.3272578716278076s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.944178342819214s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 79.70839142799377s
string-similarity-['jaro_winkler'] Time: 5.896643400192261s
string-similarity-['levenshtein'] Time: 20.077730178833008s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3573782444000244s
normalize-scores-des_cont_jaccard Time: 0.4409337043762207s
smallest-qnode-number Time: 4.085714101791382s
mosaic-features Time: 0.19539642333984375s
create-singleton-feature Time: 2.4575724601745605s
vote-by-classifier Time: 1.6045746803283691s
Qnodes to lookup: 102378
Qnodes from file: 101943
Outlier removal generates 274 lof-voted candidates
score-using-embedding Time: 188.7638442516327s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4970769882202148s
compute-tf-idf-class_count Time: 201.4116985797882s
compute-tf-idf-property_count Time: 207.556360006332

12it [42:05, 209.86s/it]

split_12.csv: 13 of 50
align-page-rank Time: 2.016606569290161s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.760839223861694s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.96092128753662s
string-similarity-['jaro_winkler'] Time: 5.597720384597778s
string-similarity-['levenshtein'] Time: 21.128141403198242s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5278470516204834s
normalize-scores-des_cont_jaccard Time: 0.4256761074066162s
smallest-qnode-number Time: 3.8613710403442383s
mosaic-features Time: 0.19777655601501465s
create-singleton-feature Time: 2.5625247955322266s
vote-by-classifier Time: 1.4296197891235352s
Qnodes to lookup: 103126
Qnodes from file: 102674
Outlier removal generates 257 lof-voted candidates
score-using-embedding Time: 184.68976640701294s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4699981212615967s
compute-tf-idf-class_count Time: 197.99057865142822s
compute-tf-idf-property_count Time: 204.0096230506

13it [45:38, 210.78s/it]

split_13.csv: 14 of 50
align-page-rank Time: 2.056501865386963s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.830907821655273s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.51685810089111s
string-similarity-['jaro_winkler'] Time: 5.9324727058410645s
string-similarity-['levenshtein'] Time: 20.82344937324524s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3924541473388672s
normalize-scores-des_cont_jaccard Time: 0.4762430191040039s
smallest-qnode-number Time: 4.073378324508667s
mosaic-features Time: 0.1953575611114502s
create-singleton-feature Time: 2.4668571949005127s
vote-by-classifier Time: 1.5347740650177002s
Qnodes to lookup: 102987
Qnodes from file: 102590
_centroid_of_lof: Missing 1 of 431
Outlier removal generates 258 lof-voted candidates
score-using-embedding Time: 182.06877303123474s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.687804937362671s
compute-tf-idf-class_count Time: 196.43417477607727s
compute-tf-idf-pro

14it [49:08, 210.78s/it]

split_14.csv: 15 of 50
align-page-rank Time: 2.139066696166992s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.294872283935547s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.07025408744812s
string-similarity-['jaro_winkler'] Time: 5.4284491539001465s
string-similarity-['levenshtein'] Time: 20.236607551574707s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3277971744537354s
normalize-scores-des_cont_jaccard Time: 0.41768479347229004s
smallest-qnode-number Time: 3.7912163734436035s
mosaic-features Time: 0.19404816627502441s
create-singleton-feature Time: 2.434675693511963s
vote-by-classifier Time: 1.6905272006988525s
Qnodes to lookup: 100026
Qnodes from file: 99632
Outlier removal generates 266 lof-voted candidates
score-using-embedding Time: 178.50107979774475s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4963185787200928s
compute-tf-idf-class_count Time: 194.25225615501404s
compute-tf-idf-property_count Time: 199.3926486968

15it [52:37, 210.11s/it]

split_15.csv: 16 of 50
align-page-rank Time: 2.029071569442749s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.931191205978394s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.54340553283691s
string-similarity-['jaro_winkler'] Time: 5.497998476028442s
string-similarity-['levenshtein'] Time: 20.768617868423462s
string-similarity-['jaccard:tokenizer=word'] Time: 1.221628189086914s
normalize-scores-des_cont_jaccard Time: 0.46144628524780273s
smallest-qnode-number Time: 3.788501024246216s
mosaic-features Time: 0.19898056983947754s
create-singleton-feature Time: 2.445201873779297s
vote-by-classifier Time: 1.428006649017334s
Qnodes to lookup: 102544
Qnodes from file: 102081
Outlier removal generates 271 lof-voted candidates
score-using-embedding Time: 181.86611151695251s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4600834846496582s
compute-tf-idf-class_count Time: 196.3211600780487s
compute-tf-idf-property_count Time: 202.08920001983643

16it [56:08, 210.32s/it]

split_16.csv: 17 of 50
align-page-rank Time: 2.1913881301879883s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.04692053794861s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.08957266807556s
string-similarity-['jaro_winkler'] Time: 5.665759325027466s
string-similarity-['levenshtein'] Time: 20.871767044067383s
string-similarity-['jaccard:tokenizer=word'] Time: 1.2155277729034424s
normalize-scores-des_cont_jaccard Time: 0.4198484420776367s
smallest-qnode-number Time: 3.5666539669036865s
mosaic-features Time: 0.14911341667175293s
create-singleton-feature Time: 2.5072696208953857s
vote-by-classifier Time: 1.2933783531188965s
Qnodes to lookup: 102799
Qnodes from file: 102389
Outlier removal generates 288 lof-voted candidates
score-using-embedding Time: 175.3728792667389s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.7298691272735596s
compute-tf-idf-class_count Time: 188.4726550579071s
compute-tf-idf-property_count Time: 194.597959995269

17it [59:30, 207.92s/it]

split_17.csv: 18 of 50
align-page-rank Time: 2.041814088821411s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.25795888900757s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.0055046081543s
string-similarity-['jaro_winkler'] Time: 5.4732019901275635s
string-similarity-['levenshtein'] Time: 20.50051498413086s
string-similarity-['jaccard:tokenizer=word'] Time: 1.369032621383667s
normalize-scores-des_cont_jaccard Time: 0.3909485340118408s
smallest-qnode-number Time: 3.6991448402404785s
mosaic-features Time: 0.1978440284729004s
create-singleton-feature Time: 2.4530341625213623s
vote-by-classifier Time: 1.4126009941101074s
Qnodes to lookup: 102028
Qnodes from file: 101575
Outlier removal generates 279 lof-voted candidates
score-using-embedding Time: 180.65315008163452s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.473987340927124s
compute-tf-idf-class_count Time: 194.227707862854s
compute-tf-idf-property_count Time: 200.98786664009094s


18it [1:03:00, 208.44s/it]

split_18.csv: 19 of 50
align-page-rank Time: 2.151552438735962s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.782254695892334s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.74730491638184s
string-similarity-['jaro_winkler'] Time: 6.20510196685791s
string-similarity-['levenshtein'] Time: 20.882609128952026s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3855400085449219s
normalize-scores-des_cont_jaccard Time: 0.45525550842285156s
smallest-qnode-number Time: 3.8309943675994873s
mosaic-features Time: 0.19537568092346191s
create-singleton-feature Time: 2.4737653732299805s
vote-by-classifier Time: 1.6301815509796143s
Qnodes to lookup: 102493
Qnodes from file: 102040
Outlier removal generates 274 lof-voted candidates
score-using-embedding Time: 183.48542714118958s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4747231006622314s
compute-tf-idf-class_count Time: 196.219162940979s
compute-tf-idf-property_count Time: 202.048730850219

19it [1:06:30, 209.03s/it]

split_19.csv: 20 of 50
align-page-rank Time: 2.374566078186035s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.680912971496582s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.40215873718262s
string-similarity-['jaro_winkler'] Time: 6.001368522644043s
string-similarity-['levenshtein'] Time: 20.331312656402588s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3619742393493652s
normalize-scores-des_cont_jaccard Time: 0.41918158531188965s
smallest-qnode-number Time: 4.240908622741699s
mosaic-features Time: 0.19464349746704102s
create-singleton-feature Time: 2.4986677169799805s
vote-by-classifier Time: 1.6905155181884766s
Qnodes to lookup: 102025
Qnodes from file: 101613
Outlier removal generates 268 lof-voted candidates
score-using-embedding Time: 182.1978690624237s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.3631384372711182s
compute-tf-idf-class_count Time: 194.81476712226868s
compute-tf-idf-property_count Time: 200.44439458847

20it [1:09:59, 209.06s/it]

split_20.csv: 21 of 50
align-page-rank Time: 2.0668156147003174s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.93398118019104s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.55617022514343s
string-similarity-['jaro_winkler'] Time: 5.476082801818848s
string-similarity-['levenshtein'] Time: 21.177448511123657s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3840346336364746s
normalize-scores-des_cont_jaccard Time: 0.42021942138671875s
smallest-qnode-number Time: 4.1358442306518555s
mosaic-features Time: 0.20103883743286133s
create-singleton-feature Time: 2.7956645488739014s
vote-by-classifier Time: 1.6506288051605225s
Qnodes to lookup: 103119
Qnodes from file: 102690
Outlier removal generates 284 lof-voted candidates
score-using-embedding Time: 177.41111063957214s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.452890396118164s
compute-tf-idf-class_count Time: 191.46827054023743s
compute-tf-idf-property_count Time: 197.2196865081

21it [1:13:25, 207.94s/it]

split_21.csv: 22 of 50
align-page-rank Time: 2.1542975902557373s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.379746198654175s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.8138530254364s
string-similarity-['jaro_winkler'] Time: 6.338705062866211s
string-similarity-['levenshtein'] Time: 21.59595537185669s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3695597648620605s
normalize-scores-des_cont_jaccard Time: 0.4216926097869873s
smallest-qnode-number Time: 4.29482626914978s
mosaic-features Time: 0.2026233673095703s
create-singleton-feature Time: 2.519407033920288s
vote-by-classifier Time: 1.7203352451324463s
Qnodes to lookup: 103101
Qnodes from file: 102672
Outlier removal generates 265 lof-voted candidates
score-using-embedding Time: 187.48350024223328s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4142494201660156s
compute-tf-idf-class_count Time: 200.0237410068512s
compute-tf-idf-property_count Time: 205.97276735305786s


22it [1:16:59, 210.01s/it]

split_22.csv: 23 of 50
align-page-rank Time: 2.0992913246154785s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.351675510406494s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 73.37731099128723s
string-similarity-['jaro_winkler'] Time: 5.380293846130371s
string-similarity-['levenshtein'] Time: 20.557284593582153s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3500590324401855s
normalize-scores-des_cont_jaccard Time: 0.3918287754058838s
smallest-qnode-number Time: 4.0966410636901855s
mosaic-features Time: 0.20165586471557617s
create-singleton-feature Time: 2.5725276470184326s
vote-by-classifier Time: 1.6837222576141357s
Qnodes to lookup: 103237
Qnodes from file: 102810
Outlier removal generates 259 lof-voted candidates
score-using-embedding Time: 175.09485960006714s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4728565216064453s
compute-tf-idf-class_count Time: 189.28180646896362s
compute-tf-idf-property_count Time: 195.177482843

23it [1:20:23, 208.17s/it]

split_23.csv: 24 of 50
align-page-rank Time: 2.1763393878936768s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.757932662963867s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.88818645477295s
string-similarity-['jaro_winkler'] Time: 5.2859508991241455s
string-similarity-['levenshtein'] Time: 19.604538917541504s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3305208683013916s
normalize-scores-des_cont_jaccard Time: 0.41726183891296387s
smallest-qnode-number Time: 3.7209296226501465s
mosaic-features Time: 0.19443225860595703s
create-singleton-feature Time: 2.5205280780792236s
vote-by-classifier Time: 1.6684393882751465s
Qnodes to lookup: 100712
Qnodes from file: 100324
Outlier removal generates 283 lof-voted candidates
score-using-embedding Time: 176.96594834327698s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.469695806503296s
compute-tf-idf-class_count Time: 189.22234416007996s
compute-tf-idf-property_count Time: 195.14737367

24it [1:23:47, 206.74s/it]

split_24.csv: 25 of 50
align-page-rank Time: 2.131394624710083s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.77714514732361s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 79.12761688232422s
string-similarity-['jaro_winkler'] Time: 5.586172580718994s
string-similarity-['levenshtein'] Time: 20.57650351524353s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3509750366210938s
normalize-scores-des_cont_jaccard Time: 0.45108580589294434s
smallest-qnode-number Time: 4.281855583190918s
mosaic-features Time: 0.19268012046813965s
create-singleton-feature Time: 2.6310815811157227s
vote-by-classifier Time: 1.3066186904907227s
Qnodes to lookup: 100678
Qnodes from file: 100271
Outlier removal generates 303 lof-voted candidates
score-using-embedding Time: 185.76948928833008s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5431945323944092s
compute-tf-idf-class_count Time: 199.1523563861847s
compute-tf-idf-property_count Time: 205.7525541782379

25it [1:27:21, 208.96s/it]

split_25.csv: 26 of 50
align-page-rank Time: 2.095261812210083s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.206368923187256s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.35566878318787s
string-similarity-['jaro_winkler'] Time: 5.431614875793457s
string-similarity-['levenshtein'] Time: 20.424981355667114s
string-similarity-['jaccard:tokenizer=word'] Time: 1.382413387298584s
normalize-scores-des_cont_jaccard Time: 0.42708349227905273s
smallest-qnode-number Time: 4.157050848007202s
mosaic-features Time: 0.19969558715820312s
create-singleton-feature Time: 2.525026559829712s
vote-by-classifier Time: 1.4850728511810303s
Qnodes to lookup: 102893
Qnodes from file: 102435
Outlier removal generates 288 lof-voted candidates
score-using-embedding Time: 179.078871011734s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5248022079467773s
compute-tf-idf-class_count Time: 194.23351311683655s
compute-tf-idf-property_count Time: 200.19278597831726

26it [1:30:49, 208.65s/it]

split_26.csv: 27 of 50
align-page-rank Time: 2.011129856109619s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.33818817138672s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.38686847686768s
string-similarity-['jaro_winkler'] Time: 5.392008543014526s
string-similarity-['levenshtein'] Time: 19.868897438049316s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3100080490112305s
normalize-scores-des_cont_jaccard Time: 0.4192957878112793s
smallest-qnode-number Time: 3.7068965435028076s
mosaic-features Time: 0.19421005249023438s
create-singleton-feature Time: 2.460932970046997s
vote-by-classifier Time: 1.6748135089874268s
Qnodes to lookup: 103207
Qnodes from file: 102774
Outlier removal generates 284 lof-voted candidates
score-using-embedding Time: 175.78929996490479s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5093803405761719s
compute-tf-idf-class_count Time: 188.20862317085266s
compute-tf-idf-property_count Time: 194.368056058883

27it [1:34:12, 206.91s/it]

split_27.csv: 28 of 50
align-page-rank Time: 2.0713260173797607s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.755712509155273s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.09171438217163s
string-similarity-['jaro_winkler'] Time: 5.974955320358276s
string-similarity-['levenshtein'] Time: 20.762606859207153s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3241407871246338s
normalize-scores-des_cont_jaccard Time: 0.42968320846557617s
smallest-qnode-number Time: 4.459493160247803s
mosaic-features Time: 0.19576787948608398s
create-singleton-feature Time: 2.4268600940704346s
vote-by-classifier Time: 1.6906282901763916s
Qnodes to lookup: 101358
Qnodes from file: 100898
Outlier removal generates 231 lof-voted candidates
score-using-embedding Time: 181.97830414772034s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5273802280426025s
compute-tf-idf-class_count Time: 195.7190978527069s
compute-tf-idf-property_count Time: 201.9397568702

28it [1:37:42, 207.91s/it]

split_28.csv: 29 of 50
align-page-rank Time: 1.9665324687957764s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.46606755256653s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.35581088066101s
string-similarity-['jaro_winkler'] Time: 5.96728777885437s
string-similarity-['levenshtein'] Time: 20.844210386276245s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3231418132781982s
normalize-scores-des_cont_jaccard Time: 0.4380476474761963s
smallest-qnode-number Time: 3.6862080097198486s
mosaic-features Time: 0.2064204216003418s
create-singleton-feature Time: 2.484647750854492s
vote-by-classifier Time: 1.9342453479766846s
Qnodes to lookup: 102967
Qnodes from file: 102547
Outlier removal generates 266 lof-voted candidates
score-using-embedding Time: 181.19957780838013s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.45711350440979s
compute-tf-idf-class_count Time: 194.79086565971375s
compute-tf-idf-property_count Time: 200.5322244167328s


29it [1:41:11, 208.25s/it]

split_29.csv: 30 of 50
align-page-rank Time: 2.0360257625579834s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.9820818901062s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.50756072998047s
string-similarity-['jaro_winkler'] Time: 5.437276363372803s
string-similarity-['levenshtein'] Time: 20.78139638900757s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3634123802185059s
normalize-scores-des_cont_jaccard Time: 0.4139251708984375s
smallest-qnode-number Time: 3.8482892513275146s
mosaic-features Time: 0.15088605880737305s
create-singleton-feature Time: 2.8130698204040527s
vote-by-classifier Time: 2.8234658241271973s
Qnodes to lookup: 101997
Qnodes from file: 101571
Outlier removal generates 271 lof-voted candidates
score-using-embedding Time: 182.2152225971222s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.553135633468628s
compute-tf-idf-class_count Time: 194.87819242477417s
compute-tf-idf-property_count Time: 200.8549406528473s

30it [1:44:40, 208.62s/it]

split_30.csv: 31 of 50
align-page-rank Time: 2.039013624191284s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 25.545010805130005s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.96705985069275s
string-similarity-['jaro_winkler'] Time: 5.5736846923828125s
string-similarity-['levenshtein'] Time: 20.50182557106018s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4244682788848877s
normalize-scores-des_cont_jaccard Time: 0.49935364723205566s
smallest-qnode-number Time: 4.450753688812256s
mosaic-features Time: 0.2422175407409668s
create-singleton-feature Time: 2.414795398712158s
vote-by-classifier Time: 1.6161620616912842s
Qnodes to lookup: 101600
Qnodes from file: 101202
_centroid_of_lof: Missing 1 of 440
Outlier removal generates 263 lof-voted candidates
score-using-embedding Time: 186.2919557094574s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5047035217285156s
compute-tf-idf-class_count Time: 200.36944460868835s
compute-tf-idf-pro

31it [1:48:16, 210.81s/it]

split_31.csv: 32 of 50
align-page-rank Time: 2.039365530014038s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.307249546051025s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.29411005973816s
string-similarity-['jaro_winkler'] Time: 5.496874809265137s
string-similarity-['levenshtein'] Time: 21.148146390914917s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4047353267669678s
normalize-scores-des_cont_jaccard Time: 0.41672492027282715s
smallest-qnode-number Time: 4.113308906555176s
mosaic-features Time: 0.19344091415405273s
create-singleton-feature Time: 2.433891534805298s
vote-by-classifier Time: 3.017518997192383s
Qnodes to lookup: 101338
Qnodes from file: 100878
Outlier removal generates 235 lof-voted candidates
score-using-embedding Time: 189.26300525665283s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4266066551208496s
compute-tf-idf-class_count Time: 202.5829315185547s
compute-tf-idf-property_count Time: 208.3651859760284

32it [1:51:55, 213.13s/it]

split_32.csv: 33 of 50
align-page-rank Time: 2.0627377033233643s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.255211114883423s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.07423138618469s
string-similarity-['jaro_winkler'] Time: 5.549811363220215s
string-similarity-['levenshtein'] Time: 20.535226106643677s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4402458667755127s
normalize-scores-des_cont_jaccard Time: 0.4285087585449219s
smallest-qnode-number Time: 3.768028736114502s
mosaic-features Time: 0.19581270217895508s
create-singleton-feature Time: 2.535351514816284s
vote-by-classifier Time: 2.0371620655059814s
Qnodes to lookup: 101712
Qnodes from file: 101300
Outlier removal generates 275 lof-voted candidates
score-using-embedding Time: 188.0476679801941s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4810795783996582s
compute-tf-idf-class_count Time: 201.06855869293213s
compute-tf-idf-property_count Time: 207.549117803573

33it [1:55:32, 214.24s/it]

split_33.csv: 34 of 50
align-page-rank Time: 2.5744447708129883s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 24.023158073425293s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 80.81210112571716s
string-similarity-['jaro_winkler'] Time: 5.525862216949463s
string-similarity-['levenshtein'] Time: 20.736436128616333s
string-similarity-['jaccard:tokenizer=word'] Time: 1.652477741241455s
normalize-scores-des_cont_jaccard Time: 0.44212889671325684s
smallest-qnode-number Time: 3.8851685523986816s
mosaic-features Time: 0.19960761070251465s
create-singleton-feature Time: 2.4059090614318848s
vote-by-classifier Time: 7.483780145645142s
Qnodes to lookup: 100988
Qnodes from file: 100553
Outlier removal generates 263 lof-voted candidates
score-using-embedding Time: 198.9079978466034s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.7067959308624268s
compute-tf-idf-class_count Time: 213.8951280117035s
compute-tf-idf-property_count Time: 220.383133172988

34it [1:59:35, 223.06s/it]

split_34.csv: 35 of 50
align-page-rank Time: 2.303365468978882s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.967201709747314s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.14244818687439s
string-similarity-['jaro_winkler'] Time: 5.5810325145721436s
string-similarity-['levenshtein'] Time: 20.919477701187134s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3289954662322998s
normalize-scores-des_cont_jaccard Time: 0.44580745697021484s
smallest-qnode-number Time: 3.9440600872039795s
mosaic-features Time: 0.21620821952819824s
create-singleton-feature Time: 2.4973647594451904s
vote-by-classifier Time: 6.532684087753296s
Qnodes to lookup: 101520
Qnodes from file: 101053
Outlier removal generates 276 lof-voted candidates
score-using-embedding Time: 206.96429562568665s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.5979549884796143s
compute-tf-idf-class_count Time: 222.71094799041748s
compute-tf-idf-property_count Time: 228.779638051

35it [2:03:53, 233.49s/it]

split_35.csv: 36 of 50
align-page-rank Time: 2.0807952880859375s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 24.15466594696045s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 73.20738101005554s
string-similarity-['jaro_winkler'] Time: 6.586921453475952s
string-similarity-['levenshtein'] Time: 20.465311288833618s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3875885009765625s
normalize-scores-des_cont_jaccard Time: 0.4163858890533447s
smallest-qnode-number Time: 3.8227624893188477s
mosaic-features Time: 0.21348786354064941s
create-singleton-feature Time: 2.9662833213806152s
vote-by-classifier Time: 1.6718213558197021s
Qnodes to lookup: 102691
Qnodes from file: 102232
Outlier removal generates 263 lof-voted candidates
score-using-embedding Time: 179.25527787208557s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.6695163249969482s
compute-tf-idf-class_count Time: 193.04334688186646s
compute-tf-idf-property_count Time: 199.6523585319

36it [2:08:06, 239.42s/it]

split_36.csv: 37 of 50
align-page-rank Time: 2.1082239151000977s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 25.685604095458984s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.72697234153748s
string-similarity-['jaro_winkler'] Time: 5.582223176956177s
string-similarity-['levenshtein'] Time: 20.875514268875122s
string-similarity-['jaccard:tokenizer=word'] Time: 1.367037057876587s
normalize-scores-des_cont_jaccard Time: 0.41529154777526855s
smallest-qnode-number Time: 3.758143186569214s
mosaic-features Time: 0.19394946098327637s
create-singleton-feature Time: 2.442347764968872s
vote-by-classifier Time: 1.5911931991577148s
Qnodes to lookup: 102185
Qnodes from file: 101724
Outlier removal generates 248 lof-voted candidates
score-using-embedding Time: 189.7403781414032s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4704616069793701s
compute-tf-idf-class_count Time: 203.03749203681946s
compute-tf-idf-property_count Time: 209.538336992263

37it [2:11:44, 232.91s/it]

split_37.csv: 38 of 50
align-page-rank Time: 2.0135035514831543s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.514184713363647s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.21775460243225s
string-similarity-['jaro_winkler'] Time: 5.450303077697754s
string-similarity-['levenshtein'] Time: 20.18549132347107s
string-similarity-['jaccard:tokenizer=word'] Time: 1.367936134338379s
normalize-scores-des_cont_jaccard Time: 0.4193274974822998s
smallest-qnode-number Time: 4.18639349937439s
mosaic-features Time: 0.2075343132019043s
create-singleton-feature Time: 2.5769548416137695s
vote-by-classifier Time: 7.136332035064697s
Qnodes to lookup: 103179
Qnodes from file: 102728
Outlier removal generates 268 lof-voted candidates
score-using-embedding Time: 191.21672773361206s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4683988094329834s
compute-tf-idf-class_count Time: 204.0748918056488s
compute-tf-idf-property_count Time: 210.69290971755981s


38it [2:15:24, 228.88s/it]

split_38.csv: 39 of 50
align-page-rank Time: 2.0168802738189697s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 24.049296140670776s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.52488780021667s
string-similarity-['jaro_winkler'] Time: 5.5474159717559814s
string-similarity-['levenshtein'] Time: 20.33226728439331s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5351064205169678s
normalize-scores-des_cont_jaccard Time: 0.41187524795532227s
smallest-qnode-number Time: 3.7500498294830322s
mosaic-features Time: 0.19355988502502441s
create-singleton-feature Time: 2.451186180114746s
vote-by-classifier Time: 1.5173892974853516s
Qnodes to lookup: 100548
Qnodes from file: 100091
Outlier removal generates 246 lof-voted candidates
score-using-embedding Time: 179.7041630744934s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4625511169433594s
compute-tf-idf-class_count Time: 192.10939025878906s
compute-tf-idf-property_count Time: 200.4906792640

39it [2:18:54, 223.32s/it]

split_39.csv: 40 of 50
align-page-rank Time: 2.197352886199951s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.36162042617798s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.52441716194153s
string-similarity-['jaro_winkler'] Time: 5.452152252197266s
string-similarity-['levenshtein'] Time: 20.780701398849487s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3544480800628662s
normalize-scores-des_cont_jaccard Time: 0.4173145294189453s
smallest-qnode-number Time: 3.8408703804016113s
mosaic-features Time: 0.19958782196044922s
create-singleton-feature Time: 2.596242904663086s
vote-by-classifier Time: 1.4815411567687988s
Qnodes to lookup: 102248
Qnodes from file: 101785
_centroid_of_lof: Missing 1 of 385
Outlier removal generates 230 lof-voted candidates
score-using-embedding Time: 181.45892596244812s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.46120285987854s
compute-tf-idf-class_count Time: 194.58352947235107s
compute-tf-idf-prop

40it [2:22:24, 219.50s/it]

split_40.csv: 41 of 50
align-page-rank Time: 2.0065503120422363s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.942084789276123s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.91021871566772s
string-similarity-['jaro_winkler'] Time: 5.527280807495117s
string-similarity-['levenshtein'] Time: 20.36776065826416s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3321642875671387s
normalize-scores-des_cont_jaccard Time: 0.47192907333374023s
smallest-qnode-number Time: 4.222314119338989s
mosaic-features Time: 0.19377946853637695s
create-singleton-feature Time: 2.438140869140625s
vote-by-classifier Time: 1.3628082275390625s
Qnodes to lookup: 102471
Qnodes from file: 101975
Outlier removal generates 241 lof-voted candidates
score-using-embedding Time: 184.17892456054688s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4605433940887451s
compute-tf-idf-class_count Time: 199.27223086357117s
compute-tf-idf-property_count Time: 205.94260144233

41it [2:26:00, 218.19s/it]

split_41.csv: 42 of 50
align-page-rank Time: 2.2662353515625s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.403815984725952s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 78.67268681526184s
string-similarity-['jaro_winkler'] Time: 5.6011693477630615s
string-similarity-['levenshtein'] Time: 21.803374528884888s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3293001651763916s
normalize-scores-des_cont_jaccard Time: 0.416149377822876s
smallest-qnode-number Time: 3.7629122734069824s
mosaic-features Time: 0.19379711151123047s
create-singleton-feature Time: 2.5252976417541504s
vote-by-classifier Time: 7.210618019104004s
Qnodes to lookup: 102651
Qnodes from file: 102251
Outlier removal generates 204 lof-voted candidates
score-using-embedding Time: 225.1956205368042s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4310555458068848s
compute-tf-idf-class_count Time: 244.85719275474548s
compute-tf-idf-property_count Time: 250.86828303337097

42it [2:31:14, 247.01s/it]

split_42.csv: 43 of 50
align-page-rank Time: 2.550342082977295s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.1168270111084s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.72680735588074s
string-similarity-['jaro_winkler'] Time: 5.496581792831421s
string-similarity-['levenshtein'] Time: 20.54487657546997s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3621454238891602s
normalize-scores-des_cont_jaccard Time: 0.4247417449951172s
smallest-qnode-number Time: 3.670790672302246s
mosaic-features Time: 0.1965930461883545s
create-singleton-feature Time: 2.572842597961426s
vote-by-classifier Time: 1.5535650253295898s
Qnodes to lookup: 102868
Qnodes from file: 102372
Outlier removal generates 228 lof-voted candidates
score-using-embedding Time: 201.18433904647827s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.445277214050293s
compute-tf-idf-class_count Time: 217.4446952342987s
compute-tf-idf-property_count Time: 224.5693438053131s


43it [2:35:53, 256.61s/it]

split_43.csv: 44 of 50
align-page-rank Time: 1.9934022426605225s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.97088646888733s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.5662488937378s
string-similarity-['jaro_winkler'] Time: 5.55888819694519s
string-similarity-['levenshtein'] Time: 20.69571542739868s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3273978233337402s
normalize-scores-des_cont_jaccard Time: 0.47182321548461914s
smallest-qnode-number Time: 3.7521815299987793s
mosaic-features Time: 0.19816923141479492s
create-singleton-feature Time: 2.4567227363586426s
vote-by-classifier Time: 1.6312122344970703s
Qnodes to lookup: 101320
Qnodes from file: 100852
Outlier removal generates 176 lof-voted candidates
score-using-embedding Time: 175.55113339424133s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4931912422180176s
compute-tf-idf-class_count Time: 188.14073538780212s
compute-tf-idf-property_count Time: 193.725029468536

44it [2:39:15, 240.26s/it]

split_44.csv: 45 of 50
align-page-rank Time: 2.048050880432129s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.011032819747925s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 77.15899395942688s
string-similarity-['jaro_winkler'] Time: 5.4773900508880615s
string-similarity-['levenshtein'] Time: 20.59821367263794s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3551034927368164s
normalize-scores-des_cont_jaccard Time: 0.48537683486938477s
smallest-qnode-number Time: 3.798187255859375s
mosaic-features Time: 0.19275450706481934s
create-singleton-feature Time: 2.500507354736328s
vote-by-classifier Time: 1.6831414699554443s
Qnodes to lookup: 101617
Qnodes from file: 101201
Outlier removal generates 235 lof-voted candidates
score-using-embedding Time: 175.84857749938965s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.607255220413208s
compute-tf-idf-class_count Time: 187.35324788093567s
compute-tf-idf-property_count Time: 192.525866508483

45it [2:42:37, 228.72s/it]

split_45.csv: 46 of 50
align-page-rank Time: 2.107189655303955s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.636942863464355s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.89788246154785s
string-similarity-['jaro_winkler'] Time: 5.68625020980835s
string-similarity-['levenshtein'] Time: 20.847614526748657s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3839151859283447s
normalize-scores-des_cont_jaccard Time: 0.42606258392333984s
smallest-qnode-number Time: 3.860334634780884s
mosaic-features Time: 0.19571518898010254s
create-singleton-feature Time: 2.464385747909546s
vote-by-classifier Time: 1.6436269283294678s
Qnodes to lookup: 103115
Qnodes from file: 102685
_centroid_of_lof: Missing 1 of 529
Outlier removal generates 317 lof-voted candidates
score-using-embedding Time: 175.9442367553711s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.462432622909546s
compute-tf-idf-class_count Time: 188.4198760986328s
compute-tf-idf-prope

46it [2:46:00, 221.00s/it]

split_46.csv: 47 of 50
align-page-rank Time: 2.1986937522888184s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.111056089401245s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 79.20997714996338s
string-similarity-['jaro_winkler'] Time: 5.566278696060181s
string-similarity-['levenshtein'] Time: 19.98146367073059s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4043734073638916s
normalize-scores-des_cont_jaccard Time: 0.41917943954467773s
smallest-qnode-number Time: 3.677954912185669s
mosaic-features Time: 0.19442319869995117s
create-singleton-feature Time: 2.497929096221924s
vote-by-classifier Time: 1.729689359664917s
Qnodes to lookup: 103438
Qnodes from file: 103017
Outlier removal generates 278 lof-voted candidates
score-using-embedding Time: 178.43302488327026s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4687583446502686s
compute-tf-idf-class_count Time: 190.35654759407043s
compute-tf-idf-property_count Time: 195.466870784759

47it [2:49:24, 216.10s/it]

split_47.csv: 48 of 50
align-page-rank Time: 2.0246663093566895s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.689530611038208s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 74.99478650093079s
string-similarity-['jaro_winkler'] Time: 5.497133255004883s
string-similarity-['levenshtein'] Time: 20.649951219558716s
string-similarity-['jaccard:tokenizer=word'] Time: 1.381493330001831s
normalize-scores-des_cont_jaccard Time: 0.4163174629211426s
smallest-qnode-number Time: 3.7959446907043457s
mosaic-features Time: 0.193159818649292s
create-singleton-feature Time: 2.5138933658599854s
vote-by-classifier Time: 1.5062167644500732s
Qnodes to lookup: 103226
Qnodes from file: 102810
Outlier removal generates 323 lof-voted candidates
score-using-embedding Time: 174.79936289787292s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.474578857421875s
compute-tf-idf-class_count Time: 186.8765950202942s
compute-tf-idf-property_count Time: 191.98474502563477

48it [2:52:45, 211.42s/it]

split_48.csv: 49 of 50
align-page-rank Time: 2.014023780822754s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.608262300491333s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 76.41889429092407s
string-similarity-['jaro_winkler'] Time: 5.446140289306641s
string-similarity-['levenshtein'] Time: 20.56031084060669s
string-similarity-['jaccard:tokenizer=word'] Time: 1.6000192165374756s
normalize-scores-des_cont_jaccard Time: 0.42737436294555664s
smallest-qnode-number Time: 3.73067569732666s
mosaic-features Time: 0.19897246360778809s
create-singleton-feature Time: 2.4266467094421387s
vote-by-classifier Time: 1.4760348796844482s
Qnodes to lookup: 101851
Qnodes from file: 101432
Outlier removal generates 292 lof-voted candidates
score-using-embedding Time: 175.18187546730042s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4361016750335693s
compute-tf-idf-class_count Time: 186.13545632362366s
compute-tf-idf-property_count Time: 191.731784105300

49it [2:56:05, 208.05s/it]

split_49.csv: 50 of 50
align-page-rank Time: 2.2183539867401123s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.31024169921875s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 75.28602361679077s
string-similarity-['jaro_winkler'] Time: 5.629039525985718s
string-similarity-['levenshtein'] Time: 20.43037748336792s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3339831829071045s
normalize-scores-des_cont_jaccard Time: 0.39553332328796387s
smallest-qnode-number Time: 3.7197887897491455s
mosaic-features Time: 0.19759726524353027s
create-singleton-feature Time: 2.4863481521606445s
vote-by-classifier Time: 1.5447382926940918s
Qnodes to lookup: 103643
Qnodes from file: 103224
Outlier removal generates 312 lof-voted candidates
score-using-embedding Time: 174.2413935661316s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.4542596340179443s
compute-tf-idf-class_count Time: 185.37823343276978s
compute-tf-idf-property_count Time: 190.61177301406

50it [2:59:24, 215.30s/it]


In [17]:
def concat_files(i_path, output_file, sep=None):
    """Concatenate every file found in a directory into one delimited file.

    Args:
        i_path: Directory whose entries (``{i_path}/*``) are each read with
            ``pd.read_csv``. Every entry is read, so ``output_file`` should
            live outside ``i_path``; otherwise a re-run would ingest the
            previous output as well.
        output_file: Path of the combined file to write (index not written).
        sep: Field separator used for both reading and writing. A falsy value
            (the default ``None``) keeps pandas' default comma separator.

    Raises:
        ValueError: If ``i_path`` contains no files to concatenate.
    """
    # Forward ``sep`` only when one was supplied: an explicit sep=None would
    # make read_csv sniff the delimiter instead of using the comma default.
    csv_kwargs = {'sep': sep} if sep else {}

    # glob yields entries in arbitrary filesystem order; sort so the row order
    # of the combined file is reproducible across runs and machines.
    paths = sorted(glob.glob(f'{i_path}/*'))
    if not paths:
        raise ValueError(f'No files found to concatenate in {i_path}')

    frames = [pd.read_csv(path, **csv_kwargs) for path in paths]
    pd.concat(frames).to_csv(output_file, index=False, **csv_kwargs)

In [6]:
def run_context_match(features_path, context_path, custom_file, output_path):
    file_list = glob.glob(features_path + '/*.csv')
    for i, f in tqdm(enumerate(file_list)):
        if i > 26:
            f_name = f.split('/')[-1]
            print(f'{f_name}: {i+1} of {len(file_list)}')
            context_file = f"{context_path}/{f_name[:-4]}_context.tsv"
            output_file = f"{output_path}/{f_name}"
            !tl context-match --custom-context-file $custom_file \
            --context-file $context_file --string-separator ";" \
            --similarity-string-threshold $string_threshold $f > $output_file
        

In [7]:
# Input/output directories for the context-match stage (passed to run_context_match).
# NOTE(review): these sit directly under /data/amandeep/nih-dataset/, whereas HOME_DIR
# and the later prediction/colorized directories point at the .../person subdirectory —
# confirm the difference is intended.
context_path = '/data/amandeep/nih-dataset/temp/context'
features_path = '/data/amandeep/nih-dataset/features'
context_output_path = '/data/amandeep/nih-dataset/features_with_context'

In [8]:
# The recorded output of this cell covers only split_27..split_49 (list indices above 26)
# and shows a total wall time of roughly 10h20m.
run_context_match(features_path, context_path, '/data/amandeep/nih-dataset/coauthors.context.tsv.gz', context_output_path)

0it [00:00, ?it/s]

split_27.csv: 28 of 50


28it [26:14, 56.23s/it]

split_28.csv: 29 of 50


29it [53:29, 132.88s/it]

split_29.csv: 30 of 50


30it [1:20:50, 230.64s/it]

split_30.csv: 31 of 50


31it [1:47:41, 347.73s/it]

split_31.csv: 32 of 50


32it [2:14:27, 483.63s/it]

split_32.csv: 33 of 50


33it [2:41:56, 639.52s/it]

split_33.csv: 34 of 50


34it [3:07:38, 784.23s/it]

split_34.csv: 35 of 50


35it [3:34:03, 933.42s/it]

split_35.csv: 36 of 50


36it [4:01:15, 1080.41s/it]

split_36.csv: 37 of 50


37it [4:28:00, 1201.52s/it]

split_37.csv: 38 of 50


38it [4:55:00, 1305.31s/it]

split_38.csv: 39 of 50


39it [5:21:44, 1383.69s/it]

split_39.csv: 40 of 50


40it [5:48:03, 1436.78s/it]

split_40.csv: 41 of 50


41it [6:13:21, 1459.48s/it]

split_41.csv: 42 of 50


42it [6:40:13, 1503.08s/it]

split_42.csv: 43 of 50


43it [7:08:52, 1565.54s/it]

split_43.csv: 44 of 50


44it [7:35:28, 1574.59s/it]

split_44.csv: 45 of 50


45it [8:02:04, 1580.72s/it]

split_45.csv: 46 of 50


46it [8:29:25, 1598.74s/it]

split_46.csv: 47 of 50


47it [8:55:47, 1593.77s/it]

split_47.csv: 48 of 50


48it [9:23:19, 1611.22s/it]

split_48.csv: 49 of 50


49it [9:53:12, 1665.37s/it]

split_49.csv: 50 of 50


50it [10:20:31, 744.64s/it] 


In [9]:
# Directory that receives one prediction CSV per split (written by run_prediction).
prediction_path = '/data/amandeep/nih-dataset/person/predictions'

In [10]:
# Comma-separated feature list handed to `tl predict-using-model --features`.
# `features` is defined in the configuration cells at the top of the notebook.
features_str = ",".join(features)
def run_prediction(features_path, prediction_path):
    """Run ``tl predict-using-model`` on every ``*.csv`` file in ``features_path``.

    Each input file is scored and the command's stdout is redirected to
    ``{prediction_path}/<same file name>``. The score is written to the
    ``siamese_prediction`` output column (``-o``).

    Besides its arguments, the shell command expands these notebook-level
    variables: ``ranking_model_file_path``, ``features_str`` and
    ``min_max_scaler_path``.

    Args:
        features_path: Directory containing the input ``*.csv`` files.
        prediction_path: Directory that receives one output CSV per input file.
    """
    file_list = glob.glob(features_path + '/*.csv')
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        output_file = f"{prediction_path}/{f_name}"
        # IPython expands the $names below from Python variables before running the shell command.
        !tl predict-using-model -o siamese_prediction \
        --ranking-model $ranking_model_file_path \
        --features $features_str \
        --normalization-factor $min_max_scaler_path $f > $output_file

In [11]:
# Score the context-matched splits; the recorded output shows 50 files in about 20 minutes.
run_prediction(context_output_path, prediction_path)

0it [00:00, ?it/s]

split_0.csv: 1 of 50
predict-using-model Time: 9.645196199417114s


1it [01:08, 68.19s/it]

split_1.csv: 2 of 50
predict-using-model Time: 7.076881408691406s


2it [01:23, 36.96s/it]

split_2.csv: 3 of 50
predict-using-model Time: 7.130551338195801s


3it [01:41, 28.23s/it]

split_3.csv: 4 of 50
predict-using-model Time: 7.226583957672119s


4it [02:18, 31.65s/it]

split_4.csv: 5 of 50
predict-using-model Time: 7.799553394317627s


5it [02:34, 26.21s/it]

split_5.csv: 6 of 50
predict-using-model Time: 6.9397900104522705s


6it [03:02, 26.83s/it]

split_6.csv: 7 of 50
predict-using-model Time: 8.625433444976807s


7it [03:39, 30.01s/it]

split_7.csv: 8 of 50
predict-using-model Time: 7.065806150436401s


8it [04:00, 27.27s/it]

split_8.csv: 9 of 50
predict-using-model Time: 7.75649094581604s


9it [04:17, 23.90s/it]

split_9.csv: 10 of 50
predict-using-model Time: 7.754991292953491s


10it [04:44, 25.03s/it]

split_10.csv: 11 of 50
predict-using-model Time: 7.071519136428833s


11it [05:23, 29.42s/it]

split_11.csv: 12 of 50
predict-using-model Time: 7.321372747421265s


12it [05:39, 25.27s/it]

split_12.csv: 13 of 50
predict-using-model Time: 7.252319097518921s


13it [06:20, 29.91s/it]

split_13.csv: 14 of 50
predict-using-model Time: 8.635634422302246s


14it [06:37, 25.96s/it]

split_14.csv: 15 of 50
predict-using-model Time: 8.335168600082397s


15it [06:55, 23.57s/it]

split_15.csv: 16 of 50
predict-using-model Time: 7.788127660751343s


16it [07:38, 29.47s/it]

split_16.csv: 17 of 50
predict-using-model Time: 7.827411413192749s


17it [08:20, 33.42s/it]

split_17.csv: 18 of 50
predict-using-model Time: 6.885857343673706s


18it [08:35, 27.77s/it]

split_18.csv: 19 of 50
predict-using-model Time: 7.514395236968994s


19it [08:53, 24.90s/it]

split_19.csv: 20 of 50
predict-using-model Time: 7.899179458618164s


20it [09:38, 30.85s/it]

split_20.csv: 21 of 50
predict-using-model Time: 6.865811824798584s


21it [09:53, 26.22s/it]

split_21.csv: 22 of 50
predict-using-model Time: 7.0593554973602295s


22it [10:19, 26.01s/it]

split_22.csv: 23 of 50
predict-using-model Time: 6.987247943878174s


23it [10:38, 23.93s/it]

split_23.csv: 24 of 50
predict-using-model Time: 7.262299060821533s


24it [10:56, 22.00s/it]

split_24.csv: 25 of 50
predict-using-model Time: 7.529797315597534s


25it [11:33, 26.70s/it]

split_25.csv: 26 of 50
predict-using-model Time: 6.773606538772583s


26it [11:54, 24.95s/it]

split_26.csv: 27 of 50
predict-using-model Time: 6.766122579574585s


27it [12:10, 22.27s/it]

split_27.csv: 28 of 50
predict-using-model Time: 7.525735855102539s


28it [12:38, 23.96s/it]

split_28.csv: 29 of 50
predict-using-model Time: 7.284029483795166s


29it [12:54, 21.60s/it]

split_29.csv: 30 of 50
predict-using-model Time: 6.971604824066162s


30it [13:13, 20.66s/it]

split_30.csv: 31 of 50
predict-using-model Time: 7.591525554656982s


31it [13:40, 22.68s/it]

split_31.csv: 32 of 50
predict-using-model Time: 7.662627220153809s


32it [14:03, 22.70s/it]

split_32.csv: 33 of 50
predict-using-model Time: 6.665076732635498s


33it [14:18, 20.46s/it]

split_33.csv: 34 of 50
predict-using-model Time: 8.243146181106567s


34it [14:42, 21.44s/it]

split_34.csv: 35 of 50
predict-using-model Time: 7.422563076019287s


35it [14:58, 20.05s/it]

split_35.csv: 36 of 50
predict-using-model Time: 7.438463449478149s


36it [15:17, 19.56s/it]

split_36.csv: 37 of 50
predict-using-model Time: 7.5390753746032715s


37it [15:45, 22.10s/it]

split_37.csv: 38 of 50
predict-using-model Time: 7.765990734100342s


38it [16:02, 20.53s/it]

split_38.csv: 39 of 50
predict-using-model Time: 6.60460638999939s


39it [16:19, 19.61s/it]

split_39.csv: 40 of 50
predict-using-model Time: 7.662938117980957s


40it [16:48, 22.48s/it]

split_40.csv: 41 of 50
predict-using-model Time: 6.90322732925415s


41it [17:14, 23.35s/it]

split_41.csv: 42 of 50
predict-using-model Time: 7.047000885009766s


42it [17:29, 21.00s/it]

split_42.csv: 43 of 50
predict-using-model Time: 7.476243257522583s


43it [17:48, 20.22s/it]

split_43.csv: 44 of 50
predict-using-model Time: 8.246151685714722s


44it [18:20, 23.91s/it]

split_44.csv: 45 of 50
predict-using-model Time: 7.284520626068115s


45it [18:48, 25.06s/it]

split_45.csv: 46 of 50
predict-using-model Time: 6.5975022315979s


46it [19:02, 21.66s/it]

split_46.csv: 47 of 50
predict-using-model Time: 8.153709173202515s


47it [19:18, 19.92s/it]

split_47.csv: 48 of 50
predict-using-model Time: 7.924069166183472s


48it [19:35, 19.17s/it]

split_48.csv: 49 of 50
predict-using-model Time: 7.273780107498169s


49it [19:50, 17.94s/it]

split_49.csv: 50 of 50
predict-using-model Time: 8.205457925796509s


50it [20:05, 24.12s/it]


In [12]:
# Directory that receives one colorized .xlsx workbook per split (written by topk_color).
colorized_path = '/data/amandeep/nih-dataset/person/colorized'

In [13]:
def topk_color(prediction_path, colorized_path):
    """Run ``tl get-kg-links`` followed by ``add-color`` on every prediction CSV.

    For each ``<name>.csv`` in ``prediction_path`` the two-command ``tl``
    pipeline is executed and the result is written to
    ``{colorized_path}/<name>.xlsx`` via ``add-color --output``. Both commands
    are given the notebook-level ``final_score_column`` and ``-k 5``.

    Args:
        prediction_path: Directory containing the prediction ``*.csv`` files.
        colorized_path: Directory that receives one ``.xlsx`` file per input.
    """
    file_list = glob.glob(prediction_path + '/*.csv')
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        # f_name[:-4] strips the ".csv" suffix before appending ".xlsx".
        output_file = f"{colorized_path}/{f_name[:-4]}.xlsx"
        # The "/" starting the continuation line chains the two tl commands; the recorded
        # output shows timings for both get-kg-links and add-color on every file.
        # NOTE(review): presumably -k 5 keeps the five best-scoring candidates per cell — confirm
        # against the table-linker documentation.
        !tl get-kg-links -c $final_score_column -k 5 --k-rows $f \
        / add-color -c "$final_score_column" -k 5 --output "$output_file"

In [14]:
# Build the colorized workbooks; the recorded output shows 50 files in about 12 minutes.
topk_color(prediction_path, colorized_path)

0it [00:00, ?it/s]

split_0.csv: 1 of 50
get-kg-links-siamese_prediction Time: 6.869730472564697s
add-color Time: 1.7595765590667725s


1it [00:15, 15.27s/it]

split_1.csv: 2 of 50
get-kg-links-siamese_prediction Time: 5.5476555824279785s
add-color Time: 1.4353773593902588s


2it [00:28, 14.20s/it]

split_2.csv: 3 of 50
get-kg-links-siamese_prediction Time: 5.605919361114502s
add-color Time: 1.4305763244628906s


3it [00:42, 14.10s/it]

split_3.csv: 4 of 50
get-kg-links-siamese_prediction Time: 6.473447561264038s
add-color Time: 1.4325153827667236s


4it [00:57, 14.33s/it]

split_4.csv: 5 of 50
get-kg-links-siamese_prediction Time: 5.427431583404541s
add-color Time: 1.2795379161834717s


5it [01:10, 14.04s/it]

split_5.csv: 6 of 50
get-kg-links-siamese_prediction Time: 6.655204772949219s
add-color Time: 1.4115636348724365s


6it [01:26, 14.48s/it]

split_6.csv: 7 of 50
get-kg-links-siamese_prediction Time: 6.13248085975647s
add-color Time: 1.7175161838531494s


7it [01:40, 14.50s/it]

split_7.csv: 8 of 50
get-kg-links-siamese_prediction Time: 5.841087341308594s
add-color Time: 1.4092669486999512s


8it [01:54, 14.38s/it]

split_8.csv: 9 of 50
get-kg-links-siamese_prediction Time: 5.5871336460113525s
add-color Time: 1.4793803691864014s


9it [02:08, 14.17s/it]

split_9.csv: 10 of 50
get-kg-links-siamese_prediction Time: 6.266153335571289s
add-color Time: 1.4399797916412354s


10it [02:23, 14.24s/it]

split_10.csv: 11 of 50
get-kg-links-siamese_prediction Time: 5.574220418930054s
add-color Time: 1.653367042541504s


11it [02:37, 14.36s/it]

split_11.csv: 12 of 50
get-kg-links-siamese_prediction Time: 5.804218053817749s
add-color Time: 1.4569971561431885s


12it [02:51, 14.14s/it]

split_12.csv: 13 of 50
get-kg-links-siamese_prediction Time: 5.610262870788574s
add-color Time: 1.440141201019287s


13it [03:05, 14.13s/it]

split_13.csv: 14 of 50
get-kg-links-siamese_prediction Time: 5.5937652587890625s
add-color Time: 1.4604761600494385s


14it [03:19, 14.06s/it]

split_14.csv: 15 of 50
get-kg-links-siamese_prediction Time: 5.729668855667114s
add-color Time: 1.4819111824035645s


15it [03:33, 13.99s/it]

split_15.csv: 16 of 50
get-kg-links-siamese_prediction Time: 5.985253810882568s
add-color Time: 1.483978271484375s


16it [03:47, 14.23s/it]

split_16.csv: 17 of 50
get-kg-links-siamese_prediction Time: 5.859854221343994s
add-color Time: 1.4434282779693604s


17it [04:02, 14.32s/it]

split_17.csv: 18 of 50
get-kg-links-siamese_prediction Time: 6.021014928817749s
add-color Time: 1.4186418056488037s


18it [04:16, 14.30s/it]

split_18.csv: 19 of 50
get-kg-links-siamese_prediction Time: 6.489190340042114s
add-color Time: 1.583705186843872s


19it [04:31, 14.44s/it]

split_19.csv: 20 of 50
get-kg-links-siamese_prediction Time: 5.825910329818726s
add-color Time: 1.4517402648925781s


20it [04:45, 14.22s/it]

split_20.csv: 21 of 50
get-kg-links-siamese_prediction Time: 5.502074718475342s
add-color Time: 1.4109477996826172s


21it [04:59, 14.16s/it]

split_21.csv: 22 of 50
get-kg-links-siamese_prediction Time: 5.686086416244507s
add-color Time: 1.4330735206604004s


22it [05:13, 14.13s/it]

split_22.csv: 23 of 50
get-kg-links-siamese_prediction Time: 5.849212408065796s
add-color Time: 1.6981568336486816s


23it [05:27, 14.13s/it]

split_23.csv: 24 of 50
get-kg-links-siamese_prediction Time: 5.746508359909058s
add-color Time: 1.422515869140625s


24it [05:41, 14.03s/it]

split_24.csv: 25 of 50
get-kg-links-siamese_prediction Time: 5.5991621017456055s
add-color Time: 1.4162545204162598s


25it [05:54, 13.93s/it]

split_25.csv: 26 of 50
get-kg-links-siamese_prediction Time: 5.615600347518921s
add-color Time: 1.4172861576080322s


26it [06:08, 13.89s/it]

split_26.csv: 27 of 50
get-kg-links-siamese_prediction Time: 5.746024131774902s
add-color Time: 1.4324703216552734s


27it [06:22, 13.79s/it]

split_27.csv: 28 of 50
get-kg-links-siamese_prediction Time: 5.570925951004028s
add-color Time: 1.4672470092773438s


28it [06:36, 13.88s/it]

split_28.csv: 29 of 50
get-kg-links-siamese_prediction Time: 5.741398811340332s
add-color Time: 1.4315505027770996s


29it [06:50, 13.95s/it]

split_29.csv: 30 of 50
get-kg-links-siamese_prediction Time: 5.762377023696899s
add-color Time: 1.4969313144683838s


30it [07:04, 14.00s/it]

split_30.csv: 31 of 50
get-kg-links-siamese_prediction Time: 5.637322187423706s
add-color Time: 1.61122465133667s


31it [07:18, 13.96s/it]

split_31.csv: 32 of 50
get-kg-links-siamese_prediction Time: 6.052750110626221s
add-color Time: 1.494025707244873s


32it [07:33, 14.19s/it]

split_32.csv: 33 of 50
get-kg-links-siamese_prediction Time: 5.595155477523804s
add-color Time: 1.5671918392181396s


33it [07:47, 14.09s/it]

split_33.csv: 34 of 50
get-kg-links-siamese_prediction Time: 5.742813587188721s
add-color Time: 1.4433143138885498s


34it [08:01, 14.06s/it]

split_34.csv: 35 of 50
get-kg-links-siamese_prediction Time: 5.6178858280181885s
add-color Time: 1.44110107421875s


35it [08:14, 13.95s/it]

split_35.csv: 36 of 50
get-kg-links-siamese_prediction Time: 5.922359943389893s
add-color Time: 1.3242626190185547s


36it [08:28, 13.96s/it]

split_36.csv: 37 of 50
get-kg-links-siamese_prediction Time: 5.616475343704224s
add-color Time: 1.7249736785888672s


37it [08:42, 14.02s/it]

split_37.csv: 38 of 50
get-kg-links-siamese_prediction Time: 5.580975770950317s
add-color Time: 1.7436583042144775s


38it [08:56, 14.05s/it]

split_38.csv: 39 of 50
get-kg-links-siamese_prediction Time: 5.759387731552124s
add-color Time: 1.4033842086791992s


39it [09:10, 14.02s/it]

split_39.csv: 40 of 50
get-kg-links-siamese_prediction Time: 5.9113500118255615s
add-color Time: 1.4501399993896484s


40it [09:24, 14.01s/it]

split_40.csv: 41 of 50
get-kg-links-siamese_prediction Time: 5.62574315071106s
add-color Time: 1.4274742603302002s


41it [09:38, 13.89s/it]

split_41.csv: 42 of 50
get-kg-links-siamese_prediction Time: 6.042596101760864s
add-color Time: 1.4350101947784424s


42it [09:53, 14.16s/it]

split_42.csv: 43 of 50
get-kg-links-siamese_prediction Time: 6.304434299468994s
add-color Time: 1.4188194274902344s


43it [10:07, 14.22s/it]

split_43.csv: 44 of 50
get-kg-links-siamese_prediction Time: 5.605419397354126s
add-color Time: 1.4379394054412842s


44it [10:21, 14.03s/it]

split_44.csv: 45 of 50
get-kg-links-siamese_prediction Time: 5.632838010787964s
add-color Time: 1.4317662715911865s


45it [10:35, 13.98s/it]

split_45.csv: 46 of 50
get-kg-links-siamese_prediction Time: 6.536598443984985s
add-color Time: 1.4628348350524902s


46it [10:50, 14.34s/it]

split_46.csv: 47 of 50
get-kg-links-siamese_prediction Time: 6.204100847244263s
add-color Time: 1.4505681991577148s


47it [11:05, 14.53s/it]

split_47.csv: 48 of 50
get-kg-links-siamese_prediction Time: 6.293227195739746s
add-color Time: 1.617821216583252s


48it [11:20, 14.62s/it]

split_48.csv: 49 of 50
get-kg-links-siamese_prediction Time: 5.665173768997192s
add-color Time: 1.4994769096374512s


49it [11:34, 14.51s/it]

split_49.csv: 50 of 50
get-kg-links-siamese_prediction Time: 5.610145568847656s
add-color Time: 1.451505184173584s


50it [11:48, 14.16s/it]


In [25]:
def add_NILS(colorized_path, output_path, threshold=0.9, score_column='siamese_prediction'):
    """Replace low-confidence links with ``'NIL'`` and save each workbook as CSV.

    Every ``*.xlsx`` file in ``colorized_path`` is loaded, rows whose score is
    strictly below ``threshold`` get their ``kg_id`` overwritten with
    ``'NIL'``, and the result is written to ``{output_path}/<name>.csv``.

    Args:
        colorized_path: Directory containing the ``*.xlsx`` input files.
        output_path: Directory that receives one CSV per input workbook.
        threshold: Minimum score a row needs to keep its ``kg_id``. The default
            0.9 matches the value that used to be hard-coded here (and the
            notebook's ``siamese_threshold`` setting).
        score_column: Name of the column holding the model score. The default
            matches the previously hard-coded ``'siamese_prediction'``.

    Note:
        Rows whose score is missing compare as not-below-threshold (NaN < x is
        False), so their ``kg_id`` is left untouched.
    """
    file_list = glob.glob(colorized_path + '/*.xlsx')
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        # f_name[:-5] strips the ".xlsx" suffix before appending ".csv".
        output_file = f"{output_path}/{f_name[:-5]}.csv"

        df = pd.read_excel(f)
        below_threshold = df[score_column].astype(float) < threshold
        df.loc[below_threshold, 'kg_id'] = 'NIL'
        df.to_csv(output_file, index=False)

In [18]:
# Directory that receives the per-split CSVs produced by add_NILS.
# NOTE(review): this cell carries execution count In [18] while the add_NILS definition above
# carries In [25], i.e. the definition was re-run after this call — verify the saved outputs
# with Restart & Run All.
nil_path = '/data/amandeep/nih-dataset/person/with_nils'
add_NILS(colorized_path, nil_path)

0it [00:00, ?it/s]

split_0.xlsx: 1 of 50


1it [00:03,  3.12s/it]

split_1.xlsx: 2 of 50


2it [00:06,  3.14s/it]

split_2.xlsx: 3 of 50


3it [00:09,  3.00s/it]

split_3.xlsx: 4 of 50


4it [00:12,  3.21s/it]

split_4.xlsx: 5 of 50


5it [00:15,  3.25s/it]

split_5.xlsx: 6 of 50


6it [00:18,  3.18s/it]

split_6.xlsx: 7 of 50


7it [00:22,  3.13s/it]

split_7.xlsx: 8 of 50


8it [00:24,  3.03s/it]

split_8.xlsx: 9 of 50


9it [00:27,  2.86s/it]

split_9.xlsx: 10 of 50


10it [00:30,  2.84s/it]

split_10.xlsx: 11 of 50


11it [00:32,  2.74s/it]

split_11.xlsx: 12 of 50


12it [00:35,  2.63s/it]

split_12.xlsx: 13 of 50


13it [00:37,  2.59s/it]

split_13.xlsx: 14 of 50


14it [00:40,  2.58s/it]

split_14.xlsx: 15 of 50


15it [00:42,  2.55s/it]

split_15.xlsx: 16 of 50


16it [00:44,  2.50s/it]

split_16.xlsx: 17 of 50


17it [00:47,  2.54s/it]

split_17.xlsx: 18 of 50


18it [00:50,  2.58s/it]

split_18.xlsx: 19 of 50


19it [00:52,  2.56s/it]

split_19.xlsx: 20 of 50


20it [00:55,  2.64s/it]

split_20.xlsx: 21 of 50


21it [00:58,  2.82s/it]

split_21.xlsx: 22 of 50


22it [01:01,  2.90s/it]

split_22.xlsx: 23 of 50


23it [01:04,  2.85s/it]

split_23.xlsx: 24 of 50


24it [01:07,  2.82s/it]

split_24.xlsx: 25 of 50


25it [01:09,  2.69s/it]

split_25.xlsx: 26 of 50


26it [01:12,  2.63s/it]

split_26.xlsx: 27 of 50


27it [01:14,  2.60s/it]

split_27.xlsx: 28 of 50


28it [01:17,  2.56s/it]

split_28.xlsx: 29 of 50


29it [01:19,  2.50s/it]

split_29.xlsx: 30 of 50


30it [01:22,  2.49s/it]

split_30.xlsx: 31 of 50


31it [01:24,  2.48s/it]

split_31.xlsx: 32 of 50


32it [01:27,  2.48s/it]

split_32.xlsx: 33 of 50


33it [01:29,  2.46s/it]

split_33.xlsx: 34 of 50


34it [01:31,  2.46s/it]

split_34.xlsx: 35 of 50


35it [01:34,  2.48s/it]

split_35.xlsx: 36 of 50


36it [01:37,  2.67s/it]

split_36.xlsx: 37 of 50


37it [01:40,  2.71s/it]

split_37.xlsx: 38 of 50


38it [01:43,  2.78s/it]

split_38.xlsx: 39 of 50


39it [01:46,  2.92s/it]

split_39.xlsx: 40 of 50


40it [01:49,  2.97s/it]

split_40.xlsx: 41 of 50


41it [01:52,  2.99s/it]

split_41.xlsx: 42 of 50


42it [01:55,  2.86s/it]

split_42.xlsx: 43 of 50


43it [01:57,  2.78s/it]

split_43.xlsx: 44 of 50


44it [02:00,  2.77s/it]

split_44.xlsx: 45 of 50


45it [02:03,  2.69s/it]

split_45.xlsx: 46 of 50


46it [02:05,  2.64s/it]

split_46.xlsx: 47 of 50


47it [02:08,  2.57s/it]

split_47.xlsx: 48 of 50


48it [02:10,  2.54s/it]

split_48.xlsx: 49 of 50


49it [02:12,  2.52s/it]

split_49.xlsx: 50 of 50


50it [02:15,  2.71s/it]


In [26]:
def count_non_nils(nil_path):
    """Print how many rows exist in total and how many carry a non-'NIL' kg_id.

    All ``*.csv`` files in ``nil_path`` are read and stacked; the first number
    printed is the combined row count, the second is the number of rows whose
    ``kg_id`` differs from the string ``'NIL'``.

    Args:
        nil_path: Directory containing the CSV files to inspect.
    """
    csv_paths = glob.glob(nil_path + '/*.csv')
    frames = []
    for _, csv_path in tqdm(enumerate(csv_paths)):
        frames.append(pd.read_csv(csv_path))
    combined = pd.concat(frames)

    total_rows = len(combined)
    print(total_rows)

    is_linked = combined['kg_id'] != 'NIL'
    print(len(combined[is_linked]))

        

In [27]:
# Recorded output: 51568 rows in total, of which 21910 have a kg_id other than 'NIL'.
count_non_nils(nil_path)

50it [00:00, 120.06it/s]

51568
21910





In [33]:
def join(nil_path, output_file='/tmp/joined.csv'):
    """Stack every ``*.csv`` file in ``nil_path`` into a single CSV file.

    Args:
        nil_path: Directory containing the per-split CSV files.
        output_file: Destination of the combined CSV (index not written).
            Defaults to ``/tmp/joined.csv``, the path that used to be
            hard-coded here and that the later ``tl join`` cell reads.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``nil_path`` holds no
            CSV files.
    """
    file_list = glob.glob(nil_path + '/*.csv')
    # tqdm wraps enumerate only to keep the same progress display as the other stages.
    frames = [pd.read_csv(f) for _, f in tqdm(enumerate(file_list))]
    pd.concat(frames).to_csv(output_file, index=False)

In [34]:
# Combine the per-split NIL-annotated CSVs into /tmp/joined.csv for the `tl join` step.
join(nil_path)

50it [00:06,  7.86it/s]


In [36]:
# Write a comma-separated copy of the input table next to the original TSV;
# table_path[:-4] strips the ".tsv" suffix before ".csv" is appended.
pd.read_csv(table_path, sep='\t').to_csv(table_path[:-4] + '.csv', index=False)

In [38]:
# Run `tl join` with the CSV copy of the input table (-f) and the combined candidates file
# (--extra-info /tmp/joined.csv), redirecting the result to *_joined.csv.
# The -f path is the literal value of table_path[:-4] + '.csv' from the previous cell.
!tl join -c siamese_prediction -f /data/amandeep/nih-dataset/person/tl_person_coinvestigator_with_qnihid.csv \
--extra-info /tmp/joined.csv > '/data/amandeep/nih-dataset/person/tl_person_coinvestigator_with_qnihid_joined.csv'

join Time: 98.23094987869263s


In [78]:
def create_replace_nodes_mapping(joined_file, mapping_file):
    """Write a ``node1 / label / node2 / confidence`` mapping TSV from the joined table.

    The joined CSV is reduced to the columns ``person_qnihid`` (-> ``node1``),
    ``person_name_kg_id`` (-> ``node2``) and ``person_name_score``
    (-> ``confidence``), with a constant ``label`` of ``'same_as_item'``.
    Duplicate ``(node1, node2)`` pairs are collapsed to their first occurrence
    and rows whose ``node2`` is ``'NIL'`` are discarded. A ``node1`` is written
    only if exactly one row remains for it; a ``node1`` left with several
    different ``node2`` values is treated as ambiguous and omitted entirely.
    Output rows are ordered by ``node1``.

    Args:
        joined_file: Path of the CSV to read.
        mapping_file: Path of the tab-separated mapping file to write.

    Raises:
        KeyError: If one of the three required columns is missing.

    Note:
        An input without any usable row produces a header-only file instead of
        failing inside ``pd.concat``.

    NOTE(review): only the literal string ``'NIL'`` is filtered. A row whose
    ``person_name_kg_id`` is empty/NaN is kept and counted like a real link,
    so it can end up in the mapping with an empty ``node2`` — confirm whether
    the joined file can contain such rows.
    """
    column_map = {
        'person_qnihid': 'node1',
        'person_name_kg_id': 'node2',
        'person_name_score': 'confidence',
    }
    mapping = (
        pd.read_csv(joined_file)[list(column_map)]
        .rename(columns=column_map)
        .assign(label='same_as_item')
        .loc[:, ['node1', 'label', 'node2', 'confidence']]
        .drop_duplicates(subset=['node1', 'node2'])
    )

    # Rows without a node1 cannot be mapped (a groupby on node1 would drop them too).
    linked = mapping[mapping['node1'].notna() & (mapping['node2'] != 'NIL')]

    # After de-duplication, a node1 that still appears more than once points at
    # several different node2 values; keep=False flags every row of such a node1.
    unambiguous = linked[~linked.duplicated(subset='node1', keep=False)]

    unambiguous.sort_values('node1').to_csv(mapping_file, sep='\t', index=False)

In [79]:
# Build the mapping file from the output of the `tl join` step above; the resulting TSV is the
# --mapping-file named in the `kgtk replace-nodes` command at the end of the notebook.
create_replace_nodes_mapping('/data/amandeep/nih-dataset/person/tl_person_coinvestigator_with_qnihid_joined.csv', '/data/amandeep/nih-dataset/person/replace_nodes_mapping.tsv')

In [60]:
def concat_person_files(input_dir="/data/amandeep/nih-dataset/person/kgtk-files-nih V2.0",
                        output_file='/data/amandeep/nih-dataset/person/nih-person-kgtk.tsv'):
    """Concatenate the tab-separated files of a directory into one TSV file.

    Args:
        input_dir: Directory searched with the pattern ``{input_dir}/*tsv``.
            Defaults to the previously hard-coded NIH person KGTK directory.
        output_file: Destination of the combined TSV (index not written).
            Defaults to the previously hard-coded ``nih-person-kgtk.tsv`` path.

    Raises:
        ValueError: If no matching file is found in ``input_dir``.

    NOTE(review): the files are round-tripped through pandas with default
    dtype inference and quoting — confirm this preserves KGTK values verbatim.
    """
    # Sort so that the row order of the combined file does not depend on
    # the arbitrary order in which glob lists the directory.
    paths = sorted(glob.glob(f"{input_dir}/*tsv"))
    if not paths:
        raise ValueError(f'No *tsv files found in {input_dir}')

    frames = [pd.read_csv(path, sep='\t') for path in paths]
    pd.concat(frames).to_csv(output_file, sep='\t', index=False)

In [61]:
# Produce nih-person-kgtk.tsv, the -i input named in the `kgtk replace-nodes` command below.
concat_person_files()

kgtk replace-nodes -i nih-person-kgtk.tsv -o nih-person-kgtk-wikidata-qnodes.tsv --mapping-file replace_nodes_mapping.tsv