In [1]:
import glob
import os
import pickle
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# ---------------------------------------------------------------------------
# Pipeline configuration: every path / threshold used by the cells below.
# ---------------------------------------------------------------------------
HOME_DIR = '/data/amandeep/nih-dataset/organization'
table_path = f'{HOME_DIR}/org_for_tl_with_qnode.tsv'
f_name = table_path.split("/")[-1]
wikify_column_name = "name,city,state,country"
final_score_column = "siamese_prediction"

# Intermediate files written under {HOME_DIR}/temp
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
aux_field = 'graph_embedding_complex,class_count,property_count,context'
temp_dir = f'{HOME_DIR}/temp/temp'

# Model artefacts (relative to the notebook's working directory)
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_5_loss_0.09882864356040955_top1_0.8968926553672316.pth'
min_max_scaler_path = './models/normalization_factor.pkl'

model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'
context_score_file = f'{HOME_DIR}/temp/context_score_file.csv'

output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
# Drop the file extension properly. str.strip(".csv") removes any of the
# characters '.', 'c', 's', 'v' from both ends, which mangled
# "org_for_tl_with_qnode.tsv" into "org_for_tl_with_qnode.t".
colorized_kg_links = f'{HOME_DIR}/temp/{os.path.splitext(f_name)[0]}_colorized.xlsx'

# Per-field auxiliary files
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
context_file = f'{HOME_DIR}/temp/context.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

string_threshold = 0.9
siamese_threshold = 0.9
# NOTE(review): the two paths below are absolute paths on a different user's
# machine, not under HOME_DIR -- confirm they exist where this notebook runs.
custom_context_file = '/Users/grantxie/test/coi3/coauthors.context.tsv.gz'
gt = '/Users/grantxie/Downloads/groundtruth_new.csv'
selection_save_path = 'test_selection.csv'
labeled_path = 'test_eva.csv'

In [3]:
# Feature names, one per line; the order of this list is preserved exactly.
# NOTE(review): presumably these are the candidate-table columns fed to the
# ranking model -- confirm against the cell that consumes `features`.
features = [
    'pagerank',
    'retrieval_score',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard',
    'jaro_winkler',
    'levenshtein',
    'singleton',
    'num_char',
    'num_tokens',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
    'lof-graph-embedding-score',
    'lof-reciprocal-rank',
    'context_score',
]

In [4]:
# Sanity check: number of entries in the `features` list.
print(len(features))

15


In [5]:
# Confirm the input table exists before running the pipeline.
!ls "$table_path"

/data/amandeep/nih-dataset/organization/org_for_tl_with_qnode.tsv


### Canonicalize

In [6]:
# Run `tl canonicalize` on the input table for the columns listed in
# wikify_column_name and redirect its stdout to canonical_file_path.
# NOTE(review): `-s` looks like it names extra columns to carry along and
# `--tsv` marks the input as tab-separated -- confirm against the tl CLI help.
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" -s org_node,city_node,state_node,country_node --tsv \
> "$canonical_file_path"

canonicalize Time: 0.13051271438598633s


In [7]:
# Preview the first five rows of the canonicalized file.
pd.read_csv(canonical_file_path, nrows = 5)

Unnamed: 0,column,row,label,context,filename,column-id
0,0,0,UNIVERSITY OF WASHINGTON,SEATTLE|WA|UNITED STATES,org_for_tl_with_qnode.tsv,org_for_tl_with_qnode.tsv-0
1,0,1,CHARLES R. DREW UNIVERSITY OF MEDICAL & SCIENCE,LOS ANGELES|CA|UNITED STATES,org_for_tl_with_qnode.tsv,org_for_tl_with_qnode.tsv-0
2,0,2,"UNIVERSITY OF CALIFORNIA, SAN DIEGO",LA JOLLA|CA|UNITED STATES,org_for_tl_with_qnode.tsv,org_for_tl_with_qnode.tsv-0
3,0,3,UNIVERSITY OF MIAMI SCHOOL OF MEDICINE,CORAL GABLES|FL|UNITED STATES,org_for_tl_with_qnode.tsv,org_for_tl_with_qnode.tsv-0
4,0,4,BAYLOR UNIVERSITY,WACO|TX|UNITED STATES,org_for_tl_with_qnode.tsv,org_for_tl_with_qnode.tsv-0


In [8]:
# Show size and timestamp of the canonical file that was just written.
!ls -l $canonical_file_path

-rw-r--r-- 1 amandeep isdstaff 2742224 Aug  2 17:48 /data/amandeep/nih-dataset/organization/temp/canonical.csv


In [9]:
def split(f_path, output_path, n_splits=10):
    """Split a canonical CSV into per-column chunks written as separate CSVs.

    The file at ``f_path`` is grouped by its ``column`` field; each group is
    cut into ``n_splits`` contiguous, near-equal pieces and every piece is
    written to ``{output_path}/split_{column}_{i}.csv`` (without the index).
    Groups with fewer than ``n_splits`` rows still produce ``n_splits`` files;
    the surplus ones contain only the header.

    Args:
        f_path: path of the CSV to read; must contain a ``column`` field.
        output_path: directory receiving the chunk files (created if missing).
        n_splits: number of chunks per group (default 10).
    """
    df = pd.read_csv(f_path)
    os.makedirs(output_path, exist_ok=True)
    # Group by the bare column name rather than a one-element list: with a
    # list, recent pandas yields tuple keys and the file names would become
    # "split_(0,)_0.csv".
    for column, gdf in df.groupby('column'):
        print(column, len(gdf))
        # Split row positions instead of the frame itself; np.array_split on
        # a DataFrame is deprecated, while the resulting chunks are identical.
        position_chunks = np.array_split(np.arange(len(gdf)), n_splits)
        for i, positions in enumerate(position_chunks):
            gdf.iloc[positions].to_csv(f'{output_path}/split_{column}_{i}.csv', index=False)

In [10]:
# Write the per-column chunks under HOME_DIR instead of repeating the
# absolute path; this resolves to the same "nih-split" directory.
split(canonical_file_path, f'{HOME_DIR}/nih-split')

0 6025
1 6025
2 6025
3 6025


# Candidate Generation

In [11]:
def candidate_generation(path, output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    file_list = glob.glob(path + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        st = time.time()
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        output_file = f"{output_path}/{filename}"
        
        !tl clean -c label -o label_clean $file / \
        --url http://ckg07:9200 --index wikidatadwd-augmented \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        --url http://ckg07:9200 --index wikidatadwd-augmented \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" > "$output_file"
        
        for field in aux_field.split(','):
            aux_list = []
            for f in glob.glob(f'{temp_dir}/*{field}.tsv'):
                aux_list.append(pd.read_csv(f, sep='\t', dtype=object))
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            if field == 'class_count':
                class_count_file = f"{class_count_path}/{filename[:-4]}_class_count.tsv"
                aux_df.to_csv(class_count_file, sep='\t', index=False)
            elif field == 'property_count':
                prop_count_file = f"{prop_count_path}/{filename[:-4]}_prop_count.tsv"
                aux_df.to_csv(prop_count_file, sep='\t', index=False)
            elif field == 'context':
                context_file = f"{context_path}/{filename[:-4]}_context.tsv"
                aux_df.to_csv(context_file, sep='\t', index=False)
            else:
                graph_embedding_file = f"{graph_embedding}/{filename[:-4]}_graph_embedding_complex.tsv"
                aux_df.to_csv(graph_embedding_file, sep='\t', index=False)

In [12]:
# Working directories for candidate generation, all derived from HOME_DIR so
# they follow the configuration cell (the values are unchanged).
input_path = f'{HOME_DIR}/nih-split'
output_path = f'{HOME_DIR}/candidates'

class_count_path = f'{HOME_DIR}/temp/class_c'
prop_count_path = f'{HOME_DIR}/temp/prop_c'
context_path = f'{HOME_DIR}/temp/context'
graph_embedding = f'{HOME_DIR}/temp/ge'

# Equivalent to `mkdir -p` for each output directory, without shelling out.
for _dir in (output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    os.makedirs(_dir, exist_ok=True)

In [13]:
# Generate candidates for every split file in input_path; the auxiliary
# files go to the per-field directories created in the previous cell.
candidate_generation(input_path, output_path, class_count_path, prop_count_path, context_path, graph_embedding)

0it [00:00, ?it/s]

split_0_0.csv: 1 of 40
clean Time: 0.03534221649169922s
get-fuzzy-augmented-matches Time: 44.592395544052124s
get-exact-matches Time: 3.4459550380706787s


1it [00:58, 58.83s/it]

split_0_1.csv: 2 of 40
clean Time: 0.03396940231323242s
get-fuzzy-augmented-matches Time: 43.86876440048218s
get-exact-matches Time: 3.6438634395599365s


2it [01:56, 58.31s/it]

split_0_2.csv: 3 of 40
clean Time: 0.0399777889251709s
get-fuzzy-augmented-matches Time: 43.40606880187988s
get-exact-matches Time: 3.4931576251983643s


3it [02:54, 57.99s/it]

split_0_3.csv: 4 of 40
clean Time: 0.03672909736633301s
get-fuzzy-augmented-matches Time: 40.4506721496582s
get-exact-matches Time: 3.3564834594726562s


4it [03:48, 56.61s/it]

split_0_4.csv: 5 of 40
clean Time: 0.02532649040222168s
get-fuzzy-augmented-matches Time: 40.627663373947144s
get-exact-matches Time: 3.366626024246216s


5it [04:43, 55.72s/it]

split_0_5.csv: 6 of 40
clean Time: 0.03431987762451172s
get-fuzzy-augmented-matches Time: 39.46259140968323s
get-exact-matches Time: 3.4717140197753906s


6it [05:36, 54.89s/it]

split_0_6.csv: 7 of 40
clean Time: 0.033527374267578125s
get-fuzzy-augmented-matches Time: 37.73267960548401s
get-exact-matches Time: 3.349005699157715s


7it [06:27, 53.56s/it]

split_0_7.csv: 8 of 40
clean Time: 0.03268003463745117s
get-fuzzy-augmented-matches Time: 38.122777462005615s
get-exact-matches Time: 3.3154449462890625s


8it [07:18, 53.02s/it]

split_0_8.csv: 9 of 40
clean Time: 0.040342092514038086s
get-fuzzy-augmented-matches Time: 41.05448532104492s
get-exact-matches Time: 3.2510900497436523s


9it [08:13, 53.59s/it]

split_0_9.csv: 10 of 40
clean Time: 0.04243946075439453s
get-fuzzy-augmented-matches Time: 38.77289056777954s
get-exact-matches Time: 3.4492833614349365s


10it [09:06, 53.30s/it]

split_1_0.csv: 11 of 40
clean Time: 0.03442263603210449s
get-fuzzy-augmented-matches Time: 24.685161113739014s
get-exact-matches Time: 3.703518867492676s


11it [09:48, 49.79s/it]

split_1_1.csv: 12 of 40
clean Time: 0.025600671768188477s
get-fuzzy-augmented-matches Time: 25.4673433303833s
get-exact-matches Time: 4.2526116371154785s


12it [10:30, 47.61s/it]

split_1_2.csv: 13 of 40
clean Time: 0.025448083877563477s
get-fuzzy-augmented-matches Time: 25.372114181518555s
get-exact-matches Time: 4.217370271682739s


13it [11:13, 45.94s/it]

split_1_3.csv: 14 of 40
clean Time: 0.02054762840270996s
get-fuzzy-augmented-matches Time: 24.584006309509277s
get-exact-matches Time: 4.056403160095215s


14it [11:55, 44.94s/it]

split_1_4.csv: 15 of 40
clean Time: 0.03737831115722656s
get-fuzzy-augmented-matches Time: 26.66512441635132s
get-exact-matches Time: 4.6266539096832275s


15it [12:39, 44.75s/it]

split_1_5.csv: 16 of 40
clean Time: 0.025320053100585938s
get-fuzzy-augmented-matches Time: 25.35082244873047s
get-exact-matches Time: 4.280551195144653s


16it [13:23, 44.41s/it]

split_1_6.csv: 17 of 40
clean Time: 0.02686619758605957s
get-fuzzy-augmented-matches Time: 25.074878454208374s
get-exact-matches Time: 4.174172401428223s


17it [14:06, 44.08s/it]

split_1_7.csv: 18 of 40
clean Time: 0.028959035873413086s
get-fuzzy-augmented-matches Time: 24.699471950531006s
get-exact-matches Time: 3.7183055877685547s


18it [14:48, 43.30s/it]

split_1_8.csv: 19 of 40
clean Time: 0.02055954933166504s
get-fuzzy-augmented-matches Time: 23.98292827606201s
get-exact-matches Time: 3.843311071395874s


19it [15:28, 42.24s/it]

split_1_9.csv: 20 of 40
clean Time: 0.025316238403320312s
get-fuzzy-augmented-matches Time: 26.953267097473145s
get-exact-matches Time: 4.234764814376831s


20it [16:13, 43.28s/it]

split_2_0.csv: 21 of 40
clean Time: 0.044812917709350586s
get-fuzzy-augmented-matches Time: 11.02382779121399s
get-exact-matches Time: 2.548438787460327s


21it [16:35, 36.77s/it]

split_2_1.csv: 22 of 40
clean Time: 0.017563581466674805s
get-fuzzy-augmented-matches Time: 9.52591586112976s
get-exact-matches Time: 2.229552984237671s


22it [16:55, 31.80s/it]

split_2_2.csv: 23 of 40
clean Time: 0.02166271209716797s
get-fuzzy-augmented-matches Time: 9.78259015083313s
get-exact-matches Time: 2.5638649463653564s


23it [17:15, 28.34s/it]

split_2_3.csv: 24 of 40
clean Time: 0.02128124237060547s
get-fuzzy-augmented-matches Time: 8.782490968704224s
get-exact-matches Time: 2.4410672187805176s


24it [17:35, 25.56s/it]

split_2_4.csv: 25 of 40
clean Time: 0.017127275466918945s
get-fuzzy-augmented-matches Time: 9.017938137054443s
get-exact-matches Time: 2.5303893089294434s


25it [17:54, 23.72s/it]

split_2_5.csv: 26 of 40
clean Time: 0.021183490753173828s
get-fuzzy-augmented-matches Time: 9.130558729171753s
get-exact-matches Time: 2.5966856479644775s


26it [18:14, 22.53s/it]

split_2_6.csv: 27 of 40
clean Time: 0.03141283988952637s
get-fuzzy-augmented-matches Time: 9.113900184631348s
get-exact-matches Time: 2.6176910400390625s


27it [18:34, 21.80s/it]

split_2_7.csv: 28 of 40
clean Time: 0.021398305892944336s
get-fuzzy-augmented-matches Time: 9.244600296020508s
get-exact-matches Time: 2.516685724258423s


28it [18:53, 21.07s/it]

split_2_8.csv: 29 of 40
clean Time: 0.02190399169921875s
get-fuzzy-augmented-matches Time: 8.66924262046814s
get-exact-matches Time: 2.149139165878296s


29it [19:12, 20.40s/it]

split_2_9.csv: 30 of 40
clean Time: 0.021970272064208984s
get-fuzzy-augmented-matches Time: 8.855459451675415s
get-exact-matches Time: 2.464099645614624s


30it [19:31, 20.12s/it]

split_3_0.csv: 31 of 40
clean Time: 0.027716636657714844s
get-fuzzy-augmented-matches Time: 5.76170539855957s
get-exact-matches Time: 1.264664888381958s


31it [19:47, 18.66s/it]

split_3_1.csv: 32 of 40
clean Time: 0.027205944061279297s
get-fuzzy-augmented-matches Time: 7.109940767288208s
get-exact-matches Time: 1.4206857681274414s


32it [20:04, 18.21s/it]

split_3_2.csv: 33 of 40
clean Time: 0.0270082950592041s
get-fuzzy-augmented-matches Time: 6.5825512409210205s
get-exact-matches Time: 1.4887146949768066s


33it [20:21, 17.79s/it]

split_3_3.csv: 34 of 40
clean Time: 0.02746415138244629s
get-fuzzy-augmented-matches Time: 7.3330769538879395s
get-exact-matches Time: 1.473851203918457s


34it [20:38, 17.67s/it]

split_3_4.csv: 35 of 40
clean Time: 0.034212589263916016s
get-fuzzy-augmented-matches Time: 6.432801008224487s
get-exact-matches Time: 1.3213093280792236s


35it [20:54, 17.04s/it]

split_3_5.csv: 36 of 40
clean Time: 0.03864479064941406s
get-fuzzy-augmented-matches Time: 6.272070646286011s
get-exact-matches Time: 1.4125361442565918s


36it [21:10, 16.92s/it]

split_3_6.csv: 37 of 40
clean Time: 0.031871795654296875s
get-fuzzy-augmented-matches Time: 6.169744491577148s
get-exact-matches Time: 1.2801506519317627s


37it [21:26, 16.52s/it]

split_3_7.csv: 38 of 40
clean Time: 0.02163529396057129s
get-fuzzy-augmented-matches Time: 6.43291449546814s
get-exact-matches Time: 1.7276320457458496s


38it [21:42, 16.52s/it]

split_3_8.csv: 39 of 40
clean Time: 0.02732062339782715s
get-fuzzy-augmented-matches Time: 8.29240608215332s
get-exact-matches Time: 1.5062415599822998s


39it [22:01, 17.15s/it]

split_3_9.csv: 40 of 40
clean Time: 0.026815176010131836s
get-fuzzy-augmented-matches Time: 7.332457542419434s
get-exact-matches Time: 1.5170671939849854s


40it [22:18, 33.47s/it]


In [14]:
# Directory that receives one feature CSV per candidate file.
features_path = '/data/amandeep/nih-dataset/organization/features'
!mkdir -p $features_path
# Joined with commas and passed to `vote-by-classifier --features` in feature_generation.
classifier_features= ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']

In [15]:
def feature_generation(candidate_dir, embedding_dir, class_count_dir, property_count_dir, context_path, output_path):
    file_list = glob.glob(candidate_dir + '/*.csv')
    for i, file in tqdm(enumerate(file_list)):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        embedding_file = f"{embedding_dir}/{filename[:-4]}_graph_embedding_complex.tsv"
        class_count_file = f"{class_count_dir}/{filename[:-4]}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename[:-4]}_prop_count.tsv"
        context_file = f"{context_path}/{filename[:-4]}_context.tsv"
        output_file = f"{output_path}/{filename}"
        if os.path.getsize(file) == 0:
                continue
        classifier_features_str = ",".join(classifier_features)
        !tl align-page-rank "$file" \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaro_winkler -o jaro_winkler \
            / string-similarity -i --method levenshtein -o levenshtein \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / normalize-scores -c des_cont_jaccard / smallest-qnode-number \
            / mosaic-features -c kg_labels --num-char --num-tokens \
            / create-singleton-feature -o singleton \
            / vote-by-classifier  \
            --prob-threshold 0.995 \
            --features "$classifier_features_str" \
            --model "$model_file_path" \
            / score-using-embedding \
            --column-vector-strategy centroid-of-lof \
            --lof-strategy ems-mv \
            -o lof-graph-embedding-score \
            --embedding-file "$embedding_file" \
            / generate-reciprocal-rank  \
            -c lof-graph-embedding-score \
            -o lof-reciprocal-rank \
            / compute-tf-idf  \
            --feature-file "$class_count_file" \
            --feature-name class_count \
            --singleton-column is_lof \
            -o lof_class_count_tf_idf_score \
            / compute-tf-idf \
            --feature-file "$property_count_file" \
            --feature-name property_count \
            --singleton-column is_lof \
            -o lof_property_count_tf_idf_score   > "$output_file"

In [16]:
# Compute features for every candidate file: `output_path` (the candidates
# directory) is the input here and results are written to features_path.
feature_generation(output_path, graph_embedding, class_count_path, prop_count_path, context_path, features_path)

0it [00:00, ?it/s]

split_0_0.csv: 1 of 40
align-page-rank Time: 1.0750133991241455s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 34.22521376609802s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 60.546825885772705s
string-similarity-['jaro_winkler'] Time: 5.3626708984375s
string-similarity-['levenshtein'] Time: 42.36357617378235s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8475470542907715s
normalize-scores-des_cont_jaccard Time: 0.24726581573486328s
smallest-qnode-number Time: 1.79902982711792s
mosaic-features Time: 0.11835169792175293s
create-singleton-feature Time: 1.5949225425720215s
vote-by-classifier Time: 1.3775908946990967s
Qnodes to lookup: 35492
Qnodes from file: 33985
_centroid_of_lof: Missing 30 of 814
Outlier removal generates 470 lof-voted candidates
score-using-embedding Time: 173.3693995475769s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.8407704830169678s
compute-tf-idf-class_count Time: 179.989426612854s
compute-tf-idf-propert

1it [03:07, 187.89s/it]

split_0_1.csv: 2 of 40
align-page-rank Time: 0.9559783935546875s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.57402729988098s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 53.09959959983826s
string-similarity-['jaro_winkler'] Time: 5.0380518436431885s
string-similarity-['levenshtein'] Time: 35.161452293395996s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8215985298156738s
normalize-scores-des_cont_jaccard Time: 0.24749255180358887s
smallest-qnode-number Time: 1.7863819599151611s
mosaic-features Time: 0.11316394805908203s
create-singleton-feature Time: 1.3897294998168945s
vote-by-classifier Time: 0.7831606864929199s
Qnodes to lookup: 34635
Qnodes from file: 32936
_centroid_of_lof: Missing 102 of 1093
Outlier removal generates 595 lof-voted candidates
score-using-embedding Time: 147.37474513053894s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9881770610809326s
compute-tf-idf-class_count Time: 153.34168076515198s
compute-tf-i

2it [05:49, 172.31s/it]

split_0_2.csv: 3 of 40
align-page-rank Time: 1.070600986480713s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 23.103946208953857s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 42.59343600273132s
string-similarity-['jaro_winkler'] Time: 4.423543930053711s
string-similarity-['levenshtein'] Time: 30.491483449935913s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8446979522705078s
normalize-scores-des_cont_jaccard Time: 0.24195575714111328s
smallest-qnode-number Time: 1.7519803047180176s
mosaic-features Time: 0.17167115211486816s
create-singleton-feature Time: 1.4880199432373047s
vote-by-classifier Time: 0.6990902423858643s
Qnodes to lookup: 35248
Qnodes from file: 33157
_centroid_of_lof: Missing 120 of 1197
Outlier removal generates 646 lof-voted candidates
score-using-embedding Time: 127.51598525047302s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.7980434894561768s
compute-tf-idf-class_count Time: 133.2657175064087s
compute-tf-idf

3it [08:10, 158.19s/it]

split_0_3.csv: 4 of 40
align-page-rank Time: 0.9680442810058594s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 21.39000701904297s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 41.134994983673096s
string-similarity-['jaro_winkler'] Time: 4.150817155838013s
string-similarity-['levenshtein'] Time: 28.357872009277344s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8551793098449707s
normalize-scores-des_cont_jaccard Time: 0.23734736442565918s
smallest-qnode-number Time: 2.034531354904175s
mosaic-features Time: 0.11067533493041992s
create-singleton-feature Time: 1.3285672664642334s
vote-by-classifier Time: 0.6594352722167969s
Qnodes to lookup: 33091
Qnodes from file: 31188
_centroid_of_lof: Missing 101 of 1115
Outlier removal generates 608 lof-voted candidates
score-using-embedding Time: 121.26210594177246s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.8329689502716064s
compute-tf-idf-class_count Time: 127.75598788261414s
compute-tf-id

4it [10:26, 149.18s/it]

split_0_4.csv: 5 of 40
align-page-rank Time: 0.9781455993652344s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 21.20128846168518s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 39.52588653564453s
string-similarity-['jaro_winkler'] Time: 4.184878587722778s
string-similarity-['levenshtein'] Time: 27.106751203536987s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8559632301330566s
normalize-scores-des_cont_jaccard Time: 0.2433943748474121s
smallest-qnode-number Time: 1.778841495513916s
mosaic-features Time: 0.11530923843383789s
create-singleton-feature Time: 1.3286466598510742s
vote-by-classifier Time: 0.6325466632843018s
Qnodes to lookup: 34832
Qnodes from file: 33084
_centroid_of_lof: Missing 85 of 899
Outlier removal generates 488 lof-voted candidates
score-using-embedding Time: 117.82135605812073s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.7847495079040527s
compute-tf-idf-class_count Time: 123.36376118659973s
compute-tf-idf-pr

5it [12:37, 142.69s/it]

split_0_5.csv: 6 of 40
align-page-rank Time: 0.9485666751861572s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.44751787185669s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 40.27610158920288s
string-similarity-['jaro_winkler'] Time: 4.276766061782837s
string-similarity-['levenshtein'] Time: 27.562937259674072s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8334810733795166s
normalize-scores-des_cont_jaccard Time: 0.24446892738342285s
smallest-qnode-number Time: 1.7967398166656494s
mosaic-features Time: 0.11156368255615234s
create-singleton-feature Time: 1.3756847381591797s
vote-by-classifier Time: 0.8104357719421387s
Qnodes to lookup: 32574
Qnodes from file: 30737
_centroid_of_lof: Missing 75 of 859
Outlier removal generates 470 lof-voted candidates
score-using-embedding Time: 118.63994431495667s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.7971489429473877s
compute-tf-idf-class_count Time: 124.51496648788452s
compute-tf-idf-

6it [14:50, 139.34s/it]

split_0_6.csv: 7 of 40
align-page-rank Time: 1.0844645500183105s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 18.850862979888916s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 36.245558738708496s
string-similarity-['jaro_winkler'] Time: 3.8333394527435303s
string-similarity-['levenshtein'] Time: 25.920488834381104s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8268592357635498s
normalize-scores-des_cont_jaccard Time: 0.23776888847351074s
smallest-qnode-number Time: 1.7251200675964355s
mosaic-features Time: 0.11553478240966797s
create-singleton-feature Time: 1.5912635326385498s
vote-by-classifier Time: 0.730623722076416s
Qnodes to lookup: 31939
Qnodes from file: 30294
_centroid_of_lof: Missing 67 of 654
Outlier removal generates 352 lof-voted candidates
score-using-embedding Time: 110.799156665802s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9023318290710449s
compute-tf-idf-class_count Time: 116.30188608169556s
compute-tf-idf-

7it [16:54, 134.42s/it]

split_0_7.csv: 8 of 40
align-page-rank Time: 0.9542801380157471s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 19.481122493743896s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 36.679057598114014s
string-similarity-['jaro_winkler'] Time: 3.906893253326416s
string-similarity-['levenshtein'] Time: 25.867220878601074s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8342502117156982s
normalize-scores-des_cont_jaccard Time: 0.2394239902496338s
smallest-qnode-number Time: 1.7721357345581055s
mosaic-features Time: 0.15809845924377441s
create-singleton-feature Time: 1.2859244346618652s
vote-by-classifier Time: 0.7135286331176758s
Qnodes to lookup: 32346
Qnodes from file: 30440
_centroid_of_lof: Missing 85 of 835
Outlier removal generates 450 lof-voted candidates
score-using-embedding Time: 112.17212462425232s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.8177313804626465s
compute-tf-idf-class_count Time: 117.95547938346863s
compute-tf-idf

8it [19:00, 131.70s/it]

split_0_8.csv: 9 of 40
align-page-rank Time: 0.9755654335021973s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 25.655643701553345s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 44.65043377876282s
string-similarity-['jaro_winkler'] Time: 4.537208557128906s
string-similarity-['levenshtein'] Time: 32.56257343292236s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8307263851165771s
normalize-scores-des_cont_jaccard Time: 0.23870229721069336s
smallest-qnode-number Time: 1.780503749847412s
mosaic-features Time: 0.11252784729003906s
create-singleton-feature Time: 1.2868874073028564s
vote-by-classifier Time: 0.592646598815918s
Qnodes to lookup: 33992
Qnodes from file: 32075
_centroid_of_lof: Missing 542 of 3490
Outlier removal generates 1769 lof-voted candidates
score-using-embedding Time: 133.81810855865479s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.8177416324615479s
compute-tf-idf-class_count Time: 139.7933099269867s
compute-tf-idf-

9it [21:28, 136.81s/it]

split_0_9.csv: 10 of 40
align-page-rank Time: 0.9385063648223877s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 21.71793031692505s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 39.583229303359985s
string-similarity-['jaro_winkler'] Time: 4.22122049331665s
string-similarity-['levenshtein'] Time: 29.647648096084595s
string-similarity-['jaccard:tokenizer=word'] Time: 0.8540670871734619s
normalize-scores-des_cont_jaccard Time: 0.23522090911865234s
smallest-qnode-number Time: 1.7117280960083008s
mosaic-features Time: 0.10944533348083496s
create-singleton-feature Time: 1.3033437728881836s
vote-by-classifier Time: 0.6794722080230713s
Qnodes to lookup: 33516
Qnodes from file: 31447
_centroid_of_lof: Missing 303 of 2034
Outlier removal generates 1039 lof-voted candidates
score-using-embedding Time: 120.75607991218567s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.8166201114654541s
compute-tf-idf-class_count Time: 126.20448803901672s
compute-tf-

10it [23:42, 136.04s/it]

split_1_0.csv: 11 of 40
align-page-rank Time: 1.0573222637176514s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 18.31800413131714s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.886035442352295s
string-similarity-['jaro_winkler'] Time: 5.283450603485107s
string-similarity-['levenshtein'] Time: 19.450575828552246s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5187363624572754s
normalize-scores-des_cont_jaccard Time: 0.433239221572876s
smallest-qnode-number Time: 1.9124846458435059s
mosaic-features Time: 0.20587730407714844s
create-singleton-feature Time: 2.334709644317627s
vote-by-classifier Time: 0.665135383605957s
Qnodes to lookup: 46316
Qnodes from file: 45394
_centroid_of_lof: Missing 51 of 1234
Outlier removal generates 710 lof-voted candidates
score-using-embedding Time: 112.89667248725891s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.0838372707366943s
compute-tf-idf-class_count Time: 123.73420286178589s
compute-tf-idf-p

11it [25:57, 135.69s/it]

split_1_1.csv: 12 of 40
align-page-rank Time: 0.9967143535614014s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.2237446308136s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 26.921199798583984s
string-similarity-['jaro_winkler'] Time: 4.652204513549805s
string-similarity-['levenshtein'] Time: 19.140451431274414s
string-similarity-['jaccard:tokenizer=word'] Time: 1.447845458984375s
normalize-scores-des_cont_jaccard Time: 0.4280102252960205s
smallest-qnode-number Time: 1.9291799068450928s
mosaic-features Time: 0.1907360553741455s
create-singleton-feature Time: 2.2093942165374756s
vote-by-classifier Time: 0.8305025100708008s
Qnodes to lookup: 52571
Qnodes from file: 51509
_centroid_of_lof: Missing 107 of 1679
Outlier removal generates 943 lof-voted candidates
score-using-embedding Time: 109.11089730262756s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9560253620147705s
compute-tf-idf-class_count Time: 119.43995976448059s
compute-tf-idf-

12it [28:07, 133.94s/it]

split_1_2.csv: 13 of 40
align-page-rank Time: 0.9912090301513672s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.4128155708313s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.099634408950806s
string-similarity-['jaro_winkler'] Time: 4.7454962730407715s
string-similarity-['levenshtein'] Time: 18.590408086776733s
string-similarity-['jaccard:tokenizer=word'] Time: 1.442460060119629s
normalize-scores-des_cont_jaccard Time: 0.41457104682922363s
smallest-qnode-number Time: 2.1308350563049316s
mosaic-features Time: 0.19502997398376465s
create-singleton-feature Time: 2.117342948913574s
vote-by-classifier Time: 0.7718424797058105s
Qnodes to lookup: 49648
Qnodes from file: 48779
_centroid_of_lof: Missing 83 of 1955
Outlier removal generates 1123 lof-voted candidates
score-using-embedding Time: 107.50864863395691s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.181166648864746s
compute-tf-idf-class_count Time: 117.99520778656006s
compute-tf-idf

13it [30:15, 132.04s/it]

split_1_3.csv: 14 of 40
align-page-rank Time: 1.0083098411560059s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 17.265345811843872s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.513655424118042s
string-similarity-['jaro_winkler'] Time: 4.859482765197754s
string-similarity-['levenshtein'] Time: 18.93462896347046s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4107532501220703s
normalize-scores-des_cont_jaccard Time: 0.4056735038757324s
smallest-qnode-number Time: 2.1586766242980957s
mosaic-features Time: 0.19090795516967773s
create-singleton-feature Time: 2.221668243408203s
vote-by-classifier Time: 0.8132326602935791s
Qnodes to lookup: 49692
Qnodes from file: 48775
_centroid_of_lof: Missing 75 of 1875
Outlier removal generates 1080 lof-voted candidates
score-using-embedding Time: 108.7851243019104s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9716954231262207s
compute-tf-idf-class_count Time: 118.70809864997864s
compute-tf-idf

14it [32:24, 131.11s/it]

split_1_4.csv: 15 of 40
align-page-rank Time: 0.9710037708282471s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 15.982155799865723s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 26.879578113555908s
string-similarity-['jaro_winkler'] Time: 5.38646388053894s
string-similarity-['levenshtein'] Time: 19.012408018112183s
string-similarity-['jaccard:tokenizer=word'] Time: 1.464144229888916s
normalize-scores-des_cont_jaccard Time: 0.40741968154907227s
smallest-qnode-number Time: 2.1949641704559326s
mosaic-features Time: 0.19363856315612793s
create-singleton-feature Time: 2.202255964279175s
vote-by-classifier Time: 0.8198893070220947s
Qnodes to lookup: 52650
Qnodes from file: 51640
_centroid_of_lof: Missing 81 of 1728
Outlier removal generates 988 lof-voted candidates
score-using-embedding Time: 108.48448348045349s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9495508670806885s
compute-tf-idf-class_count Time: 118.00812673568726s
compute-tf-idf

15it [34:33, 130.52s/it]

split_1_5.csv: 16 of 40
align-page-rank Time: 0.9757528305053711s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.285701036453247s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.736695289611816s
string-similarity-['jaro_winkler'] Time: 4.661637306213379s
string-similarity-['levenshtein'] Time: 17.938193321228027s
string-similarity-['jaccard:tokenizer=word'] Time: 1.4134094715118408s
normalize-scores-des_cont_jaccard Time: 0.4119267463684082s
smallest-qnode-number Time: 1.9306418895721436s
mosaic-features Time: 0.19400525093078613s
create-singleton-feature Time: 2.2346317768096924s
vote-by-classifier Time: 0.8670575618743896s
Qnodes to lookup: 51939
Qnodes from file: 50996
_centroid_of_lof: Missing 100 of 1690
Outlier removal generates 954 lof-voted candidates
score-using-embedding Time: 106.82111072540283s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9573066234588623s
compute-tf-idf-class_count Time: 116.19929814338684s
compute-tf-

16it [36:39, 129.22s/it]

split_1_6.csv: 17 of 40
align-page-rank Time: 1.01318359375s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.055179834365845s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 26.911964893341064s
string-similarity-['jaro_winkler'] Time: 4.573680639266968s
string-similarity-['levenshtein'] Time: 17.504245281219482s
string-similarity-['jaccard:tokenizer=word'] Time: 1.396026611328125s
normalize-scores-des_cont_jaccard Time: 0.39748120307922363s
smallest-qnode-number Time: 1.888335943222046s
mosaic-features Time: 0.18941617012023926s
create-singleton-feature Time: 2.179323196411133s
vote-by-classifier Time: 0.8543825149536133s
Qnodes to lookup: 48906
Qnodes from file: 47960
_centroid_of_lof: Missing 58 of 1339
Outlier removal generates 769 lof-voted candidates
score-using-embedding Time: 104.118901014328s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.977102518081665s
compute-tf-idf-class_count Time: 113.91616725921631s
compute-tf-idf-propert

17it [38:43, 127.73s/it]

split_1_7.csv: 18 of 40
align-page-rank Time: 0.9882001876831055s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 15.869466304779053s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.015326261520386s
string-similarity-['jaro_winkler'] Time: 4.847265005111694s
string-similarity-['levenshtein'] Time: 17.750433921813965s
string-similarity-['jaccard:tokenizer=word'] Time: 1.643383264541626s
normalize-scores-des_cont_jaccard Time: 0.46599650382995605s
smallest-qnode-number Time: 2.453918933868408s
mosaic-features Time: 0.19055867195129395s
create-singleton-feature Time: 2.03989577293396s
vote-by-classifier Time: 0.8721237182617188s
Qnodes to lookup: 48616
Qnodes from file: 47708
_centroid_of_lof: Missing 75 of 1367
Outlier removal generates 775 lof-voted candidates
score-using-embedding Time: 105.83360362052917s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9575884342193604s
compute-tf-idf-class_count Time: 114.380539894104s
compute-tf-idf-pr

18it [40:48, 126.95s/it]

split_1_8.csv: 19 of 40
align-page-rank Time: 0.9867458343505859s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 15.925240755081177s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 27.387571811676025s
string-similarity-['jaro_winkler'] Time: 4.5482141971588135s
string-similarity-['levenshtein'] Time: 17.913103342056274s
string-similarity-['jaccard:tokenizer=word'] Time: 1.3873648643493652s
normalize-scores-des_cont_jaccard Time: 0.4158172607421875s
smallest-qnode-number Time: 2.493894338607788s
mosaic-features Time: 0.18622827529907227s
create-singleton-feature Time: 2.1718382835388184s
vote-by-classifier Time: 0.9146573543548584s
Qnodes to lookup: 48078
Qnodes from file: 47047
_centroid_of_lof: Missing 411 of 4774
Outlier removal generates 2619 lof-voted candidates
score-using-embedding Time: 106.95807361602783s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.0321979522705078s
compute-tf-idf-class_count Time: 116.56032609939575s
compute-tf

19it [42:55, 126.99s/it]

split_1_9.csv: 20 of 40
align-page-rank Time: 0.9692437648773193s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.567383766174316s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 26.56819438934326s
string-similarity-['jaro_winkler'] Time: 5.3401405811309814s
string-similarity-['levenshtein'] Time: 18.509801626205444s
string-similarity-['jaccard:tokenizer=word'] Time: 1.430771827697754s
normalize-scores-des_cont_jaccard Time: 0.4224414825439453s
smallest-qnode-number Time: 1.9090683460235596s
mosaic-features Time: 0.1721198558807373s
create-singleton-feature Time: 2.475177526473999s
vote-by-classifier Time: 0.8406307697296143s
Qnodes to lookup: 53874
Qnodes from file: 52826
_centroid_of_lof: Missing 222 of 3041
Outlier removal generates 1691 lof-voted candidates
score-using-embedding Time: 108.83278179168701s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9629831314086914s
compute-tf-idf-class_count Time: 118.83333039283752s
compute-tf-id

20it [45:05, 127.90s/it]

split_2_0.csv: 21 of 40
align-page-rank Time: 0.9773943424224854s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.372907161712646s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.85562562942505s
string-similarity-['jaro_winkler'] Time: 3.6814422607421875s
string-similarity-['levenshtein'] Time: 8.088920593261719s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5912425518035889s
normalize-scores-des_cont_jaccard Time: 0.4352710247039795s
smallest-qnode-number Time: 2.107914686203003s
mosaic-features Time: 0.2822556495666504s
create-singleton-feature Time: 2.9430036544799805s
vote-by-classifier Time: 0.8968322277069092s
Qnodes to lookup: 9982
Qnodes from file: 9715
Outlier removal generates 298 lof-voted candidates
score-using-embedding Time: 86.42095184326172s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.1488852500915527s
compute-tf-idf-class_count Time: 96.19055795669556s
compute-tf-idf-property_count Time: 100.71248459815979s


21it [46:52, 121.60s/it]

split_2_1.csv: 22 of 40
align-page-rank Time: 1.027571439743042s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.489678621292114s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 19.743183851242065s
string-similarity-['jaro_winkler'] Time: 3.6698548793792725s
string-similarity-['levenshtein'] Time: 7.7189905643463135s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5717785358428955s
normalize-scores-des_cont_jaccard Time: 0.40588974952697754s
smallest-qnode-number Time: 2.301467180252075s
mosaic-features Time: 0.2145547866821289s
create-singleton-feature Time: 2.8144824504852295s
vote-by-classifier Time: 0.7012887001037598s
Qnodes to lookup: 10359
Qnodes from file: 10077
Outlier removal generates 431 lof-voted candidates
score-using-embedding Time: 83.4658727645874s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9879591464996338s
compute-tf-idf-class_count Time: 92.82403898239136s
compute-tf-idf-property_count Time: 97.53423738479614

22it [48:36, 116.19s/it]

split_2_2.csv: 23 of 40
align-page-rank Time: 1.007856845855713s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.298721551895142s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.2051043510437s
string-similarity-['jaro_winkler'] Time: 3.92425537109375s
string-similarity-['levenshtein'] Time: 7.636511325836182s
string-similarity-['jaccard:tokenizer=word'] Time: 1.773076057434082s
normalize-scores-des_cont_jaccard Time: 0.4129362106323242s
smallest-qnode-number Time: 2.0393261909484863s
mosaic-features Time: 0.21813607215881348s
create-singleton-feature Time: 3.026293992996216s
vote-by-classifier Time: 0.7742204666137695s
Qnodes to lookup: 10432
Qnodes from file: 10147
Outlier removal generates 244 lof-voted candidates
score-using-embedding Time: 83.7257878780365s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9686839580535889s
compute-tf-idf-class_count Time: 93.07697892189026s
compute-tf-idf-property_count Time: 97.52963876724243s


23it [50:19, 112.38s/it]

split_2_3.csv: 24 of 40
align-page-rank Time: 0.9869875907897949s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.396312236785889s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 19.645426750183105s
string-similarity-['jaro_winkler'] Time: 3.5934629440307617s
string-similarity-['levenshtein'] Time: 7.673677682876587s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5204963684082031s
normalize-scores-des_cont_jaccard Time: 0.4204103946685791s
smallest-qnode-number Time: 1.9214038848876953s
mosaic-features Time: 0.21980929374694824s
create-singleton-feature Time: 2.490551710128784s
vote-by-classifier Time: 0.756756067276001s
Qnodes to lookup: 9069
Qnodes from file: 8815
Outlier removal generates 374 lof-voted candidates
score-using-embedding Time: 82.23032450675964s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9849913120269775s
compute-tf-idf-class_count Time: 91.77732872962952s
compute-tf-idf-property_count Time: 96.06878280639648s


24it [52:02, 109.33s/it]

split_2_4.csv: 25 of 40
align-page-rank Time: 1.0070891380310059s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.36080551147461s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.244444608688354s
string-similarity-['jaro_winkler'] Time: 3.6728451251983643s
string-similarity-['levenshtein'] Time: 7.906152963638306s
string-similarity-['jaccard:tokenizer=word'] Time: 1.537628412246704s
normalize-scores-des_cont_jaccard Time: 0.4151942729949951s
smallest-qnode-number Time: 1.9152886867523193s
mosaic-features Time: 0.2214653491973877s
create-singleton-feature Time: 2.5282199382781982s
vote-by-classifier Time: 0.7281317710876465s
Qnodes to lookup: 9672
Qnodes from file: 9405
Outlier removal generates 436 lof-voted candidates
score-using-embedding Time: 84.2214343547821s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.1451067924499512s
compute-tf-idf-class_count Time: 94.26185488700867s
compute-tf-idf-property_count Time: 98.82797908782959s


25it [53:47, 108.06s/it]

split_2_5.csv: 26 of 40
align-page-rank Time: 0.9982004165649414s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.354867935180664s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.223699808120728s
string-similarity-['jaro_winkler'] Time: 3.934480667114258s
string-similarity-['levenshtein'] Time: 8.531984329223633s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5378735065460205s
normalize-scores-des_cont_jaccard Time: 0.41260766983032227s
smallest-qnode-number Time: 2.0926058292388916s
mosaic-features Time: 0.22326898574829102s
create-singleton-feature Time: 2.396209716796875s
vote-by-classifier Time: 1.7350029945373535s
Qnodes to lookup: 9162
Qnodes from file: 8912
_centroid_of_lof: Missing 1 of 546
Outlier removal generates 445 lof-voted candidates
score-using-embedding Time: 85.26600432395935s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9734196662902832s
compute-tf-idf-class_count Time: 94.81391620635986s
compute-tf-idf-prop

26it [55:32, 107.19s/it]

split_2_6.csv: 27 of 40
align-page-rank Time: 1.042668104171753s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.428284168243408s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.1690411567688s
string-similarity-['jaro_winkler'] Time: 3.7859716415405273s
string-similarity-['levenshtein'] Time: 8.241723775863647s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5737419128417969s
normalize-scores-des_cont_jaccard Time: 0.42809534072875977s
smallest-qnode-number Time: 1.9183526039123535s
mosaic-features Time: 0.22122907638549805s
create-singleton-feature Time: 2.5017616748809814s
vote-by-classifier Time: 0.6486265659332275s
Qnodes to lookup: 9328
Qnodes from file: 9090
Outlier removal generates 437 lof-voted candidates
score-using-embedding Time: 83.77024173736572s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.0175490379333496s
compute-tf-idf-class_count Time: 94.10895466804504s
compute-tf-idf-property_count Time: 98.52270913124084s


27it [57:20, 107.52s/it]

split_2_7.csv: 28 of 40
align-page-rank Time: 1.0164411067962646s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 11.005845069885254s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.4557523727417s
string-similarity-['jaro_winkler'] Time: 3.7629635334014893s
string-similarity-['levenshtein'] Time: 8.41431212425232s
string-similarity-['jaccard:tokenizer=word'] Time: 1.7634539604187012s
normalize-scores-des_cont_jaccard Time: 0.4176468849182129s
smallest-qnode-number Time: 1.9592900276184082s
mosaic-features Time: 0.2642476558685303s
create-singleton-feature Time: 2.8491127490997314s
vote-by-classifier Time: 0.7009799480438232s
Qnodes to lookup: 8934
Qnodes from file: 8695
Outlier removal generates 438 lof-voted candidates
score-using-embedding Time: 86.69547390937805s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9981646537780762s
compute-tf-idf-class_count Time: 95.97393441200256s
compute-tf-idf-property_count Time: 100.56166505813599s


28it [59:07, 107.34s/it]

split_2_8.csv: 29 of 40
align-page-rank Time: 0.9828195571899414s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 8.614214897155762s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.684813737869263s
string-similarity-['jaro_winkler'] Time: 3.337986946105957s
string-similarity-['levenshtein'] Time: 6.406893253326416s
string-similarity-['jaccard:tokenizer=word'] Time: 1.2936913967132568s
normalize-scores-des_cont_jaccard Time: 0.3452444076538086s
smallest-qnode-number Time: 1.8376729488372803s
mosaic-features Time: 0.22709083557128906s
create-singleton-feature Time: 2.417585849761963s
vote-by-classifier Time: 0.8362329006195068s
Qnodes to lookup: 9788
Qnodes from file: 9532
Outlier removal generates 327 lof-voted candidates
score-using-embedding Time: 70.38600516319275s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9566497802734375s
compute-tf-idf-class_count Time: 78.31398391723633s
compute-tf-idf-property_count Time: 81.81538891792297s


29it [1:00:35, 101.44s/it]

split_2_9.csv: 30 of 40
align-page-rank Time: 1.0083086490631104s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.319494009017944s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 18.391188621520996s
string-similarity-['jaro_winkler'] Time: 3.7086360454559326s
string-similarity-['levenshtein'] Time: 7.4245171546936035s
string-similarity-['jaccard:tokenizer=word'] Time: 1.703660488128662s
normalize-scores-des_cont_jaccard Time: 0.40096521377563477s
smallest-qnode-number Time: 2.136570692062378s
mosaic-features Time: 0.20950841903686523s
create-singleton-feature Time: 2.246825695037842s
vote-by-classifier Time: 0.8680691719055176s
Qnodes to lookup: 9270
Qnodes from file: 9017
Outlier removal generates 106 lof-voted candidates
score-using-embedding Time: 79.54322075843811s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.96329665184021s
compute-tf-idf-class_count Time: 89.00326251983643s
compute-tf-idf-property_count Time: 92.75247192382812s


30it [1:02:14, 100.68s/it]

split_3_0.csv: 31 of 40
align-page-rank Time: 1.005408525466919s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 71.51389217376709s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 153.793771982193s
string-similarity-['jaro_winkler'] Time: 13.104278802871704s
string-similarity-['levenshtein'] Time: 78.44478440284729s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5445148944854736s
normalize-scores-des_cont_jaccard Time: 0.4132075309753418s
smallest-qnode-number Time: 1.889674425125122s
mosaic-features Time: 0.295076847076416s
create-singleton-feature Time: 2.553874969482422s
vote-by-classifier Time: 0.7522153854370117s
Qnodes to lookup: 2782
Qnodes from file: 2724
_centroid_of_lof: Missing 58 of 1050
Outlier removal generates 595 lof-voted candidates
score-using-embedding Time: 362.88651633262634s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9698855876922607s
compute-tf-idf-class_count Time: 372.99128556251526s
compute-tf-idf-propert

31it [1:08:38, 185.87s/it]

split_3_1.csv: 32 of 40
align-page-rank Time: 1.0876317024230957s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 69.92683219909668s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 151.65000653266907s
string-similarity-['jaro_winkler'] Time: 13.033940076828003s
string-similarity-['levenshtein'] Time: 80.05111241340637s
string-similarity-['jaccard:tokenizer=word'] Time: 1.741365909576416s
normalize-scores-des_cont_jaccard Time: 0.4036722183227539s
smallest-qnode-number Time: 1.914891004562378s
mosaic-features Time: 0.23042702674865723s
create-singleton-feature Time: 2.8283557891845703s
vote-by-classifier Time: 1.8110120296478271s
Qnodes to lookup: 4955
Qnodes from file: 4840
_centroid_of_lof: Missing 96 of 1621
Outlier removal generates 915 lof-voted candidates
score-using-embedding Time: 363.9993929862976s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9622275829315186s
compute-tf-idf-class_count Time: 374.5683045387268s
compute-tf-idf-prop

32it [1:15:04, 245.93s/it]

split_3_2.csv: 33 of 40
align-page-rank Time: 1.0458261966705322s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 68.26721858978271s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 149.9673306941986s
string-similarity-['jaro_winkler'] Time: 13.29618787765503s
string-similarity-['levenshtein'] Time: 80.79345774650574s
string-similarity-['jaccard:tokenizer=word'] Time: 1.6632800102233887s
normalize-scores-des_cont_jaccard Time: 0.4608423709869385s
smallest-qnode-number Time: 1.9873237609863281s
mosaic-features Time: 0.29924559593200684s
create-singleton-feature Time: 2.737806558609009s
vote-by-classifier Time: 2.003826379776001s
Qnodes to lookup: 4882
Qnodes from file: 4781
_centroid_of_lof: Missing 109 of 1931
Outlier removal generates 1093 lof-voted candidates
score-using-embedding Time: 361.9149696826935s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.989680290222168s
compute-tf-idf-class_count Time: 372.5349838733673s
compute-tf-idf-prope

33it [1:21:36, 289.74s/it]

split_3_3.csv: 34 of 40
align-page-rank Time: 0.970714807510376s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 68.07570433616638s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 151.56164073944092s
string-similarity-['jaro_winkler'] Time: 12.652451992034912s
string-similarity-['levenshtein'] Time: 78.57598996162415s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5696561336517334s
normalize-scores-des_cont_jaccard Time: 0.4479207992553711s
smallest-qnode-number Time: 2.2006454467773438s
mosaic-features Time: 0.23843121528625488s
create-singleton-feature Time: 2.5543575286865234s
vote-by-classifier Time: 0.789085865020752s
Qnodes to lookup: 5822
Qnodes from file: 5664
_centroid_of_lof: Missing 139 of 1860
Outlier removal generates 1033 lof-voted candidates
score-using-embedding Time: 358.82942748069763s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9851958751678467s
compute-tf-idf-class_count Time: 369.1953582763672s
compute-tf-idf-p

34it [1:27:59, 317.48s/it]

split_3_4.csv: 35 of 40
align-page-rank Time: 0.9604651927947998s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 68.8801589012146s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 149.3949375152588s
string-similarity-['jaro_winkler'] Time: 12.705446243286133s
string-similarity-['levenshtein'] Time: 80.4415442943573s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5469698905944824s
normalize-scores-des_cont_jaccard Time: 0.4033503532409668s
smallest-qnode-number Time: 1.9382994174957275s
mosaic-features Time: 0.3001687526702881s
create-singleton-feature Time: 2.8490023612976074s
vote-by-classifier Time: 0.7552905082702637s
Qnodes to lookup: 4287
Qnodes from file: 4173
_centroid_of_lof: Missing 107 of 1832
Outlier removal generates 1035 lof-voted candidates
score-using-embedding Time: 360.1274127960205s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9541661739349365s
compute-tf-idf-class_count Time: 369.89387130737305s
compute-tf-idf-pro

35it [1:34:20, 336.64s/it]

split_3_5.csv: 36 of 40
align-page-rank Time: 0.9989457130432129s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 69.25024938583374s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 149.36655712127686s
string-similarity-['jaro_winkler'] Time: 13.158967733383179s
string-similarity-['levenshtein'] Time: 79.98825335502625s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5624144077301025s
normalize-scores-des_cont_jaccard Time: 0.40759825706481934s
smallest-qnode-number Time: 1.9363396167755127s
mosaic-features Time: 0.3013439178466797s
create-singleton-feature Time: 2.4601247310638428s
vote-by-classifier Time: 1.1176466941833496s
Qnodes to lookup: 3986
Qnodes from file: 3889
_centroid_of_lof: Missing 94 of 1628
Outlier removal generates 920 lof-voted candidates
score-using-embedding Time: 356.68072962760925s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9948852062225342s
compute-tf-idf-class_count Time: 367.9118103981018s
compute-tf-idf-p

36it [1:40:40, 349.66s/it]

split_3_6.csv: 37 of 40
align-page-rank Time: 0.9666447639465332s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 71.38448691368103s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 158.45361137390137s
string-similarity-['jaro_winkler'] Time: 13.920427322387695s
string-similarity-['levenshtein'] Time: 80.0201735496521s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5579187870025635s
normalize-scores-des_cont_jaccard Time: 0.40819621086120605s
smallest-qnode-number Time: 1.8885338306427002s
mosaic-features Time: 0.30228662490844727s
create-singleton-feature Time: 2.7536470890045166s
vote-by-classifier Time: 0.8426344394683838s
Qnodes to lookup: 4262
Qnodes from file: 4192
_centroid_of_lof: Missing 46 of 968
Outlier removal generates 553 lof-voted candidates
score-using-embedding Time: 371.0001118183136s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9828605651855469s
compute-tf-idf-class_count Time: 381.87881422042847s
compute-tf-idf-pr

37it [1:47:14, 363.07s/it]

split_3_7.csv: 38 of 40
align-page-rank Time: 0.9567022323608398s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 71.29396724700928s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 153.96324944496155s
string-similarity-['jaro_winkler'] Time: 12.853589057922363s
string-similarity-['levenshtein'] Time: 79.8963475227356s
string-similarity-['jaccard:tokenizer=word'] Time: 1.5176613330841064s
normalize-scores-des_cont_jaccard Time: 0.4012172222137451s
smallest-qnode-number Time: 1.9884123802185059s
mosaic-features Time: 0.2955818176269531s
create-singleton-feature Time: 2.434213161468506s
vote-by-classifier Time: 1.7708020210266113s
Qnodes to lookup: 3726
Qnodes from file: 3630
_centroid_of_lof: Missing 83 of 1276
Outlier removal generates 716 lof-voted candidates
score-using-embedding Time: 365.863094329834s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9690451622009277s
compute-tf-idf-class_count Time: 375.6534779071808s
compute-tf-idf-proper

38it [1:53:42, 370.33s/it]

split_3_8.csv: 39 of 40
align-page-rank Time: 1.0018589496612549s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 61.890140533447266s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 130.56145191192627s
string-similarity-['jaro_winkler'] Time: 11.904874324798584s
string-similarity-['levenshtein'] Time: 71.60157465934753s
string-similarity-['jaccard:tokenizer=word'] Time: 1.7614617347717285s
normalize-scores-des_cont_jaccard Time: 0.39958667755126953s
smallest-qnode-number Time: 2.393639087677002s
mosaic-features Time: 0.28822803497314453s
create-singleton-feature Time: 2.805938243865967s
vote-by-classifier Time: 1.7723073959350586s
Qnodes to lookup: 7701
Qnodes from file: 7458
_centroid_of_lof: Missing 478 of 6086
Outlier removal generates 3365 lof-voted candidates
score-using-embedding Time: 327.4672396183014s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.9420521259307861s
compute-tf-idf-class_count Time: 336.2871313095093s
compute-tf-idf-

39it [1:59:33, 364.80s/it]

split_3_9.csv: 40 of 40
align-page-rank Time: 1.0586178302764893s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 67.3774425983429s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 142.59416317939758s
string-similarity-['jaro_winkler'] Time: 12.160614013671875s
string-similarity-['levenshtein'] Time: 76.88030791282654s
string-similarity-['jaccard:tokenizer=word'] Time: 1.7703096866607666s
normalize-scores-des_cont_jaccard Time: 0.406919002532959s
smallest-qnode-number Time: 1.8648591041564941s
mosaic-features Time: 0.2920200824737549s
create-singleton-feature Time: 3.3149168491363525s
vote-by-classifier Time: 0.7969043254852295s
Qnodes to lookup: 7341
Qnodes from file: 7117
_centroid_of_lof: Missing 290 of 3444
Outlier removal generates 1897 lof-voted candidates
score-using-embedding Time: 346.1379041671753s
generate-reciprocal-rank-lof-graph-embedding-score Time: 1.1644363403320312s
compute-tf-idf-class_count Time: 357.21888852119446s
compute-tf-idf-pr

40it [2:05:45, 188.63s/it]


In [None]:
def concat_files(i_path, output_file, sep=None):
    """Concatenate every delimited file in a directory into a single file.

    Args:
        i_path: Directory whose entries (everything matched by ``{i_path}/*``)
            are read with ``pd.read_csv``.
        output_file: Path of the combined file; it is written without the
            DataFrame index.
        sep: Field separator used for both reading and writing. ``None`` or an
            empty string selects the pandas default (a comma).

    Raises:
        ValueError: If ``i_path`` matches no files.
    """
    # glob yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the combined file reproducible across runs.
    input_files = sorted(glob.glob(f'{i_path}/*'))
    if not input_files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f'No files to concatenate in {i_path!r}')

    delimiter = sep if sep else ','
    frames = [pd.read_csv(path, sep=delimiter) for path in input_files]
    pd.concat(frames).to_csv(output_file, index=False, sep=delimiter)

In [17]:
def run_context_match(features_path, context_path, custom_file, output_path):
    file_list = glob.glob(features_path + '/*.csv')
    for i, f in tqdm(enumerate(file_list)):
        if i > -1:
            f_name = f.split('/')[-1]
            print(f'{f_name}: {i+1} of {len(file_list)}')
            context_file = f"{context_path}/{f_name[:-4]}_context.tsv"
            output_file = f"{output_path}/{f_name}"
            !tl context-match --custom-context-file $custom_file \
            --context-file $context_file --string-separator ";" \
            --similarity-string-threshold $string_threshold $f > $output_file
        

In [18]:
# Directories for the context-match step, derived from HOME_DIR so they follow
# the dataset root defined in the configuration cell instead of repeating it.
context_path = f'{HOME_DIR}/temp/context'                  # per-split *_context.tsv files
features_path = f'{HOME_DIR}/features'                     # per-split feature CSVs (input)
context_output_path = f'{HOME_DIR}/features_with_context'  # context-match results (output)

In [19]:
# NOTE(review): the custom context file is given explicitly here; the
# `custom_context_file` value in the configuration cell points elsewhere.
run_context_match(
    features_path=features_path,
    context_path=context_path,
    custom_file='/data/amandeep/nih-dataset/coauthors.context.tsv.gz',
    output_path=context_output_path,
)

0it [00:00, ?it/s]

split_0_0.csv: 1 of 40


1it [08:55, 535.23s/it]

split_0_1.csv: 2 of 40


2it [17:29, 523.16s/it]

split_0_2.csv: 3 of 40


3it [25:47, 511.48s/it]

split_0_3.csv: 4 of 40


4it [33:52, 501.14s/it]

split_0_4.csv: 5 of 40


5it [42:15, 501.69s/it]

split_0_5.csv: 6 of 40


6it [50:25, 497.57s/it]

split_0_6.csv: 7 of 40


7it [58:40, 496.97s/it]

split_0_7.csv: 8 of 40


8it [1:06:40, 491.50s/it]

split_0_8.csv: 9 of 40


9it [1:14:15, 480.05s/it]

split_0_9.csv: 10 of 40


10it [1:21:54, 473.60s/it]

split_1_0.csv: 11 of 40


11it [1:54:23, 924.97s/it]

split_1_1.csv: 12 of 40


12it [2:22:01, 1148.14s/it]

split_1_2.csv: 13 of 40


13it [2:50:50, 1324.01s/it]

split_1_3.csv: 14 of 40


14it [3:18:33, 1426.45s/it]

split_1_4.csv: 15 of 40


15it [3:46:47, 1507.27s/it]

split_1_5.csv: 16 of 40


16it [4:14:04, 1546.24s/it]

split_1_6.csv: 17 of 40


17it [4:40:30, 1558.16s/it]

split_1_7.csv: 18 of 40


18it [5:08:59, 1603.55s/it]

split_1_8.csv: 19 of 40


19it [5:34:20, 1578.75s/it]

split_1_9.csv: 20 of 40


20it [6:02:30, 1612.01s/it]

split_2_0.csv: 21 of 40


21it [6:37:42, 1762.00s/it]

split_2_1.csv: 22 of 40


22it [7:11:16, 1837.78s/it]

split_2_2.csv: 23 of 40


23it [7:43:54, 1873.99s/it]

split_2_3.csv: 24 of 40


24it [8:16:48, 1903.89s/it]

split_2_4.csv: 25 of 40


25it [8:50:23, 1937.24s/it]

split_2_5.csv: 26 of 40


26it [9:23:28, 1951.58s/it]

split_2_6.csv: 27 of 40


27it [9:58:24, 1994.80s/it]

split_2_7.csv: 28 of 40


28it [10:33:25, 2026.71s/it]

split_2_8.csv: 29 of 40


29it [10:55:36, 1818.08s/it]

split_2_9.csv: 30 of 40


30it [11:24:30, 1792.65s/it]

split_3_0.csv: 31 of 40


31it [11:56:16, 1826.70s/it]

split_3_1.csv: 32 of 40


32it [12:27:33, 1841.84s/it]

split_3_2.csv: 33 of 40


33it [12:58:28, 1845.91s/it]

split_3_3.csv: 34 of 40


34it [13:28:52, 1839.25s/it]

split_3_4.csv: 35 of 40


35it [13:59:57, 1847.05s/it]

split_3_5.csv: 36 of 40


36it [14:30:52, 1849.45s/it]

split_3_6.csv: 37 of 40


37it [15:01:25, 1844.54s/it]

split_3_7.csv: 38 of 40


38it [15:31:38, 1835.00s/it]

split_3_8.csv: 39 of 40


39it [15:59:44, 1790.19s/it]

split_3_9.csv: 40 of 40


40it [16:29:43, 1484.60s/it]


In [20]:
# Directory that receives the per-split model predictions; derived from
# HOME_DIR so it follows the dataset root set in the configuration cell.
prediction_path = f'{HOME_DIR}/predictions'
# Pure-Python equivalent of `mkdir -p`: creates parents, no error if it exists.
os.makedirs(prediction_path, exist_ok=True)

In [21]:
# Comma-separated feature names handed to `tl predict-using-model --features`.
# `features` is defined in an earlier cell (not visible in this part of the notebook).
features_str = ",".join(features)
def run_prediction(features_path, prediction_path):
    """Run `tl predict-using-model` on every CSV found directly in `features_path`.

    For each input file the command's stdout is redirected to a file with the same
    name inside `prediction_path`; the score column is named `siamese_prediction`.

    Reads the notebook globals `ranking_model_file_path`, `min_max_scaler_path`
    and `features_str` through IPython's `$name` expansion.

    Args:
        features_path: directory containing the per-split feature CSVs.
        prediction_path: existing directory that receives the prediction CSVs.

    Returns:
        None.
    """
    # NOTE(review): glob order is not guaranteed to be sorted, so splits may be
    # processed in a different order from run to run.
    file_list = glob.glob(features_path + '/*.csv')
    # tqdm wraps the enumerate object, so it has no total and only shows "Nit".
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        output_file = f"{prediction_path}/{f_name}"
        # NOTE(review): $f and $output_file are expanded unquoted, so a path
        # containing spaces would be split by the shell.
        !tl predict-using-model -o siamese_prediction \
        --ranking-model $ranking_model_file_path \
        --features $features_str \
        --normalization-factor $min_max_scaler_path $f > $output_file

In [22]:
# Score every feature split found in context_output_path; one CSV per split is written to prediction_path.
run_prediction(context_output_path, prediction_path)

0it [00:00, ?it/s]

split_0_0.csv: 1 of 40
predict-using-model Time: 4.797147989273071s


1it [00:10, 10.57s/it]

split_0_1.csv: 2 of 40
predict-using-model Time: 4.323748350143433s


2it [00:20,  9.93s/it]

split_0_2.csv: 3 of 40
predict-using-model Time: 4.141092538833618s


3it [00:29,  9.55s/it]

split_0_3.csv: 4 of 40
predict-using-model Time: 4.062241554260254s


4it [00:38,  9.50s/it]

split_0_4.csv: 5 of 40
predict-using-model Time: 4.240553379058838s


5it [00:48,  9.69s/it]

split_0_5.csv: 6 of 40
predict-using-model Time: 4.036336898803711s


6it [00:57,  9.50s/it]

split_0_6.csv: 7 of 40
predict-using-model Time: 4.240834712982178s


7it [01:06,  9.41s/it]

split_0_7.csv: 8 of 40
predict-using-model Time: 4.141998291015625s


8it [01:16,  9.39s/it]

split_0_8.csv: 9 of 40
predict-using-model Time: 4.2000837326049805s


9it [01:25,  9.39s/it]

split_0_9.csv: 10 of 40
predict-using-model Time: 4.21091628074646s


10it [01:34,  9.30s/it]

split_1_0.csv: 11 of 40
predict-using-model Time: 4.49596905708313s


11it [01:46, 10.00s/it]

split_1_1.csv: 12 of 40
predict-using-model Time: 4.288828611373901s


12it [01:57, 10.23s/it]

split_1_2.csv: 13 of 40
predict-using-model Time: 5.0411858558654785s


13it [02:09, 10.98s/it]

split_1_3.csv: 14 of 40
predict-using-model Time: 4.6633217334747314s


14it [02:21, 11.15s/it]

split_1_4.csv: 15 of 40
predict-using-model Time: 4.735842704772949s


15it [02:33, 11.31s/it]

split_1_5.csv: 16 of 40
predict-using-model Time: 4.329231023788452s


16it [02:44, 11.24s/it]

split_1_6.csv: 17 of 40
predict-using-model Time: 4.514092445373535s


17it [02:55, 11.42s/it]

split_1_7.csv: 18 of 40
predict-using-model Time: 4.644038438796997s


18it [03:07, 11.37s/it]

split_1_8.csv: 19 of 40
predict-using-model Time: 4.527430534362793s


19it [03:19, 11.50s/it]

split_1_9.csv: 20 of 40
predict-using-model Time: 4.2891685962677s


20it [03:30, 11.46s/it]

split_2_0.csv: 21 of 40
predict-using-model Time: 4.419566869735718s


21it [03:42, 11.54s/it]

split_2_1.csv: 22 of 40
predict-using-model Time: 4.513932228088379s


22it [03:53, 11.63s/it]

split_2_2.csv: 23 of 40
predict-using-model Time: 4.778185844421387s


23it [04:06, 11.75s/it]

split_2_3.csv: 24 of 40
predict-using-model Time: 4.97555136680603s


24it [04:18, 11.83s/it]

split_2_4.csv: 25 of 40
predict-using-model Time: 4.743338584899902s


25it [04:29, 11.73s/it]

split_2_5.csv: 26 of 40
predict-using-model Time: 4.858398914337158s


26it [04:41, 11.92s/it]

split_2_6.csv: 27 of 40
predict-using-model Time: 4.796367645263672s


27it [04:54, 12.00s/it]

split_2_7.csv: 28 of 40
predict-using-model Time: 4.876089811325073s


28it [05:06, 11.99s/it]

split_2_8.csv: 29 of 40
predict-using-model Time: 4.336930751800537s


29it [05:16, 11.63s/it]

split_2_9.csv: 30 of 40
predict-using-model Time: 4.520034313201904s


30it [05:28, 11.60s/it]

split_3_0.csv: 31 of 40
predict-using-model Time: 4.332199811935425s


31it [05:40, 11.67s/it]

split_3_1.csv: 32 of 40
predict-using-model Time: 4.4243903160095215s


32it [05:52, 11.79s/it]

split_3_2.csv: 33 of 40
predict-using-model Time: 4.572021007537842s


33it [06:04, 11.86s/it]

split_3_3.csv: 34 of 40
predict-using-model Time: 4.807182788848877s


34it [06:16, 11.98s/it]

split_3_4.csv: 35 of 40
predict-using-model Time: 4.282679080963135s


35it [06:28, 12.01s/it]

split_3_5.csv: 36 of 40
predict-using-model Time: 4.642938852310181s


36it [06:41, 12.16s/it]

split_3_6.csv: 37 of 40
predict-using-model Time: 4.646281480789185s


37it [06:53, 12.22s/it]

split_3_7.csv: 38 of 40
predict-using-model Time: 4.777512311935425s


38it [07:06, 12.32s/it]

split_3_8.csv: 39 of 40
predict-using-model Time: 4.266456842422485s


39it [07:17, 12.01s/it]

split_3_9.csv: 40 of 40
predict-using-model Time: 4.788407325744629s


40it [07:29, 11.24s/it]


In [23]:
# Directory that receives one colorized .xlsx per prediction split (written by topk_color).
# Built from HOME_DIR so it follows the config cell instead of repeating the absolute
# path; with the configured HOME_DIR this is the same directory as before.
colorized_path = f'{HOME_DIR}/colorized'
# Same semantics as `mkdir -p` (create parents, ignore an existing directory) without
# passing the path through a shell, so spaces or shell metacharacters cannot break it.
os.makedirs(colorized_path, exist_ok=True)

In [24]:
def topk_color(prediction_path, colorized_path):
    """Run `tl get-kg-links / add-color` on every CSV found directly in `prediction_path`.

    Each input `<name>.csv` produces `<name>.xlsx` inside `colorized_path`. Both
    sub-commands are given the notebook global `final_score_column` (via IPython's
    `$name` expansion) and a literal `-k 5`.

    Args:
        prediction_path: directory containing the per-split prediction CSVs.
        colorized_path: existing directory that receives the .xlsx files.

    Returns:
        None.
    """
    # NOTE(review): glob order is not guaranteed to be sorted.
    file_list = glob.glob(prediction_path + '/*.csv')
    # tqdm wraps the enumerate object, so it has no total and only shows "Nit".
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        # f_name[:-4] drops the 4-character ".csv" suffix.
        output_file = f"{colorized_path}/{f_name[:-4]}.xlsx"
        # The leading "/" on the continuation line separates the two tl sub-commands
        # (presumably tl's pipeline syntax -- confirm against the table-linker docs).
        # NOTE(review): $f is expanded unquoted, unlike "$output_file".
        !tl get-kg-links -c $final_score_column -k 5 --k-rows $f \
        / add-color -c "$final_score_column" -k 5 --output "$output_file"

In [25]:
# Build one colorized .xlsx in colorized_path for every prediction CSV in prediction_path.
topk_color(prediction_path, colorized_path)

0it [00:00, ?it/s]

split_0_0.csv: 1 of 40
get-kg-links-siamese_prediction Time: 3.5695652961730957s
add-color Time: 0.9500653743743896s


1it [00:10, 10.02s/it]

split_0_1.csv: 2 of 40
get-kg-links-siamese_prediction Time: 3.1562962532043457s
add-color Time: 0.8754355907440186s


2it [00:19,  9.77s/it]

split_0_2.csv: 3 of 40
get-kg-links-siamese_prediction Time: 3.1087517738342285s
add-color Time: 0.8810536861419678s


3it [00:29,  9.61s/it]

split_0_3.csv: 4 of 40
get-kg-links-siamese_prediction Time: 3.0367207527160645s
add-color Time: 0.8764631748199463s


4it [00:38,  9.46s/it]

split_0_4.csv: 5 of 40
get-kg-links-siamese_prediction Time: 3.174614191055298s
add-color Time: 0.852297306060791s


5it [00:48,  9.59s/it]

split_0_5.csv: 6 of 40
get-kg-links-siamese_prediction Time: 3.5338332653045654s
add-color Time: 0.8408055305480957s


6it [00:57,  9.65s/it]

split_0_6.csv: 7 of 40
get-kg-links-siamese_prediction Time: 3.0713090896606445s
add-color Time: 0.8683221340179443s


7it [01:07,  9.51s/it]

split_0_7.csv: 8 of 40
get-kg-links-siamese_prediction Time: 3.146402359008789s
add-color Time: 0.8479042053222656s


8it [01:16,  9.49s/it]

split_0_8.csv: 9 of 40
get-kg-links-siamese_prediction Time: 3.070218324661255s
add-color Time: 0.8730368614196777s


9it [01:26,  9.53s/it]

split_0_9.csv: 10 of 40
get-kg-links-siamese_prediction Time: 3.4023256301879883s
add-color Time: 0.867779016494751s


10it [01:35,  9.61s/it]

split_1_0.csv: 11 of 40
get-kg-links-siamese_prediction Time: 3.4714574813842773s
add-color Time: 0.8327503204345703s


11it [01:46,  9.82s/it]

split_1_1.csv: 12 of 40
get-kg-links-siamese_prediction Time: 3.4646682739257812s
add-color Time: 0.8299241065979004s


12it [01:56,  9.98s/it]

split_1_2.csv: 13 of 40
get-kg-links-siamese_prediction Time: 3.4258387088775635s
add-color Time: 0.8433747291564941s


13it [02:06, 10.06s/it]

split_1_3.csv: 14 of 40
get-kg-links-siamese_prediction Time: 3.34476375579834s
add-color Time: 0.8604891300201416s


14it [02:17, 10.24s/it]

split_1_4.csv: 15 of 40
get-kg-links-siamese_prediction Time: 3.3901212215423584s
add-color Time: 0.8506553173065186s


15it [02:27, 10.23s/it]

split_1_5.csv: 16 of 40
get-kg-links-siamese_prediction Time: 3.4789376258850098s
add-color Time: 0.8505980968475342s


16it [02:37, 10.23s/it]

split_1_6.csv: 17 of 40
get-kg-links-siamese_prediction Time: 3.927231788635254s
add-color Time: 0.8532097339630127s


17it [02:48, 10.36s/it]

split_1_7.csv: 18 of 40
get-kg-links-siamese_prediction Time: 3.596982717514038s
add-color Time: 0.8506174087524414s


18it [02:58, 10.35s/it]

split_1_8.csv: 19 of 40
get-kg-links-siamese_prediction Time: 3.414686679840088s
add-color Time: 0.8181829452514648s


19it [03:08, 10.20s/it]

split_1_9.csv: 20 of 40
get-kg-links-siamese_prediction Time: 3.418820381164551s
add-color Time: 0.9064044952392578s


20it [03:19, 10.30s/it]

split_2_0.csv: 21 of 40
get-kg-links-siamese_prediction Time: 3.4765284061431885s
add-color Time: 0.8239572048187256s


21it [03:29, 10.34s/it]

split_2_1.csv: 22 of 40
get-kg-links-siamese_prediction Time: 3.4405221939086914s
add-color Time: 0.8213872909545898s


22it [03:39, 10.23s/it]

split_2_2.csv: 23 of 40
get-kg-links-siamese_prediction Time: 3.8976736068725586s
add-color Time: 0.8087537288665771s


23it [03:50, 10.51s/it]

split_2_3.csv: 24 of 40
get-kg-links-siamese_prediction Time: 3.4199416637420654s
add-color Time: 0.8317258358001709s


24it [04:01, 10.44s/it]

split_2_4.csv: 25 of 40
get-kg-links-siamese_prediction Time: 3.3803727626800537s
add-color Time: 0.8288850784301758s


25it [04:11, 10.36s/it]

split_2_5.csv: 26 of 40
get-kg-links-siamese_prediction Time: 3.594879388809204s
add-color Time: 0.8140954971313477s


26it [04:21, 10.39s/it]

split_2_6.csv: 27 of 40
get-kg-links-siamese_prediction Time: 3.575378179550171s
add-color Time: 0.7958765029907227s


27it [04:32, 10.50s/it]

split_2_7.csv: 28 of 40
get-kg-links-siamese_prediction Time: 3.3726232051849365s
add-color Time: 0.8033864498138428s


28it [04:43, 10.56s/it]

split_2_8.csv: 29 of 40
get-kg-links-siamese_prediction Time: 3.2376821041107178s
add-color Time: 0.7221157550811768s


29it [04:53, 10.38s/it]

split_2_9.csv: 30 of 40
get-kg-links-siamese_prediction Time: 3.4694011211395264s
add-color Time: 0.6996941566467285s


30it [05:02, 10.21s/it]

split_3_0.csv: 31 of 40
get-kg-links-siamese_prediction Time: 3.3562047481536865s
add-color Time: 0.8355593681335449s


31it [05:13, 10.21s/it]

split_3_1.csv: 32 of 40
get-kg-links-siamese_prediction Time: 4.069069147109985s
add-color Time: 0.8135066032409668s


32it [05:24, 10.40s/it]

split_3_2.csv: 33 of 40
get-kg-links-siamese_prediction Time: 3.4110913276672363s
add-color Time: 0.8383035659790039s


33it [05:33, 10.26s/it]

split_3_3.csv: 34 of 40
get-kg-links-siamese_prediction Time: 3.738407850265503s
add-color Time: 0.8228514194488525s


34it [05:44, 10.32s/it]

split_3_4.csv: 35 of 40
get-kg-links-siamese_prediction Time: 3.3720908164978027s
add-color Time: 0.8112969398498535s


35it [05:54, 10.34s/it]

split_3_5.csv: 36 of 40
get-kg-links-siamese_prediction Time: 3.6143009662628174s
add-color Time: 0.7425711154937744s


36it [06:05, 10.31s/it]

split_3_6.csv: 37 of 40
get-kg-links-siamese_prediction Time: 3.886761426925659s
add-color Time: 0.7476999759674072s


37it [06:15, 10.38s/it]

split_3_7.csv: 38 of 40
get-kg-links-siamese_prediction Time: 4.106557607650757s
add-color Time: 0.858238697052002s


38it [06:26, 10.63s/it]

split_3_8.csv: 39 of 40
get-kg-links-siamese_prediction Time: 4.09746527671814s
add-color Time: 0.8207793235778809s


39it [06:38, 10.83s/it]

split_3_9.csv: 40 of 40
get-kg-links-siamese_prediction Time: 3.471024513244629s
add-color Time: 0.7944426536560059s


40it [06:48, 10.20s/it]


In [48]:
def add_NILS(colorized_path, output_path, threshold=0.9):
    """Mark low-confidence links as NIL and save every workbook as CSV.

    Each `<name>.xlsx` in `colorized_path` is read, every row whose
    `siamese_prediction` is strictly below `threshold` gets `kg_id = 'NIL'`,
    and the frame is written to `<output_path>/<name>.csv`.

    Args:
        colorized_path: directory containing the .xlsx files to process.
        output_path: existing directory that receives the CSV files.
        threshold: minimum `siamese_prediction` for a link to be kept. The
            default matches the previously hard-coded 0.9 (the config cell also
            defines `siamese_threshold = 0.9`; pass it to keep both in sync).

    Returns:
        None.
    """
    # Sorted so the processing order (and the printed log) is reproducible.
    file_list = sorted(glob.glob(colorized_path + '/*.xlsx'))
    # Passing `total` lets tqdm draw a real progress bar around the enumerate object.
    for i, f in tqdm(enumerate(file_list), total=len(file_list)):
        f_name = os.path.basename(f)
        print(f'{f_name}: {i+1} of {len(file_list)}')
        stem = os.path.splitext(f_name)[0]
        output_file = f"{output_path}/{stem}.csv"

        df = pd.read_excel(f)
        # A missing score compares False here, so such rows keep their kg_id.
        low_confidence = df['siamese_prediction'].astype(float) < threshold
        df.loc[low_confidence, 'kg_id'] = 'NIL'
        df.to_csv(output_file, index=False)

In [26]:
def add_NIL(colorized_path, output_path):
    """Pick one link per `row` value from each colorized workbook, falling back to NIL.

    For every `<name>.xlsx` in `colorized_path` this builds a synthetic NIL
    candidate per distinct `row` value, inserts it after each block of five input
    rows, keeps candidates whose `siamese_prediction` exceeds the notebook global
    `siamese_threshold` together with the NIL rows, and finally keeps the first
    surviving entry per `row` value. The result is written to
    `<output_path>/<name>.csv`.

    NOTE(review): within this part of the notebook only `add_NILS` is called;
    no call to this function is visible here.

    Args:
        colorized_path: directory containing the .xlsx files to process.
        output_path: existing directory that receives the CSV files.

    Returns:
        None.
    """
    file_list = glob.glob(colorized_path + '/*.xlsx')
    for i, f in tqdm(enumerate(file_list)):
        f_name = f.split('/')[-1]
        print(f'{f_name}: {i+1} of {len(file_list)}')
        # f_name[:-5] drops the 5-character ".xlsx" suffix.
        output_file = f"{output_path}/{f_name[:-5]}.csv"
        
        df = pd.read_excel(f)
        ls = df.columns
        # Template for a synthetic NIL row: one empty string per column.
        # The inner loops below reuse the name `i`; enumerate rebinds it on the
        # next file, so the outer iteration is unaffected.
        arr = []
        for i in range(0, len(ls)):
            arr.append('')

        # Positional writes into the template. NOTE(review): this assumes a fixed
        # column layout; index 7 is presumably `kg_id`, since the filter further
        # down selects NIL rows via new['kg_id'] == 'NIL' -- confirm.
        arr[0] = 0
        arr[7] = 'NIL'
        arr[len(arr)-1] = ''
        df_list = df.values.tolist()
        nil = pd.DataFrame(columns = df.columns)
        new = pd.DataFrame(columns = df.columns)
        done = []
        b = []  # never used below
        # Build one NIL row per distinct `row` value, in order of first appearance.
        for i in range (0, len(df)):
            if df['row'][i] in done:
                continue


            # Positions 1-6 are filled from the first record seen for this `row`.
            arr[1] = df['row'][i]
            arr[2] = df['label'][i]
            arr[3] = df['context'][i]
            arr[4] = df['filename'][i]
            arr[5] = df['column-id'][i]
            arr[6] = df['label_clean'][i]


            nil.loc[len(nil)] = arr


            done.append(df['row'][i])
        nil_list = nil.values.tolist()
        # Copy the input rows and insert a NIL row after every fifth one.
        # NOTE(review): this relies on each `row` value having exactly five
        # consecutive candidates so that block number int(i/5) lines up with
        # nil_list -- TODO confirm against the get-kg-links output.
        for i in range(0, len(df)):
            new.loc[len(new)] = df_list[i]
            if i % 5 == 4:
                new.loc[len(new)] = (nil_list[int(i/5)])
        # Turn the empty-string placeholders of the NIL rows into 0 and make both
        # score columns numeric.
        # NOTE(review): new[col][i] = ... is chained assignment; pandas can emit
        # SettingWithCopyWarning for it and, with copy-on-write enabled, it does
        # not modify `new`.
        for i in range(0, len(new)):
            #print(new['siamese_prediction'][i])
            if (new['siamese_prediction'][i] == ''):
                new['siamese_prediction'][i] = 0
            if (new['context_score'][i] == ''):
                new['context_score'][i] = 0
            new['siamese_prediction'][i] = float(new['siamese_prediction'][i])
            new['context_score'][i] = float(new['context_score'][i])
        # Keep candidates strictly above the threshold, plus every NIL row.
        select = new[(new['siamese_prediction'] > siamese_threshold) | (new['kg_id'] == 'NIL')]
        select = select.reset_index(drop = True)
        final = pd.DataFrame(columns = df.columns)
        done = []
        select_list = select.values.tolist()
        # First surviving entry per `row` value wins; a NIL row is only reached
        # when no earlier candidate for that `row` passed the threshold.
        for i in range(0, len(select)):
            if select['row'][i] in done:
                continue
            final.loc[len(final)] = select_list[i]
            done.append(select['row'][i])
        final.to_csv(output_file, index=False)

In [27]:
# Directory that receives one CSV per split after NIL marking (written by add_NILS / add_NIL).
# Built from HOME_DIR so it follows the config cell instead of repeating the absolute
# path; with the configured HOME_DIR this is the same directory as before.
nil_path = f'{HOME_DIR}/with_nils'
# Same semantics as `mkdir -p` (create parents, ignore an existing directory) without
# passing the path through a shell, so spaces or shell metacharacters cannot break it.
os.makedirs(nil_path, exist_ok=True)

In [49]:
# Mark low-confidence links as NIL with add_NILS (add_NIL is defined above but not used here).
add_NILS(colorized_path, nil_path)

0it [00:00, ?it/s]

split_0_0.xlsx: 1 of 40


1it [00:01,  1.68s/it]

split_0_1.xlsx: 2 of 40


2it [00:03,  1.56s/it]

split_0_2.xlsx: 3 of 40


3it [00:04,  1.49s/it]

split_0_3.xlsx: 4 of 40


4it [00:05,  1.45s/it]

split_0_4.xlsx: 5 of 40


5it [00:07,  1.46s/it]

split_0_5.xlsx: 6 of 40


6it [00:08,  1.44s/it]

split_0_6.xlsx: 7 of 40


7it [00:10,  1.44s/it]

split_0_7.xlsx: 8 of 40


8it [00:11,  1.42s/it]

split_0_8.xlsx: 9 of 40


9it [00:13,  1.42s/it]

split_0_9.xlsx: 10 of 40


10it [00:14,  1.41s/it]

split_1_0.xlsx: 11 of 40


11it [00:15,  1.40s/it]

split_1_1.xlsx: 12 of 40


12it [00:17,  1.40s/it]

split_1_2.xlsx: 13 of 40


13it [00:18,  1.40s/it]

split_1_3.xlsx: 14 of 40


14it [00:20,  1.41s/it]

split_1_4.xlsx: 15 of 40


15it [00:21,  1.40s/it]

split_1_5.xlsx: 16 of 40


16it [00:22,  1.40s/it]

split_1_6.xlsx: 17 of 40


17it [00:24,  1.41s/it]

split_1_7.xlsx: 18 of 40


18it [00:25,  1.41s/it]

split_1_8.xlsx: 19 of 40


19it [00:27,  1.40s/it]

split_1_9.xlsx: 20 of 40


20it [00:28,  1.41s/it]

split_2_0.xlsx: 21 of 40


21it [00:29,  1.37s/it]

split_2_1.xlsx: 22 of 40


22it [00:31,  1.34s/it]

split_2_2.xlsx: 23 of 40


23it [00:32,  1.31s/it]

split_2_3.xlsx: 24 of 40


24it [00:33,  1.29s/it]

split_2_4.xlsx: 25 of 40


25it [00:34,  1.31s/it]

split_2_5.xlsx: 26 of 40


26it [00:36,  1.30s/it]

split_2_6.xlsx: 27 of 40


27it [00:37,  1.29s/it]

split_2_7.xlsx: 28 of 40


28it [00:38,  1.28s/it]

split_2_8.xlsx: 29 of 40


29it [00:39,  1.22s/it]

split_2_9.xlsx: 30 of 40


30it [00:40,  1.21s/it]

split_3_0.xlsx: 31 of 40


31it [00:42,  1.27s/it]

split_3_1.xlsx: 32 of 40


32it [00:43,  1.29s/it]

split_3_2.xlsx: 33 of 40


33it [00:44,  1.30s/it]

split_3_3.xlsx: 34 of 40


34it [00:46,  1.31s/it]

split_3_4.xlsx: 35 of 40


35it [00:47,  1.32s/it]

split_3_5.xlsx: 36 of 40


36it [00:49,  1.46s/it]

split_3_6.xlsx: 37 of 40


37it [00:51,  1.55s/it]

split_3_7.xlsx: 38 of 40


38it [00:52,  1.53s/it]

split_3_8.xlsx: 39 of 40


39it [00:54,  1.47s/it]

split_3_9.xlsx: 40 of 40


40it [00:55,  1.38s/it]


In [45]:
def count_non_nils(nil_path, column=2):
    """Print how many rows of one table column exist and how many of them are linked.

    All `*.csv` files directly inside `nil_path` are concatenated. Two numbers
    are printed: the number of rows whose `column` field equals `column`, and
    how many of those rows carry a real `kg_id` (neither missing, blank, nor
    the literal 'NIL').

    Args:
        nil_path: directory containing the CSV files written by the NIL step.
        column: value of the `column` field to count; defaults to 2, the value
            that was previously hard-coded.

    Returns:
        None; the two counts are printed.

    Raises:
        ValueError: if `nil_path` contains no CSV file.
    """
    file_list = sorted(glob.glob(nil_path + '/*.csv'))
    if not file_list:
        raise ValueError(f'no CSV files found in {nil_path}')

    frames = [pd.read_csv(f) for f in tqdm(file_list)]
    # ignore_index gives a unique index, so the boolean masks below align cleanly.
    df = pd.concat(frames, ignore_index=True)

    in_column = df['column'] == column
    print(int(in_column.sum()))

    # read_csv turns empty cells into NaN, so comparing against '' never matched and
    # unlinked rows were counted as linked. Normalise missing values to '' first.
    kg_id = df['kg_id'].fillna('').astype(str).str.strip()
    is_linked = in_column & ~kg_id.isin(['', 'NIL'])
    print(int(is_linked.sum()))

        

In [46]:
# Sanity check: prints the number of column-2 rows, then how many of them pass the kg_id filter.
count_non_nils(nil_path)

40it [00:00, 99.01it/s] 


618
618


In [50]:
def join(nil_path, output_file='/tmp/joined.csv'):
    """Concatenate every CSV in `nil_path` into a single CSV file.

    Args:
        nil_path: directory containing the per-split CSV files.
        output_file: path of the combined CSV; defaults to the previously
            hard-coded '/tmp/joined.csv'.

    Returns:
        None.

    Raises:
        ValueError: if `nil_path` contains no CSV file.
    """
    # Sorted so the row order of the combined file is reproducible between runs.
    file_list = sorted(glob.glob(nil_path + '/*.csv'))
    if not file_list:
        raise ValueError(f'no CSV files found in {nil_path}')

    frames = [pd.read_csv(f) for f in tqdm(file_list)]
    pd.concat(frames).to_csv(output_file, index=False)

In [51]:
# Concatenate all per-split CSVs in nil_path into /tmp/joined.csv.
join(nil_path)

40it [00:02, 19.99it/s]


In [52]:
# Write a comma-separated copy of the tab-separated input table next to the original;
# the `tl join` cell below is given this .csv path.
csv_table_path = table_path[:-4] + '.csv'  # replace the last four characters (".tsv")
pd.read_csv(table_path, sep='\t').to_csv(csv_table_path, index=False)

In [53]:
# Run `tl join` with the CSV copy of the input table (-f) and /tmp/joined.csv (the file
# written by join() above), redirecting stdout to org_for_tl_with_qnode_joined.csv.
# NOTE(review): the paths are literal here rather than derived from HOME_DIR / table_path.
!tl join -c siamese_prediction -f /data/amandeep/nih-dataset/organization/org_for_tl_with_qnode.csv \
--extra-info /tmp/joined.csv > '/data/amandeep/nih-dataset/organization/org_for_tl_with_qnode_joined.csv'

join Time: 44.72819471359253s


In [54]:
def create_replace_nodes_mapping(joined_file, mapping_file):
    """Write a `same_as_item` node-replacement mapping from a joined linking table.

    For each row of `joined_file` and each linked field (name, city, state,
    country) an edge `node1 --same_as_item--> node2` is collected, where
    `node1` is the field's local node and `node2` its linked `*_kg_id`, with
    the field's `*_score` as `confidence`. The edges are then reduced:

    * repeated (node1, node2) pairs keep their first occurrence,
    * 'NIL' targets are removed,
    * a node1 that still maps to more than one target is ambiguous and is
      dropped entirely.

    The result is written to `mapping_file` as a tab-separated file with the
    columns node1, label, node2, confidence, ordered by node1.

    Args:
        joined_file: CSV holding `org_node`, `city_node`, `state_node`,
            `country_node` and, for each of name/city/state/country, the
            `<field>_kg_id` and `<field>_score` columns.
        mapping_file: path of the TSV file to write.

    Returns:
        None.
    """
    # (node column, linked-id column, score column) per field. The organisation
    # name is the only field whose node column is not called `<field>_node`.
    field_columns = [
        ('org_node', 'name_kg_id', 'name_score'),
        ('city_node', 'city_kg_id', 'city_score'),
        ('state_node', 'state_kg_id', 'state_score'),
        ('country_node', 'country_kg_id', 'country_score'),
    ]
    output_columns = ['node1', 'label', 'node2', 'confidence']

    joined = pd.read_csv(joined_file)

    records = []
    # Row-major order matters: it decides which duplicate is the "first" one below.
    for row in joined.to_dict('records'):
        for node_col, kg_id_col, score_col in field_columns:
            kg_id = row[kg_id_col]
            # read_csv turns empty cells into NaN; calling .strip() on that float
            # used to raise AttributeError, so missing ids are skipped explicitly.
            if pd.isna(kg_id) or str(kg_id).strip() == "":
                continue
            records.append({
                'node1': row[node_col],
                'label': 'same_as_item',
                'node2': kg_id,
                'confidence': row[score_col],
            })

    # Explicit columns keep the header intact even when no edge was collected.
    mapping = pd.DataFrame(records, columns=output_columns)
    mapping = mapping.dropna(subset=['node1'])
    mapping = mapping.drop_duplicates(subset=['node1', 'node2'])
    mapping = mapping[mapping['node2'] != 'NIL']
    # keep=False removes every node1 that still has more than one distinct target.
    mapping = mapping.drop_duplicates(subset=['node1'], keep=False)
    mapping = mapping.sort_values('node1', kind='stable')
    mapping.to_csv(mapping_file, sep='\t', index=False)

In [55]:
# Build replace_nodes_mapping.tsv from the joined table produced by the `tl join` cell above.
create_replace_nodes_mapping('/data/amandeep/nih-dataset/organization/org_for_tl_with_qnode_joined.csv', '/data/amandeep/nih-dataset/organization/replace_nodes_mapping.tsv')

In [60]:
def concat_org_files(input_dir="/data/amandeep/nih-dataset/organization/kgtk-files-nih-V3.0",
                     output_file='/data/amandeep/nih-dataset/organization/nih-org-kgtk.tsv'):
    """Concatenate every tab-separated file in `input_dir` into one TSV file.

    Args:
        input_dir: directory whose files matching `*tsv` are read. Defaults to
            the previously hard-coded kgtk-files-nih-V3.0 directory.
        output_file: path of the combined TSV. Defaults to the previously
            hard-coded nih-org-kgtk.tsv.

    Returns:
        None.

    Raises:
        ValueError: if no matching file is found in `input_dir`.
    """
    # Sorted so the row order of the combined file is reproducible between runs.
    file_list = sorted(glob.glob(f"{input_dir}/*tsv"))
    if not file_list:
        raise ValueError(f'no tsv files found in {input_dir}')

    # NOTE(review): read_csv applies pandas' default quoting and NA parsing; if these
    # files rely on literal quotes or values such as "NA", the round trip may alter
    # them -- verify against the file format before reusing this elsewhere.
    frames = [pd.read_csv(f, sep='\t') for f in file_list]
    pd.concat(frames).to_csv(output_file, sep='\t', index=False)

In [61]:
# Merge all files matching kgtk-files-nih-V3.0/*tsv into nih-org-kgtk.tsv.
concat_org_files()

kgtk replace-nodes -i nih-org-kgtk.tsv -o nih-org-kgtk-wikidata-qnodes.tsv --mapping-file replace_nodes_mapping.tsv