In [14]:
import glob
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
from tqdm import tqdm
import copy
import shutil
import pickle

In [53]:
es_url = 'http://ckg07:9200'
es_index = 'wikidatadwd-augmented'

# Input Paths

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-train-canonical/
train_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/train-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-dev-canonical/
dev_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/dev-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/ground_truth/Xinting_GT_csv
ground_truth_files = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/GT"

# OUTPUT PATHS
output_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2"
train_output_path = f'{output_path}/train1-output'
dev_output_path = f'{output_path}/dev-output'

# increase version to create a new folder for an experiment
VERSION = "12_0"

train_candidate_path = f'{train_output_path}/{VERSION}/candidates'
train_feature_path = f'{train_output_path}/{VERSION}/features'

dev_candidate_path = f'{dev_output_path}/{VERSION}/candidates'
dev_feature_path = f'{dev_output_path}/{VERSION}/features'
dev_output_predictions = f'{dev_output_path}/{VERSION}/dev_predictions'
dev_predictions_top_k = f'{dev_output_path}/{VERSION}/dev_predictions_top_k'
dev_colorized_path = f'{dev_output_path}/{VERSION}/dev_predictions_colorized'
dev_metrics_path = f'{dev_output_path}/{VERSION}/dev_predictions_metrics'
dev_metrics_col_wise = f'{dev_output_path}/{VERSION}/dev_predictions_column_metrics'

aux_field = 'class_count,property_count,context'


train_prop_count = f'{train_output_path}/{VERSION}/train_prop_count' 
train_class_count = f'{train_output_path}/{VERSION}/train_class_count'
train_context_path = f'{train_output_path}/{VERSION}/train_context'

dev_prop_count = f'{dev_output_path}/{VERSION}/dev_prop_count'
dev_class_count = f'{dev_output_path}/{VERSION}/dev_class_count'
dev_context_path = f'{dev_output_path}/{VERSION}/dev_context'

temp_dir = f'{output_path}/temp'

pos_output = f'{temp_dir}/training_data/pos_features.pkl'
neg_output = f'{temp_dir}/training_data/neg_features.pkl'
min_max_scaler_path = f'{temp_dir}/training_data/gt_normalization_factor.pkl'

final_score_column = 'gt_score'
threshold = final_score_column+":meantop10"

model_save_path = f'{dev_output_path}/{VERSION}/saved_models'
best_model_path = ''

In [54]:
!mkdir -p "$temp_dir"

!mkdir -p "$train_prop_count"
!mkdir -p "$dev_prop_count"
!mkdir -p "$train_class_count"
!mkdir -p "$dev_class_count"
!mkdir -p "$train_context_path"
!mkdir -p "$dev_context_path"

!mkdir -p "$train_candidate_path"
!mkdir -p "$dev_candidate_path"

!mkdir -p "$train_feature_path"
!mkdir -p "$dev_feature_path"

!mkdir -p "$temp_dir/training_data"
!mkdir -p "$dev_output_predictions"
!mkdir -p "$model_save_path"
!mkdir -p "$dev_predictions_top_k"
!mkdir -p "$dev_colorized_path"
!mkdir -p "$dev_metrics_path"
!mkdir -p "$dev_metrics_col_wise"

In [55]:
features = ['pgr_rts','monge_elkan','monge_elkan_aliases','des_cont_jaccard',
            'jaro_winkler','levenshtein','singleton', 'context_score', 'smc_class_score', 'smc_property_score']

## Candidate Generation

In [5]:
def candidate_generation(path, gt_path, output_path, class_count_path, prop_count_path, context_path):
    file_list = glob.glob(path + '/*.csv')
    for i, file in enumerate(file_list):
        st = time.time()
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        gt_file = f"{ground_truth_files}/{filename}"
        output_file = f"{output_path}/{filename}"
        
        !tl clean -c label -o label_clean "$file" / \
        --url $es_url --index $es_index \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        --url $es_url --index $es_index \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        ground-truth-labeler --gt-file "$gt_file" > "$output_file"
        
        for field in aux_field.split(','):
            aux_list = []
            for f in glob.glob(f'{temp_dir}/*{field}.tsv'):
                aux_list.append(pd.read_csv(f, sep='\t', dtype=object))
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            if field == 'class_count':
                class_count_file = f"{class_count_path}/{filename.strip('.csv')}_class_count.tsv"
                aux_df.to_csv(class_count_file, sep='\t', index=False)
            elif field == 'property_count':
                prop_count_file = f"{prop_count_path}/{filename.strip('.csv')}_prop_count.tsv"
                aux_df.to_csv(prop_count_file, sep='\t', index=False)
            elif field == 'context':
                context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
                aux_df.to_csv(context_file, sep='\t', index=False)
        print(time.time() - st)

In [6]:
candidate_generation(train_path, ground_truth_files, train_candidate_path, train_class_count, train_prop_count, train_context_path)

37856682_0_6818907050314633217.csv: 1 of 342
clean Time: 0.03520512580871582s
get-fuzzy-augmented-matches Time: 122.17212414741516s
get-exact-matches Time: 9.87580943107605s
ground-truth-labeler Time: 0.7231040000915527s
141.83829832077026
B8QWQQAB.csv: 2 of 342
clean Time: 0.002103567123413086s
get-fuzzy-augmented-matches Time: 56.31139922142029s
get-exact-matches Time: 5.7275989055633545s
ground-truth-labeler Time: 0.09405112266540527s
66.31150913238525
HIFQAGMX.csv: 3 of 342
clean Time: 0.0027904510498046875s
get-fuzzy-augmented-matches Time: 83.41829252243042s
get-exact-matches Time: 5.695766448974609s
ground-truth-labeler Time: 0.13534283638000488s
93.62711596488953
1LD1MWA8.csv: 4 of 342
clean Time: 0.0037474632263183594s
get-fuzzy-augmented-matches Time: 49.38313817977905s
get-exact-matches Time: 20.301311492919922s
ground-truth-labeler Time: 0.1480875015258789s
74.40536093711853
1XNHBBRZ.csv: 5 of 342
clean Time: 0.003772735595703125s
get-fuzzy-augmented-matches Time: 26.786935

ground-truth-labeler Time: 0.48311519622802734s
92.07979416847229
NUTCUXCN.csv: 38 of 342
clean Time: 0.003473043441772461s
get-fuzzy-augmented-matches Time: 52.087647914886475s
get-exact-matches Time: 6.321840047836304s
ground-truth-labeler Time: 0.10838651657104492s
62.914206743240356
YXYVNO79.csv: 39 of 342
clean Time: 0.001617431640625s
get-fuzzy-augmented-matches Time: 23.022461891174316s
get-exact-matches Time: 1.046865463256836s
ground-truth-labeler Time: 0.04500293731689453s
28.122326135635376
O668CSQ3.csv: 40 of 342
clean Time: 0.002938508987426758s
get-fuzzy-augmented-matches Time: 44.73450708389282s
get-exact-matches Time: 6.484825134277344s
ground-truth-labeler Time: 0.14144372940063477s
55.770992279052734
E0LR4TZL.csv: 41 of 342
clean Time: 0.0024805068969726562s
get-fuzzy-augmented-matches Time: 30.303590059280396s
get-exact-matches Time: 5.867246627807617s
ground-truth-labeler Time: 0.08238673210144043s
40.591859579086304
R4K6322V.csv: 42 of 342
clean Time: 0.00217628479

get-exact-matches Time: 8.34312653541565s
ground-truth-labeler Time: 0.36221814155578613s
73.34976720809937
DBG7O38T.csv: 75 of 342
clean Time: 0.0016009807586669922s
get-fuzzy-augmented-matches Time: 31.391658782958984s
get-exact-matches Time: 6.344956636428833s
ground-truth-labeler Time: 0.05674552917480469s
41.959388732910156
WCUYLCJM.csv: 76 of 342
clean Time: 0.0016536712646484375s
get-fuzzy-augmented-matches Time: 25.951189517974854s
get-exact-matches Time: 5.62328314781189s
ground-truth-labeler Time: 0.03211617469787598s
35.58208513259888
RVL1NJ18.csv: 77 of 342
clean Time: 0.0038688182830810547s
get-fuzzy-augmented-matches Time: 40.590646266937256s
get-exact-matches Time: 6.7525975704193115s
ground-truth-labeler Time: 0.15732550621032715s
52.132763624191284
W67MJHRA.csv: 78 of 342
clean Time: 0.0031363964080810547s
get-fuzzy-augmented-matches Time: 30.470330953598022s
get-exact-matches Time: 8.035136461257935s
ground-truth-labeler Time: 0.09586334228515625s
42.94183683395386
3I

ground-truth-labeler Time: 0.59659743309021s
85.87610864639282
7YVY712V.csv: 111 of 342
clean Time: 0.00229644775390625s
get-fuzzy-augmented-matches Time: 23.27476453781128s
get-exact-matches Time: 6.125195503234863s
ground-truth-labeler Time: 0.08166766166687012s
35.01461386680603
TWXZYBCU.csv: 112 of 342
clean Time: 0.003178119659423828s
get-fuzzy-augmented-matches Time: 18.37284255027771s
get-exact-matches Time: 5.618309020996094s
ground-truth-labeler Time: 0.1199500560760498s
28.700717449188232
C9OCMYGQ.csv: 113 of 342
clean Time: 0.0023956298828125s
get-fuzzy-augmented-matches Time: 10.539973020553589s
get-exact-matches Time: 6.212232351303101s
ground-truth-labeler Time: 0.0759270191192627s
21.013391971588135
384SR1N3.csv: 114 of 342
clean Time: 0.0028963088989257812s
get-fuzzy-augmented-matches Time: 29.54988121986389s
get-exact-matches Time: 6.364333629608154s
ground-truth-labeler Time: 0.07062077522277832s
40.11709117889404
OJ08UX38.csv: 115 of 342
clean Time: 0.003110170364379

clean Time: 0.0016853809356689453s
get-fuzzy-augmented-matches Time: 8.100330591201782s
get-exact-matches Time: 1.7681396007537842s
ground-truth-labeler Time: 0.06084799766540527s
13.984266996383667
YW6QLRK5.csv: 148 of 342
clean Time: 0.0018324851989746094s
get-fuzzy-augmented-matches Time: 8.91373896598816s
get-exact-matches Time: 0.7159581184387207s
ground-truth-labeler Time: 0.0450434684753418s
13.680109977722168
BPI3VNGL.csv: 149 of 342
clean Time: 0.004750251770019531s
get-fuzzy-augmented-matches Time: 20.9000883102417s
get-exact-matches Time: 5.90377950668335s
ground-truth-labeler Time: 0.05553865432739258s
30.944586515426636
91959037_0_7907661684242014480.csv: 150 of 342
clean Time: 0.017433643341064453s
get-fuzzy-augmented-matches Time: 58.361629247665405s
get-exact-matches Time: 13.06949520111084s
ground-truth-labeler Time: 0.603661060333252s
79.35833597183228
9567241_0_5666388268510912770.csv: 151 of 342
clean Time: 0.00530552864074707s
get-fuzzy-augmented-matches Time: 16.7

get-fuzzy-augmented-matches Time: 26.181690454483032s
get-exact-matches Time: 6.113345384597778s
ground-truth-labeler Time: 0.17467284202575684s
37.1001398563385
QCMUCPC1.csv: 184 of 342
clean Time: 0.0025894641876220703s
get-fuzzy-augmented-matches Time: 14.501461744308472s
get-exact-matches Time: 6.350944995880127s
ground-truth-labeler Time: 0.10763931274414062s
25.301499366760254
UJNATN8A.csv: 185 of 342
clean Time: 0.002496957778930664s
get-fuzzy-augmented-matches Time: 12.844167470932007s
get-exact-matches Time: 6.351037502288818s
ground-truth-labeler Time: 0.10976362228393555s
23.702625513076782
29414811_13_8724394428539174350.csv: 186 of 342
clean Time: 0.002164125442504883s
get-fuzzy-augmented-matches Time: 4.672652006149292s
get-exact-matches Time: 1.0240342617034912s
ground-truth-labeler Time: 0.047216176986694336s
9.81840443611145
SMGR9VOV.csv: 187 of 342
clean Time: 0.0033321380615234375s
get-fuzzy-augmented-matches Time: 29.541704893112183s
get-exact-matches Time: 15.04777

20.91012191772461
DKRE7U28.csv: 220 of 342
clean Time: 0.0026361942291259766s
get-fuzzy-augmented-matches Time: 14.452039957046509s
get-exact-matches Time: 1.9488921165466309s
ground-truth-labeler Time: 0.10125303268432617s
20.91125798225403
QAPGT6E5.csv: 221 of 342
clean Time: 0.0028960704803466797s
get-fuzzy-augmented-matches Time: 12.878634214401245s
get-exact-matches Time: 5.911509037017822s
ground-truth-labeler Time: 0.35927653312683105s
23.3819682598114
M0XIN8I8.csv: 222 of 342
clean Time: 0.0024874210357666016s
get-fuzzy-augmented-matches Time: 8.737671375274658s
get-exact-matches Time: 5.940567970275879s
ground-truth-labeler Time: 0.11243796348571777s
19.17186951637268
2389HYHH.csv: 223 of 342
clean Time: 0.003414154052734375s
get-fuzzy-augmented-matches Time: 16.138039112091064s
get-exact-matches Time: 6.430931329727173s
ground-truth-labeler Time: 0.10726356506347656s
26.843656301498413
XZWX8MO4.csv: 224 of 342
clean Time: 0.0015716552734375s
get-fuzzy-augmented-matches Time: 

get-exact-matches Time: 6.362860441207886s
ground-truth-labeler Time: 0.0840303897857666s
21.607245445251465
6VLKFW8J.csv: 257 of 342
clean Time: 0.002388477325439453s
get-fuzzy-augmented-matches Time: 5.9229419231414795s
get-exact-matches Time: 0.5721273422241211s
ground-truth-labeler Time: 0.17957353591918945s
10.662206172943115
0AQOU1Z2.csv: 258 of 342
clean Time: 0.004986286163330078s
get-fuzzy-augmented-matches Time: 8.632534980773926s
get-exact-matches Time: 6.241451263427734s
ground-truth-labeler Time: 0.06235361099243164s
19.10352659225464
8EVYXGLE.csv: 259 of 342
clean Time: 0.0030968189239501953s
get-fuzzy-augmented-matches Time: 14.93391752243042s
get-exact-matches Time: 6.163578271865845s
ground-truth-labeler Time: 0.15067315101623535s
25.95561146736145
BCNHZUB2.csv: 260 of 342
clean Time: 0.002566099166870117s
get-fuzzy-augmented-matches Time: 16.207199335098267s
get-exact-matches Time: 5.870135545730591s
ground-truth-labeler Time: 0.09568929672241211s
26.54399275779724
DV

ground-truth-labeler Time: 0.05984163284301758s
15.809678554534912
U5L8U1OL.csv: 293 of 342
clean Time: 0.003976106643676758s
get-fuzzy-augmented-matches Time: 14.456955671310425s
get-exact-matches Time: 6.736159324645996s
ground-truth-labeler Time: 0.15021538734436035s
25.8457453250885
ZZKCAAOP.csv: 294 of 342
clean Time: 0.009017705917358398s
get-fuzzy-augmented-matches Time: 16.96294856071472s
get-exact-matches Time: 6.015062570571899s
ground-truth-labeler Time: 0.24359774589538574s
28.010817527770996
QH8JJV75.csv: 295 of 342
clean Time: 0.0017042160034179688s
get-fuzzy-augmented-matches Time: 11.356359004974365s
get-exact-matches Time: 0.6049404144287109s
ground-truth-labeler Time: 0.056589365005493164s
16.172306299209595
77694908_0_6083291340991074532.csv: 296 of 342
clean Time: 0.005019664764404297s
get-fuzzy-augmented-matches Time: 19.02751874923706s
get-exact-matches Time: 7.237406969070435s
ground-truth-labeler Time: 0.19749808311462402s
31.66292691230774
I3MK4TC6.csv: 297 of 

clean Time: 0.0022687911987304688s
get-fuzzy-augmented-matches Time: 10.761326313018799s
get-exact-matches Time: 6.011868000030518s
ground-truth-labeler Time: 0.09258747100830078s
20.92359757423401
DSK6KM67.csv: 330 of 342
clean Time: 0.003546476364135742s
get-fuzzy-augmented-matches Time: 15.244858741760254s
get-exact-matches Time: 1.02199125289917s
ground-truth-labeler Time: 0.10959267616271973s
20.705830335617065
EPFTNCPD.csv: 331 of 342
clean Time: 0.0038459300994873047s
get-fuzzy-augmented-matches Time: 18.125633716583252s
get-exact-matches Time: 6.3903725147247314s
ground-truth-labeler Time: 0.15848565101623535s
29.40982222557068
A3UJTT4U.csv: 332 of 342
clean Time: 0.0032346248626708984s
get-fuzzy-augmented-matches Time: 17.830005645751953s
get-exact-matches Time: 5.62897801399231s
ground-truth-labeler Time: 0.1188514232635498s
28.04587173461914
HFRDW66L.csv: 333 of 342
clean Time: 0.009133338928222656s
get-fuzzy-augmented-matches Time: 7.476991176605225s
get-exact-matches Time:

In [13]:
candidate_generation(dev_path, ground_truth_files, dev_candidate_path, dev_class_count, dev_prop_count, dev_context_path)

BOXTVP7V.csv: 1 of 58
clean Time: 0.0021827220916748047s
get-fuzzy-augmented-matches Time: 10.79483699798584s
get-exact-matches Time: 5.794044256210327s
ground-truth-labeler Time: 0.11544585227966309s
21.14206886291504
E5SHJSQZ.csv: 2 of 58
clean Time: 0.0017838478088378906s
get-fuzzy-augmented-matches Time: 13.324609279632568s
get-exact-matches Time: 0.9668505191802979s
ground-truth-labeler Time: 0.08446455001831055s
18.812026023864746
DBH21J5D.csv: 3 of 58
clean Time: 0.002680063247680664s
get-fuzzy-augmented-matches Time: 8.544639348983765s
get-exact-matches Time: 5.57153844833374s
ground-truth-labeler Time: 0.05738210678100586s
18.52062225341797
84575189_0_6365692015941409487.csv: 4 of 58
clean Time: 0.005482673645019531s
get-fuzzy-augmented-matches Time: 25.751927852630615s
get-exact-matches Time: 6.418263673782349s
ground-truth-labeler Time: 0.20720553398132324s
37.29926371574402
3OX1PGQD.csv: 5 of 58
clean Time: 0.0026772022247314453s
get-fuzzy-augmented-matches Time: 9.57892179

32.11924624443054
OEMDOUBY.csv: 38 of 58
clean Time: 0.0018184185028076172s
get-fuzzy-augmented-matches Time: 10.459358215332031s
get-exact-matches Time: 5.738584995269775s
ground-truth-labeler Time: 0.06346535682678223s
20.997936964035034
14380604_4_3329235705746762392.csv: 39 of 58
clean Time: 0.0015645027160644531s
get-fuzzy-augmented-matches Time: 12.006526231765747s
get-exact-matches Time: 2.005307674407959s
ground-truth-labeler Time: 0.09351229667663574s
18.511366367340088
VEKB4XZC.csv: 40 of 58
clean Time: 0.007445335388183594s
get-fuzzy-augmented-matches Time: 20.063264846801758s
get-exact-matches Time: 7.450686931610107s
ground-truth-labeler Time: 0.2518634796142578s
33.154908418655396
1UEUW7EP.csv: 41 of 58
clean Time: 0.0029883384704589844s
get-fuzzy-augmented-matches Time: 10.571527004241943s
get-exact-matches Time: 1.0868043899536133s
ground-truth-labeler Time: 0.32866525650024414s
16.763110160827637
14067031_0_559833072073397908.csv: 42 of 58
clean Time: 0.012163639068603

## Feature Generation

In [5]:
def feature_generation(candidate_dir, class_count_dir, property_count_dir, context_path, output_path):
    file_list = glob.glob(candidate_dir + '/*.csv')
    for i, file in enumerate(file_list):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        class_count_file = f"{class_count_dir}/{filename.strip('.csv')}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename.strip('.csv')}_prop_count.tsv"
        context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
        output_file = f"{output_path}/{filename}"
        !time tl string-similarity "$file" -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaro_winkler -o jaro_winkler \
            / string-similarity -i --method levenshtein -o levenshtein \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / create-singleton-feature -o singleton \
            / context-match \
            --context-file "$context_file" \
            -o context_score \
            / pgt-semantic-tf-idf \
            -o smc_class_score \
            --pagerank-column pagerank \
            --retrieval-score-column retrieval_score \
            --feature-file "$class_count_file" \
            --feature-name class_count \
            / pgt-semantic-tf-idf \
            -o smc_property_score \
            --pagerank-column pagerank \
            --retrieval-score-column retrieval_score \
            --feature-file "$property_count_file" \
            --feature-name property_count \
            > $output_file

In [15]:
feature_generation(train_candidate_path, train_class_count, train_prop_count, train_context_path, train_feature_path)

37856682_0_6818907050314633217.csv: 1 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 18.74190902709961s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 29.70429229736328s
string-similarity-['jaro_winkler'] Time: 3.8532135486602783s
string-similarity-['levenshtein'] Time: 21.455522060394287s
string-similarity-['jaccard:tokenizer=word'] Time: 0.3229188919067383s
create-singleton-feature Time: 1.2193577289581299s
context-match Time: 377.80142068862915s
pgt-semantic-tf-idf-class_count Time: 464.7099299430847s
pgt-semantic-tf-idf-property_count Time: 466.881630897522s

real	7m54.564s
user	9m44.752s
sys	0m3.718s
B8QWQQAB.csv: 2 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7267422676086426s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7468051910400391s
string-similarity-['jaro_winkler'] Time: 0.26842784881591797s
string-similarity-['levenshtein'] Time: 0.8882400989532471s
string-similarity-['jaccard:t

pgt-semantic-tf-idf-property_count Time: 67.36579704284668s

real	1m13.575s
user	2m58.272s
sys	0m2.842s
6NO3AH02.csv: 14 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7756280899047852s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7893507480621338s
string-similarity-['jaro_winkler'] Time: 0.18189048767089844s
string-similarity-['levenshtein'] Time: 0.5845987796783447s
string-similarity-['jaccard:tokenizer=word'] Time: 0.0960087776184082s
create-singleton-feature Time: 0.12553644180297852s
context-match Time: 31.580562591552734s
pgt-semantic-tf-idf-class_count Time: 37.70407319068909s
pgt-semantic-tf-idf-property_count Time: 38.53632974624634s

real	0m44.275s
user	1m15.567s
sys	0m2.150s
FVFG3EJ2.csv: 15 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.2406318187713623s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.4232456684112549s
string-similarity-['jaro_winkler'] Time: 0.08282756805419922s


context-match Time: 4.51966667175293s
pgt-semantic-tf-idf-class_count Time: 25.122530937194824s
pgt-semantic-tf-idf-property_count Time: 25.070634841918945s

real	0m31.175s
user	0m49.195s
sys	0m1.964s
EL9S7KDR.csv: 27 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.1940972805023193s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.866961240768433s
string-similarity-['jaro_winkler'] Time: 0.4433436393737793s
string-similarity-['levenshtein'] Time: 3.8539044857025146s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06305980682373047s
create-singleton-feature Time: 0.12804198265075684s
context-match Time: 71.09407234191895s
pgt-semantic-tf-idf-class_count Time: 93.04676985740662s
pgt-semantic-tf-idf-property_count Time: 93.9788293838501s

real	1m40.165s
user	4m38.406s
sys	0m3.496s
HTUXRVUC.csv: 28 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 5.124638080596924s
string-similarity-['symmetric_monge_elkan:tokenizer=wo

string-similarity-['jaccard:tokenizer=word'] Time: 0.023972511291503906s
create-singleton-feature Time: 0.053076982498168945s
context-match Time: 4.705904006958008s
pgt-semantic-tf-idf-class_count Time: 10.392773389816284s
pgt-semantic-tf-idf-property_count Time: 10.576191425323486s

real	0m16.775s
user	0m34.757s
sys	0m1.917s
O668CSQ3.csv: 40 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.8688127994537354s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 5.3239569664001465s
string-similarity-['jaro_winkler'] Time: 0.5056025981903076s
string-similarity-['levenshtein'] Time: 3.227963924407959s
string-similarity-['jaccard:tokenizer=word'] Time: 0.08940482139587402s
create-singleton-feature Time: 0.20178914070129395s
context-match Time: 54.57637429237366s
pgt-semantic-tf-idf-class_count Time: 71.53738355636597s
pgt-semantic-tf-idf-property_count Time: 72.50505256652832s

real	1m18.311s
user	3m8.745s
sys	0m3.021s
E0LR4TZL.csv: 41 of 342
string-simi

string-similarity-['levenshtein'] Time: 8.11410927772522s
string-similarity-['jaccard:tokenizer=word'] Time: 0.1172335147857666s
create-singleton-feature Time: 0.19588518142700195s
context-match Time: 132.6273910999298s
pgt-semantic-tf-idf-class_count Time: 166.61294031143188s
pgt-semantic-tf-idf-property_count Time: 167.67595863342285s

real	2m53.338s
user	7m35.468s
sys	0m4.621s
21245481_0_8730460088443117515.csv: 53 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.046582221984863s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 12.36160659790039s
string-similarity-['jaro_winkler'] Time: 1.170417308807373s
string-similarity-['levenshtein'] Time: 6.679468393325806s
string-similarity-['jaccard:tokenizer=word'] Time: 0.21462035179138184s
create-singleton-feature Time: 0.4894990921020508s
context-match Time: 157.3623607158661s
pgt-semantic-tf-idf-class_count Time: 190.5755100250244s
pgt-semantic-tf-idf-property_count Time: 191.88728976249695s

rea

string-similarity-['jaro_winkler'] Time: 0.34464406967163086s
string-similarity-['levenshtein'] Time: 1.6878294944763184s
string-similarity-['jaccard:tokenizer=word'] Time: 0.11107134819030762s
create-singleton-feature Time: 0.16151952743530273s
context-match Time: 49.89823532104492s
pgt-semantic-tf-idf-class_count Time: 61.5256462097168s
pgt-semantic-tf-idf-property_count Time: 61.99608564376831s

real	1m9.061s
user	2m19.454s
sys	0m2.611s
EXUMDXPS.csv: 66 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.8777565956115723s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.365236282348633s
string-similarity-['jaro_winkler'] Time: 0.5816066265106201s
string-similarity-['levenshtein'] Time: 3.199824571609497s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06521368026733398s
create-singleton-feature Time: 0.13793706893920898s
context-match Time: 32.49871230125427s
pgt-semantic-tf-idf-class_count Time: 46.39902400970459s
pgt-semantic-tf-idf-pro

string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.490307331085205s
string-similarity-['jaro_winkler'] Time: 0.39293861389160156s
string-similarity-['levenshtein'] Time: 2.3038814067840576s
string-similarity-['jaccard:tokenizer=word'] Time: 0.07378315925598145s
create-singleton-feature Time: 0.12813735008239746s
context-match Time: 16.209436655044556s
pgt-semantic-tf-idf-class_count Time: 27.699949264526367s
pgt-semantic-tf-idf-property_count Time: 28.095407485961914s

real	0m34.160s
user	0m52.745s
sys	0m1.899s
3IB68W0T.csv: 79 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.5127739906311035s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 5.513079643249512s
string-similarity-['jaro_winkler'] Time: 0.3908219337463379s
string-similarity-['levenshtein'] Time: 1.7127094268798828s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06710338592529297s
create-singleton-feature Time: 0.149766206741333s
context-match Time: 46.40352201


real	3m40.928s
user	11m7.084s
sys	0m5.952s
CCCNRESE.csv: 91 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.3323829174041748s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.42959046363830566s
string-similarity-['jaro_winkler'] Time: 0.09394311904907227s
string-similarity-['levenshtein'] Time: 0.31748247146606445s
string-similarity-['jaccard:tokenizer=word'] Time: 0.04865002632141113s
create-singleton-feature Time: 0.06188511848449707s
context-match Time: 13.116556406021118s
pgt-semantic-tf-idf-class_count Time: 18.39237141609192s
pgt-semantic-tf-idf-property_count Time: 18.400864124298096s

real	0m24.445s
user	0m44.964s
sys	0m1.931s
DPUA686B.csv: 92 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.9324302673339844s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.9117846488952637s
string-similarity-['jaro_winkler'] Time: 0.3033459186553955s
string-similarity-['levenshtein'] Time: 2.198651790618896

context-match Time: 156.00437808036804s
pgt-semantic-tf-idf-class_count Time: 188.27671599388123s
pgt-semantic-tf-idf-property_count Time: 188.92753744125366s

real	3m15.058s
user	9m21.442s
sys	0m5.128s
9IN60Q67.csv: 104 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.8981401920318604s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.8194994926452637s
string-similarity-['jaro_winkler'] Time: 0.19493842124938965s
string-similarity-['levenshtein'] Time: 0.9623541831970215s
string-similarity-['jaccard:tokenizer=word'] Time: 0.04442596435546875s
create-singleton-feature Time: 0.10950875282287598s
context-match Time: 11.895580053329468s
pgt-semantic-tf-idf-class_count Time: 19.95370316505432s
pgt-semantic-tf-idf-property_count Time: 20.32580852508545s

real	0m26.358s
user	0m44.665s
sys	0m1.937s
24X84XDL.csv: 105 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.726620197296143s
string-similarity-['symmetric_monge_elkan:toke

string-similarity-['jaccard:tokenizer=word'] Time: 0.04493832588195801s
create-singleton-feature Time: 0.09569859504699707s
context-match Time: 43.66433620452881s
pgt-semantic-tf-idf-class_count Time: 66.5331027507782s
pgt-semantic-tf-idf-property_count Time: 67.06984758377075s

real	1m13.062s
user	3m11.257s
sys	0m2.782s
UMMA6HQO.csv: 117 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.7110037803649902s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.0703444480896s
string-similarity-['jaro_winkler'] Time: 0.5635697841644287s
string-similarity-['levenshtein'] Time: 3.210825204849243s
string-similarity-['jaccard:tokenizer=word'] Time: 0.11823511123657227s
create-singleton-feature Time: 0.15955805778503418s
context-match Time: 67.87682867050171s
pgt-semantic-tf-idf-class_count Time: 85.08775854110718s
pgt-semantic-tf-idf-property_count Time: 85.88330507278442s

real	1m31.756s
user	3m40.090s
sys	0m3.071s
33401079_0_9127583903019856402.csv: 118 o

string-similarity-['jaro_winkler'] Time: 0.09591412544250488s
string-similarity-['levenshtein'] Time: 0.4578685760498047s
string-similarity-['jaccard:tokenizer=word'] Time: 0.030853271484375s
create-singleton-feature Time: 0.055023908615112305s
context-match Time: 7.0944459438323975s
pgt-semantic-tf-idf-class_count Time: 12.47563910484314s
pgt-semantic-tf-idf-property_count Time: 12.484347343444824s

real	0m18.622s
user	0m37.296s
sys	0m2.197s
NUMBFEKZ.csv: 130 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.1813015937805176s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.670512676239014s
string-similarity-['jaro_winkler'] Time: 0.5312025547027588s
string-similarity-['levenshtein'] Time: 2.4209787845611572s
string-similarity-['jaccard:tokenizer=word'] Time: 0.21931171417236328s
create-singleton-feature Time: 0.24759578704833984s
context-match Time: 174.16347289085388s
pgt-semantic-tf-idf-class_count Time: 193.0566999912262s
pgt-semantic-tf-i

string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.0900983810424805s
string-similarity-['jaro_winkler'] Time: 0.12053155899047852s
string-similarity-['levenshtein'] Time: 0.565626859664917s
string-similarity-['jaccard:tokenizer=word'] Time: 0.024423599243164062s
create-singleton-feature Time: 0.05257868766784668s
context-match Time: 4.6994469165802s
pgt-semantic-tf-idf-class_count Time: 10.902629852294922s
pgt-semantic-tf-idf-property_count Time: 10.930433511734009s

real	0m16.977s
user	0m35.015s
sys	0m1.865s
TT9ZEHWW.csv: 143 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.0651705265045166s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.395673990249634s
string-similarity-['jaro_winkler'] Time: 0.7818784713745117s
string-similarity-['levenshtein'] Time: 4.288828134536743s
string-similarity-['jaccard:tokenizer=word'] Time: 0.13373756408691406s
create-singleton-feature Time: 0.22670531272888184s
context-match Time: 94.61302828


real	0m56.509s
user	2m19.665s
sys	0m2.815s
J8SUPBQ6.csv: 155 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.9290852546691895s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.2112162113189697s
string-similarity-['jaro_winkler'] Time: 0.492201566696167s
string-similarity-['levenshtein'] Time: 1.9936394691467285s
string-similarity-['jaccard:tokenizer=word'] Time: 0.1070396900177002s
create-singleton-feature Time: 0.1909644603729248s
context-match Time: 56.13386845588684s
pgt-semantic-tf-idf-class_count Time: 68.41173124313354s
pgt-semantic-tf-idf-property_count Time: 68.8980164527893s

real	1m15.059s
user	2m53.796s
sys	0m2.587s
XGSTK6V5.csv: 156 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.495148181915283s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.987946033477783s
string-similarity-['jaro_winkler'] Time: 0.6151320934295654s
string-similarity-['levenshtein'] Time: 4.628959894180298s
string-

create-singleton-feature Time: 0.03964567184448242s
context-match Time: 5.218155384063721s
pgt-semantic-tf-idf-class_count Time: 9.869882583618164s
pgt-semantic-tf-idf-property_count Time: 10.133336067199707s

real	0m16.109s
user	0m34.385s
sys	0m1.871s
QFDEZDAG.csv: 168 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.31526780128479004s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.4547572135925293s
string-similarity-['jaro_winkler'] Time: 0.09070491790771484s
string-similarity-['levenshtein'] Time: 0.29871368408203125s
string-similarity-['jaccard:tokenizer=word'] Time: 0.026055574417114258s
create-singleton-feature Time: 0.058449745178222656s
context-match Time: 5.152363300323486s
pgt-semantic-tf-idf-class_count Time: 10.082261323928833s
pgt-semantic-tf-idf-property_count Time: 10.301798343658447s

real	0m16.312s
user	0m34.495s
sys	0m1.830s
G2K4GSYB.csv: 169 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.66065478

string-similarity-['levenshtein'] Time: 7.2166972160339355s
string-similarity-['jaccard:tokenizer=word'] Time: 0.0679478645324707s
create-singleton-feature Time: 0.14167523384094238s
context-match Time: 89.1014609336853s
pgt-semantic-tf-idf-class_count Time: 130.0861623287201s
pgt-semantic-tf-idf-property_count Time: 130.22042536735535s

real	2m16.412s
user	6m14.320s
sys	0m3.763s
USXB8M5L.csv: 181 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.5803391933441162s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7130942344665527s
string-similarity-['jaro_winkler'] Time: 0.13817334175109863s
string-similarity-['levenshtein'] Time: 0.50677490234375s
string-similarity-['jaccard:tokenizer=word'] Time: 0.03303384780883789s
create-singleton-feature Time: 0.07482671737670898s
context-match Time: 6.8431737422943115s
pgt-semantic-tf-idf-class_count Time: 12.742563962936401s
pgt-semantic-tf-idf-property_count Time: 12.764857053756714s

real	0m18.836s
use

string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 14.409458875656128s
string-similarity-['jaro_winkler'] Time: 0.8875350952148438s
string-similarity-['levenshtein'] Time: 7.623546123504639s
string-similarity-['jaccard:tokenizer=word'] Time: 0.07515358924865723s
create-singleton-feature Time: 0.16426515579223633s
context-match Time: 56.566773653030396s
pgt-semantic-tf-idf-class_count Time: 90.28965592384338s
pgt-semantic-tf-idf-property_count Time: 90.66830778121948s

real	1m36.834s
user	3m37.447s
sys	0m3.215s
277M2T3K.csv: 194 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7123599052429199s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.9238381385803223s
string-similarity-['jaro_winkler'] Time: 0.1621096134185791s
string-similarity-['levenshtein'] Time: 0.788156270980835s
string-similarity-['jaccard:tokenizer=word'] Time: 0.03382563591003418s
create-singleton-feature Time: 0.05976557731628418s
context-match Time: 5.927788734


real	0m34.902s
user	1m24.366s
sys	0m2.124s
TZ10O44M.csv: 206 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.194180727005005s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.366128206253052s
string-similarity-['jaro_winkler'] Time: 0.4556570053100586s
string-similarity-['levenshtein'] Time: 4.157804250717163s
string-similarity-['jaccard:tokenizer=word'] Time: 0.039243221282958984s
create-singleton-feature Time: 0.09803199768066406s
context-match Time: 29.27287530899048s
pgt-semantic-tf-idf-class_count Time: 47.829997062683105s
pgt-semantic-tf-idf-property_count Time: 47.94389605522156s

real	0m53.919s
user	1m55.882s
sys	0m2.482s
PG0TP6O0.csv: 207 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.925415277481079s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.932405471801758s
string-similarity-['jaro_winkler'] Time: 0.6752026081085205s
string-similarity-['levenshtein'] Time: 4.759991884231567s
stri

pgt-semantic-tf-idf-class_count Time: 45.53165316581726s
pgt-semantic-tf-idf-property_count Time: 45.80756616592407s

real	0m52.040s
user	1m49.565s
sys	0m2.020s
2E9RPOT7.csv: 219 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.4678201675415039s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.20354914665222168s
string-similarity-['jaro_winkler'] Time: 0.11683082580566406s
string-similarity-['levenshtein'] Time: 0.6964602470397949s
string-similarity-['jaccard:tokenizer=word'] Time: 0.0203549861907959s
create-singleton-feature Time: 0.04969072341918945s
context-match Time: 5.579686641693115s
pgt-semantic-tf-idf-class_count Time: 10.584716320037842s
pgt-semantic-tf-idf-property_count Time: 10.818206310272217s

real	0m16.858s
user	0m35.172s
sys	0m1.683s
DKRE7U28.csv: 220 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.722327470779419s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.9058003425598145s
st

string-similarity-['jaccard:tokenizer=word'] Time: 0.29651427268981934s
create-singleton-feature Time: 0.3255300521850586s
context-match Time: 223.49694538116455s
pgt-semantic-tf-idf-class_count Time: 247.48900508880615s
pgt-semantic-tf-idf-property_count Time: 248.0434672832489s

real	4m14.383s
user	11m40.183s
sys	0m5.864s
T3W112BN.csv: 232 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.7221434116363525s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.720227956771851s
string-similarity-['jaro_winkler'] Time: 0.5117058753967285s
string-similarity-['levenshtein'] Time: 3.1475629806518555s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06616735458374023s
create-singleton-feature Time: 0.1511390209197998s
context-match Time: 50.00909495353699s
pgt-semantic-tf-idf-class_count Time: 67.86556124687195s
pgt-semantic-tf-idf-property_count Time: 68.49480295181274s

real	1m14.434s
user	3m4.185s
sys	0m2.997s
YLD9Y1VG.csv: 233 of 342
string-simil

string-similarity-['jaro_winkler'] Time: 0.43154215812683105s
string-similarity-['levenshtein'] Time: 3.1618552207946777s
string-similarity-['jaccard:tokenizer=word'] Time: 0.06303048133850098s
create-singleton-feature Time: 0.13531827926635742s
context-match Time: 34.12195444107056s
pgt-semantic-tf-idf-class_count Time: 48.0223388671875s
pgt-semantic-tf-idf-property_count Time: 48.65214681625366s

real	0m54.517s
user	2m3.900s
sys	0m2.735s
IUBTQXYO.csv: 245 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.5709969997406006s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.2550642490386963s
string-similarity-['jaro_winkler'] Time: 0.12610912322998047s
string-similarity-['levenshtein'] Time: 0.6801192760467529s
string-similarity-['jaccard:tokenizer=word'] Time: 0.025798559188842773s
create-singleton-feature Time: 0.05715608596801758s
context-match Time: 4.243163347244263s
pgt-semantic-tf-idf-class_count Time: 11.155981063842773s
pgt-semantic-tf-i

string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.8481433391571045s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.618717670440674s
string-similarity-['jaro_winkler'] Time: 0.49963974952697754s
string-similarity-['levenshtein'] Time: 6.696329355239868s
string-similarity-['jaccard:tokenizer=word'] Time: 0.04571723937988281s
create-singleton-feature Time: 0.05149960517883301s
context-match Time: 6.179361820220947s
pgt-semantic-tf-idf-class_count Time: 25.66747212409973s
pgt-semantic-tf-idf-property_count Time: 25.958947896957397s

real	0m31.994s
user	0m50.472s
sys	0m1.786s
0AQOU1Z2.csv: 258 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.5469012260437012s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.778803825378418s
string-similarity-['jaro_winkler'] Time: 0.10634994506835938s
string-similarity-['levenshtein'] Time: 0.46341395378112793s
string-similarity-['jaccard:tokenizer=word'] Time: 0.026525020599365

pgt-semantic-tf-idf-property_count Time: 14.805718183517456s

real	0m20.873s
user	0m42.354s
sys	0m1.888s
JJE5Y3ME.csv: 270 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.290430784225464s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.394907474517822s
string-similarity-['jaro_winkler'] Time: 0.6033909320831299s
string-similarity-['levenshtein'] Time: 4.134339094161987s
string-similarity-['jaccard:tokenizer=word'] Time: 0.059015750885009766s
create-singleton-feature Time: 0.1392374038696289s
context-match Time: 45.30470585823059s
pgt-semantic-tf-idf-class_count Time: 64.10933804512024s
pgt-semantic-tf-idf-property_count Time: 64.74096131324768s

real	1m10.733s
user	2m58.184s
sys	0m2.874s
JUGX6J1F.csv: 271 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.43866968154907227s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.86444091796875s
string-similarity-['jaro_winkler'] Time: 0.12137961387634277s
st

string-similarity-['jaccard:tokenizer=word'] Time: 0.024585962295532227s
create-singleton-feature Time: 0.054572343826293945s
context-match Time: 4.645577907562256s
pgt-semantic-tf-idf-class_count Time: 13.27946138381958s
pgt-semantic-tf-idf-property_count Time: 13.6292724609375s

real	0m19.591s
user	0m37.678s
sys	0m1.778s
IL8WDV2Y.csv: 283 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.8875880241394043s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.617981433868408s
string-similarity-['jaro_winkler'] Time: 0.4295642375946045s
string-similarity-['levenshtein'] Time: 2.0573229789733887s
string-similarity-['jaccard:tokenizer=word'] Time: 0.10340189933776855s
create-singleton-feature Time: 0.14741206169128418s
context-match Time: 34.93084955215454s
pgt-semantic-tf-idf-class_count Time: 47.50832653045654s
pgt-semantic-tf-idf-property_count Time: 47.8423855304718s

real	0m53.915s
user	1m46.537s
sys	0m2.369s
PZH1SMYZ.csv: 284 of 342
string-simil

string-similarity-['jaro_winkler'] Time: 0.10274481773376465s
string-similarity-['levenshtein'] Time: 0.3954904079437256s
string-similarity-['jaccard:tokenizer=word'] Time: 0.025585412979125977s
create-singleton-feature Time: 0.05592679977416992s
context-match Time: 5.267137765884399s
pgt-semantic-tf-idf-class_count Time: 10.54161548614502s
pgt-semantic-tf-idf-property_count Time: 10.657447814941406s

real	0m16.778s
user	0m34.814s
sys	0m1.913s
77694908_0_6083291340991074532.csv: 296 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.186017274856567s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 11.351802349090576s
string-similarity-['jaro_winkler'] Time: 0.7772042751312256s
string-similarity-['levenshtein'] Time: 4.335475206375122s
string-similarity-['jaccard:tokenizer=word'] Time: 0.1970536708831787s
create-singleton-feature Time: 0.2700357437133789s
context-match Time: 82.4577705860138s
pgt-semantic-tf-idf-class_count Time: 108.97961711883545

string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.7303154468536377s
string-similarity-['jaro_winkler'] Time: 0.2250819206237793s
string-similarity-['levenshtein'] Time: 0.9115884304046631s
string-similarity-['jaccard:tokenizer=word'] Time: 0.04660439491271973s
create-singleton-feature Time: 0.10689759254455566s
context-match Time: 12.285377979278564s
pgt-semantic-tf-idf-class_count Time: 20.02699065208435s
pgt-semantic-tf-idf-property_count Time: 20.28623104095459s

real	0m26.505s
user	0m52.631s
sys	0m2.159s
RA12VETD.csv: 309 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.36510753631591797s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.2711408138275146s
string-similarity-['jaro_winkler'] Time: 0.10414671897888184s
string-similarity-['levenshtein'] Time: 0.3915283679962158s
string-similarity-['jaccard:tokenizer=word'] Time: 0.028743743896484375s
create-singleton-feature Time: 0.06455230712890625s
context-match Time: 5.9199


real	0m18.378s
user	0m38.369s
sys	0m1.860s
ZT4Q61TK.csv: 321 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.734029531478882s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.313394069671631s
string-similarity-['jaro_winkler'] Time: 0.6505756378173828s
string-similarity-['levenshtein'] Time: 4.937998056411743s
string-similarity-['jaccard:tokenizer=word'] Time: 0.0681619644165039s
create-singleton-feature Time: 0.1547868251800537s
context-match Time: 49.72838068008423s
pgt-semantic-tf-idf-class_count Time: 65.87399792671204s
pgt-semantic-tf-idf-property_count Time: 66.37882399559021s

real	1m12.389s
user	3m0.810s
sys	0m3.085s
OJXA73X7.csv: 322 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.752303600311279s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.767775774002075s
string-similarity-['jaro_winkler'] Time: 0.7094571590423584s
string-similarity-['levenshtein'] Time: 8.238847494125366s
string-si

pgt-semantic-tf-idf-class_count Time: 76.03556656837463s
pgt-semantic-tf-idf-property_count Time: 75.90059423446655s

real	1m21.995s
user	4m7.619s
sys	0m3.150s
J4XOF8WJ.csv: 334 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.384493350982666s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7615349292755127s
string-similarity-['jaro_winkler'] Time: 0.11473870277404785s
string-similarity-['levenshtein'] Time: 0.4104905128479004s
string-similarity-['jaccard:tokenizer=word'] Time: 0.030095338821411133s
create-singleton-feature Time: 0.06683969497680664s
context-match Time: 5.8975207805633545s
pgt-semantic-tf-idf-class_count Time: 11.292962312698364s
pgt-semantic-tf-idf-property_count Time: 11.79198169708252s

real	0m17.675s
user	0m36.031s
sys	0m1.746s
P11KZF71.csv: 335 of 342
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 5.4409143924713135s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.22904109954834s
stri

In [6]:
feature_generation(dev_candidate_path, dev_class_count, dev_prop_count, dev_context_path, dev_feature_path)

BOXTVP7V.csv: 1 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.8111116886138916s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.8052897453308105s
string-similarity-['jaro_winkler'] Time: 0.4336521625518799s
string-similarity-['levenshtein'] Time: 2.0259130001068115s
string-similarity-['jaccard:tokenizer=word'] Time: 0.058176517486572266s
create-singleton-feature Time: 0.21676373481750488s
context-match Time: 25.926607847213745s
pgt-semantic-tf-idf-class_count Time: 42.966874837875366s
pgt-semantic-tf-idf-property_count Time: 43.56195569038391s

real	0m52.010s
user	1m43.758s
sys	0m2.959s
E5SHJSQZ.csv: 2 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.5176534652709961s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.33423352241516113s
string-similarity-['jaro_winkler'] Time: 0.13557100296020508s
string-similarity-['levenshtein'] Time: 0.6195898056030273s
string-similarity-['jaccard:tokenizer=word'] 

pgt-semantic-tf-idf-property_count Time: 93.94451951980591s

real	1m39.828s
user	4m26.626s
sys	0m3.558s
FU7P6GOF.csv: 14 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.6970853805542s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.148318529129028s
string-similarity-['jaro_winkler'] Time: 1.5289182662963867s
string-similarity-['levenshtein'] Time: 16.338645458221436s
string-similarity-['jaccard:tokenizer=word'] Time: 0.08259940147399902s
create-singleton-feature Time: 0.18345117568969727s
context-match Time: 91.01616907119751s
pgt-semantic-tf-idf-class_count Time: 135.08351516723633s
pgt-semantic-tf-idf-property_count Time: 135.43852734565735s

real	2m21.661s
user	5m53.943s
sys	0m4.244s
6T4QNE30.csv: 15 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.3362550735473633s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.8301842212677s
string-similarity-['jaro_winkler'] Time: 0.502957820892334s
string-s

pgt-semantic-tf-idf-class_count Time: 14.55730152130127s
pgt-semantic-tf-idf-property_count Time: 14.827415943145752s

real	0m21.144s
user	0m44.550s
sys	0m1.862s
RWEJTWBK.csv: 27 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.4485132694244385s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.20625615119934s
string-similarity-['jaro_winkler'] Time: 0.6433548927307129s
string-similarity-['levenshtein'] Time: 4.065220355987549s
string-similarity-['jaccard:tokenizer=word'] Time: 0.08710408210754395s
create-singleton-feature Time: 0.20815014839172363s
context-match Time: 99.94128012657166s
pgt-semantic-tf-idf-class_count Time: 122.74575400352478s
pgt-semantic-tf-idf-property_count Time: 123.80489778518677s

real	2m10.126s
user	5m44.311s
sys	0m3.772s
34LOX8E9.csv: 28 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.7230961322784424s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.773207902908325s
string-si

create-singleton-feature Time: 0.06643104553222656s
context-match Time: 23.976974725723267s
pgt-semantic-tf-idf-class_count Time: 31.33631443977356s
pgt-semantic-tf-idf-property_count Time: 31.608833074569702s

real	0m38.118s
user	1m15.054s
sys	0m2.111s
VEKB4XZC.csv: 40 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.098602294921875s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.724795341491699s
string-similarity-['jaro_winkler'] Time: 0.7167987823486328s
string-similarity-['levenshtein'] Time: 4.358417272567749s
string-similarity-['jaccard:tokenizer=word'] Time: 0.2196791172027588s
create-singleton-feature Time: 0.27959489822387695s
context-match Time: 244.242915391922s
pgt-semantic-tf-idf-class_count Time: 267.43952655792236s
pgt-semantic-tf-idf-property_count Time: 268.3101592063904s

real	4m35.218s
user	12m42.297s
sys	0m6.676s
1UEUW7EP.csv: 41 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 6.6634368896484375s
str

string-similarity-['levenshtein'] Time: 0.5769243240356445s
string-similarity-['jaccard:tokenizer=word'] Time: 0.04831385612487793s
create-singleton-feature Time: 0.11677360534667969s
context-match Time: 9.131039142608643s
pgt-semantic-tf-idf-class_count Time: 15.815504312515259s
pgt-semantic-tf-idf-property_count Time: 15.891691207885742s

real	0m22.231s
user	0m40.658s
sys	0m1.828s
EJMFROMS.csv: 53 of 58
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.1703214645385742s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 1.7192113399505615s
string-similarity-['jaro_winkler'] Time: 0.17856955528259277s
string-similarity-['levenshtein'] Time: 1.2533912658691406s
string-similarity-['jaccard:tokenizer=word'] Time: 0.02434849739074707s
create-singleton-feature Time: 0.051328420639038086s
context-match Time: 4.49133563041687s
pgt-semantic-tf-idf-class_count Time: 12.883211135864258s
pgt-semantic-tf-idf-property_count Time: 12.665292739868164s

real	0m18.739s
u

## Generate training data

In [17]:
def merge_files(args):
    datapath = args.train_path
    eval_file_names = []
    for (dirpath, dirnames, filenames) in os.walk(datapath):
        for fn in filenames:
            if "csv" not in fn:
                continue
            abs_fn = f"{dirpath}/{fn}"
            assert os.path.isfile(abs_fn)
            if os.path.getsize(abs_fn) == 0:
                continue
            eval_file_names.append(abs_fn)
    df_list = []
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df_list.append(df)
    return pd.concat(df_list) 

def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    pickle.dump(scaler, open(min_max_scaler_path, 'wb'))
    return scaler

def generate_train_data(args):
    scaler_path = args.min_max_scaler_path
    scaler = pickle.load(open(scaler_path, 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['evaluation_label']
    print(sfeatures)
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    for i,file in enumerate(glob.glob(args.train_path + '/*.csv')):
        file_name = file.split('/')[-1]
        print(file_name)
        if os.path.getsize(file) == 0:
                continue
        d_sample = pd.read_csv(file)
        d_sample = d_sample[(d_sample["smc_class_score"]>0)].reset_index(drop=True)
        grouped_obj = d_sample.groupby(['column', 'row'])
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            pos_features = []
            neg_features = []
            a = cell[1][cell[1]['evaluation_label'] == 1]
            if a.empty:
                continue
            pos_rows = cell[1][cell[1]['evaluation_label'] == 1][features].to_numpy()
            for i in range(len(pos_rows)):
                pos_features.append(pos_rows[i])
            neg_rows = cell[1][cell[1]['evaluation_label'] == -1][features].to_numpy()
            for i in range(len(neg_rows)):
                neg_features.append(neg_rows[i])
            random.shuffle(pos_features)
            random.shuffle(pos_features)
            positive_features_final.append(pos_features)
            negative_features_final.append(neg_features)
            
    print(len(positive_features_final), len(positive_features_final[0]))
    print(len(negative_features_final), len(negative_features_final[0]))
    pickle.dump(positive_features_final,open(args.pos_output,'wb'))
    pickle.dump(negative_features_final,open(args.neg_output,'wb'))


In [18]:
gen_training_data_args = Namespace(train_path=train_feature_path, pos_output=pos_output, neg_output=neg_output, 
                 min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
generate_train_data(gen_training_data_args)

['pgr_rts', 'monge_elkan', 'monge_elkan_aliases', 'des_cont_jaccard', 'jaro_winkler', 'levenshtein', 'singleton', 'context_score', 'smc_class_score', 'smc_property_score', 'evaluation_label']
37856682_0_6818907050314633217.csv
B8QWQQAB.csv
HIFQAGMX.csv
1LD1MWA8.csv
1XNHBBRZ.csv
38428277_0_1311643810102462607.csv
1KJ39NFE.csv
HFNU4Y9W.csv
MZNZLWYW.csv
K2V1VODK.csv
E5DKRW4W.csv
YCXXPVD2.csv
ZX8GERJC.csv
6NO3AH02.csv
FVFG3EJ2.csv
26XDNAJB.csv
NPGBDBM4.csv
ERPSWFMM.csv
ZDAZ5PQ5.csv
2LM6W2JV.csv
0KL64BZL.csv
2JN1R1VW.csv
X0TEEJCK.csv
UL2BYXAR.csv
QTYEU8F5.csv
9XF3SP0B.csv
EL9S7KDR.csv
HTUXRVUC.csv
29414811_12_251152470253168163.csv
F487BS0V.csv
8DOZTMTY.csv
XSUGP66N.csv
6D4OURQN.csv
J3P3ZJZ0.csv
BLUL2XZW.csv
64ZFZ4K2.csv
9834884_0_3871985887467090123.csv
NUTCUXCN.csv
YXYVNO79.csv
O668CSQ3.csv
E0LR4TZL.csv
R4K6322V.csv
L2WQ1RA3.csv
BBN3425A.csv
FCG0YNIZ.csv
2IEVUWPV.csv
6FGUGZF9.csv
57681CMM.csv
CLW28GXT.csv
0ZH7HCT0.csv
5INQ2HVE.csv
L94PAXBK.csv
21245481_0_8730460088443117515.csv
3QIWU8Z7.c

## Model definition

In [10]:
# Dataset
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features
    
    def __len__(self):
        return len(self.pos_features)
    
    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]

# Model
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        #original 10x20, 20x10, 10x10, 10x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc2 = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)
    
    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))
        
        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))
        
        return pos_out, neg_out
    
    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out

# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0
    
    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss

## Training

In [11]:
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))

    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))

    train_dataset = T2DV2Dataset(pos_features_flatten, neg_features_flatten)
    train_dataloader = DataLoader(train_dataset, batch_size=64)
    return train_dataloader

def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model):
    scaler = pickle.load(open(min_max_scaler_path, 'rb'))
    normalize_features = features
    for file in glob.glob(input_table_path + '/*.csv'):
        file_name = file.split('/')[-1]
        if os.path.getsize(file) == 0:
                continue
                
        print(file_name)
        d_sample = pd.read_csv(file)
#         d_sample = d_sample[d_sample["smc_class_score"]>0]
        grouped_obj = d_sample.groupby(['column', 'row'])
        new_df_list = []
        pred = []
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            sorted_df = cell[1].sort_values('smc_class_score',ascending=False)
            sorted_df_features = sorted_df[normalize_features]
            new_df_list.append(sorted_df)
            arr = sorted_df_features.to_numpy()
            test_inp = []
            for a in arr:
                test_inp.append(a)
            test_tensor = torch.tensor(test_inp).float()
            scores = model.predict(test_tensor)
            scores_list = torch.squeeze(scores).tolist()
            if not type(scores_list) is list:
                pred.append(scores_list)
            else:
                pred.extend(scores_list)
        test_df = pd.concat(new_df_list)
        test_df[final_score_column] = pred
        test_df.to_csv(f"{output_table_path}/{file_name}", index=False)

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    
    else:
        device = torch.device('cpu')
    train_dataloader = generate_dataloader(args.positive_feat_path, args.negative_feat_path)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(features)).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid, batch in tqdm(enumerate(train_dataloader), position=0, leave=True):
            positive_feat = torch.tensor(batch[0].float())
            negative_feat = torch.tensor(batch[1].float())
            optimizer.zero_grad()
            pos_out, neg_out = model(positive_feat, negative_feat)
            loss = criterion(pos_out, neg_out)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid

        # Evaluation
        model.eval()
        infer_scores(args.min_max_scaler_path, args.dev_path, args.dev_output, model)
        eval_data = merge_eval_files(args.dev_output)
        res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        top1_precision = res['num_tasks_with_model_score_top_one_accurate']/res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            top1_max_prec = top1_precision
            model_save_name = 'epoch_{}_loss_{}_top1_{}.pth'.format(epoch, avg_loss, top1_max_prec)
            best_model_path = os.path.join(args.model_save_path, model_save_name)
            torch.save(model.state_dict(), best_model_path)
        
        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision, top1_max_prec))
    return best_model_path

In [12]:
def merge_eval_files(final_score_path):
    eval_file_names = []
    df_list = []
    for (dirpath, dirnames, filenames) in os.walk(final_score_path):
        for fn in filenames:
            if "csv" not in fn:
                continue
            abs_fn = os.path.join(dirpath, fn)
            assert os.path.isfile(abs_fn)
            if os.path.getsize(abs_fn) == 0:
                continue
            eval_file_names.append(abs_fn)
    
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df_list.append(df)
    return pd.concat(df_list)

def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    for i, row in candidate_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)
                    
        #rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_model_score_top_five_accurate.append(1)
        else:
            num_tasks_with_model_score_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_model_score_top_ten_accurate.append(1)
        else:
            num_tasks_with_model_score_top_ten_accurate.append(0)
            
    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    res['num_tasks_with_model_score_top_five_accurate'] = sum(num_tasks_with_model_score_top_five_accurate)
    res['num_tasks_with_model_score_top_ten_accurate'] = sum(num_tasks_with_model_score_top_ten_accurate)
    return res, candidate_eval_data

In [13]:
training_args = Namespace(num_epochs=20, lr=0.001, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=dev_feature_path, dev_output=dev_output_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)

In [15]:
## Call Training
best_model_path = train(training_args)

  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
295it [00:02, 140.32it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
51it [00:00, 509.40it/s]

Epoch 0, Avg Loss is 0.7028646469116211, epoch top1 0.778207381370826, max top1 0.778207381370826


295it [00:00, 596.14it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
59it [00:00, 582.43it/s]

Epoch 1, Avg Loss is 0.22091759741306305, epoch top1 0.7964850615114235, max top1 0.7964850615114235


295it [00:00, 630.17it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
67it [00:00, 662.20it/s]

Epoch 2, Avg Loss is 0.17433594167232513, epoch top1 0.8017574692442883, max top1 0.8017574692442883


295it [00:00, 646.46it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
63it [00:00, 624.88it/s]

Epoch 3, Avg Loss is 0.1588694304227829, epoch top1 0.809841827768014, max top1 0.809841827768014


295it [00:00, 636.08it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
66it [00:00, 657.17it/s]

Epoch 4, Avg Loss is 0.15033908188343048, epoch top1 0.81195079086116, max top1 0.81195079086116


295it [00:00, 645.22it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
65it [00:00, 640.79it/s]

Epoch 5, Avg Loss is 0.14332109689712524, epoch top1 0.8084358523725835, max top1 0.81195079086116


295it [00:00, 636.16it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
63it [00:00, 621.87it/s]

Epoch 6, Avg Loss is 0.13850349187850952, epoch top1 0.810896309314587, max top1 0.81195079086116


295it [00:00, 631.57it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
64it [00:00, 635.22it/s]

Epoch 7, Avg Loss is 0.1347717046737671, epoch top1 0.8126537785588752, max top1 0.8126537785588752


295it [00:00, 642.29it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
58it [00:00, 574.06it/s]

Epoch 8, Avg Loss is 0.1309974491596222, epoch top1 0.8133567662565905, max top1 0.8133567662565905


295it [00:00, 588.16it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
68it [00:00, 679.53it/s]

Epoch 9, Avg Loss is 0.12449239939451218, epoch top1 0.8175746924428823, max top1 0.8175746924428823


295it [00:00, 606.56it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
55it [00:00, 542.11it/s]

Epoch 10, Avg Loss is 0.12107826769351959, epoch top1 0.8182776801405975, max top1 0.8182776801405975


295it [00:00, 604.12it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 512.70it/s]

Epoch 11, Avg Loss is 0.11964122205972672, epoch top1 0.8165202108963093, max top1 0.8182776801405975


295it [00:00, 471.26it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
67it [00:00, 669.19it/s]

Epoch 12, Avg Loss is 0.11871244758367538, epoch top1 0.8154657293497364, max top1 0.8182776801405975


295it [00:00, 644.03it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
69it [00:00, 681.52it/s]

Epoch 13, Avg Loss is 0.1180984377861023, epoch top1 0.8147627416520211, max top1 0.8182776801405975


295it [00:00, 683.50it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
71it [00:00, 703.04it/s]

Epoch 14, Avg Loss is 0.11772656440734863, epoch top1 0.8144112478031634, max top1 0.8182776801405975


295it [00:00, 684.15it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
69it [00:00, 689.00it/s]

Epoch 15, Avg Loss is 0.1175118014216423, epoch top1 0.8112478031634447, max top1 0.8182776801405975


295it [00:00, 687.34it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
56it [00:00, 553.62it/s]

Epoch 16, Avg Loss is 0.11731259524822235, epoch top1 0.8063268892794376, max top1 0.8182776801405975


295it [00:00, 606.78it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
62it [00:00, 612.97it/s]

Epoch 17, Avg Loss is 0.1170322373509407, epoch top1 0.7985940246045694, max top1 0.8182776801405975


295it [00:00, 641.71it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
69it [00:00, 687.30it/s]

Epoch 18, Avg Loss is 0.11658038198947906, epoch top1 0.7961335676625659, max top1 0.8182776801405975


295it [00:00, 674.25it/s]


BOXTVP7V.csv
E5SHJSQZ.csv
DBH21J5D.csv
84575189_0_6365692015941409487.csv
3OX1PGQD.csv
4HYT5D2J.csv
28086084_0_3127660530989916727.csv
U4430LA9.csv
50270082_0_444360818941411589.csv
PT0GTLGV.csv
FV3PPNAQ.csv
JYC6D9MU.csv
VADKVBSJ.csv
FU7P6GOF.csv
6T4QNE30.csv
TLAL3B63.csv
VNSUNG1M.csv
FDOC6GMJ.csv
29414811_2_4773219892816395776.csv
CR0Q0GDE.csv
RCL5LZUM.csv
9V2P69CI.csv
RF6RSJ5W.csv
SYRX0I75.csv
54SEC9F3.csv
MBCHQ4TC.csv
RWEJTWBK.csv
34LOX8E9.csv
OYFD9B7F.csv
ZR25NVUN.csv
JTWZYYBU.csv
NE9XVY42.csv
VB0WL533.csv
XXYFPD8I.csv
8N4ZTXDV.csv
YV0V8O3A.csv
39759273_0_1427898308030295194.csv
OEMDOUBY.csv
14380604_4_3329235705746762392.csv
VEKB4XZC.csv
1UEUW7EP.csv
14067031_0_559833072073397908.csv
093BPOP2.csv
J5WTHYK6.csv
IYEDUUIU.csv
JAV53EZQ.csv
DHDSWRU2.csv
RPS3P53T.csv
U7PSL9LZ.csv
45073662_0_3179937335063201739.csv
DKR353LM.csv
KL3RUA2V.csv
EJMFROMS.csv
292E016E.csv
V1MLK9TP.csv
NBYU3S9Y.csv
52299421_0_4473286348258170200.csv
PMSAYLPC.csv
Epoch 19, Avg Loss is 0.11605416983366013, epoch t

In [16]:
best_model_path

'/home/sriamazingram/USC/Others/ISI/data/t2dv2/dev-output/12_0/saved_models/epoch_10_loss_0.12107826769351959_top1_0.8182776801405975.pth'

## Dev Prediction

In [27]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k=5):
    for file in glob.glob(dev_feature_path + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        feature_str =  ",".join(features)
        if os.path.getsize(file) == 0:
            continue
        # location where the output generated by the predictions wil be stored.
        dev_output = f"{dev_predictions_top_k}/{filename}"
        !tl predict-using-model $file -o $output_column \
            --features $feature_str \
            --ranking-model $saved_model \
            --normalization-factor $min_max_scaler_path \
            / create-pseudo-gt \
            --column-thresholds $threshold \
            --filter smc_class_score:0 \
            / get-kg-links -c $output_column -k $k --k-rows \
            > $dev_output

In [28]:
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename.strip('.csv')}.xlsx"
        !tl add-color $file -c "$score_column,evaluation_label" -k $k --output $dev_color_file

In [29]:
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=5):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $file -k $k -c pseudo_gt --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
        df = pd.read_csv(file)
        col_df_list = []
        for col, gdf in df.groupby(by=["column"]):
            gdf.to_csv("column_wise_file.csv", sep=",", index=False)
            !tl metrics column_wise_file.csv -k $k -c pseudo_gt --tag $filename > column_wise_output.csv
            try:
                odf = pd.read_csv("column_wise_output.csv")
            except pd.errors.EmptyDataError:
                odf = pd.DataFrame([{"k":k, "f1": 0.0, "precision": 0.0, "recall": 0.0, "tag": filename}])
            !rm -f column_wise_output.csv
            !rm -f column_wise_file.csv
            odf["column"] = col
            col_df_list.append(odf)
        col_df =  pd.concat(col_df_list)
        col_df.to_csv(dev_metrics_col_wise+"/"+filename, sep=",", index=False)
    return pd.concat(df_list)

In [56]:
best_model_path = '/home/sriamazingram/USC/Others/ISI/data/t2dv2/dev-output/12_0/saved_models/epoch_10_loss_0.12107826769351959_top1_0.8182776801405975.pth'
dev_prediction(dev_feature_path, dev_predictions_top_k, best_model_path, final_score_column, min_max_scaler_path, k=1)

BOXTVP7V.csv
predict-using-model Time: 0.5791568756103516s
create-pseudo-gt Time: 0.08759069442749023s
get-kg-links-gt_score Time: 0.30208396911621094s
E5SHJSQZ.csv
predict-using-model Time: 0.43768835067749023s
create-pseudo-gt Time: 0.04762911796569824s
get-kg-links-gt_score Time: 0.1506819725036621s
DBH21J5D.csv
predict-using-model Time: 0.43293190002441406s
create-pseudo-gt Time: 0.046639204025268555s
get-kg-links-gt_score Time: 0.1579608917236328s
84575189_0_6365692015941409487.csv
predict-using-model Time: 0.9185373783111572s
create-pseudo-gt Time: 0.16157245635986328s
get-kg-links-gt_score Time: 0.6543638706207275s
3OX1PGQD.csv
predict-using-model Time: 0.4346468448638916s
create-pseudo-gt Time: 0.046515703201293945s
get-kg-links-gt_score Time: 0.14801955223083496s
4HYT5D2J.csv
predict-using-model Time: 0.541600227355957s
create-pseudo-gt Time: 0.0705723762512207s
get-kg-links-gt_score Time: 0.25599050521850586s
28086084_0_3127660530989916727.csv
predict-using-model Time: 1.7818

292E016E.csv
predict-using-model Time: 0.6849284172058105s
create-pseudo-gt Time: 0.10889577865600586s
get-kg-links-gt_score Time: 0.400195837020874s
V1MLK9TP.csv
predict-using-model Time: 0.7371792793273926s
create-pseudo-gt Time: 0.10777783393859863s
get-kg-links-gt_score Time: 0.39931750297546387s
NBYU3S9Y.csv
predict-using-model Time: 0.564507246017456s
create-pseudo-gt Time: 0.049449920654296875s
get-kg-links-gt_score Time: 0.2140340805053711s
52299421_0_4473286348258170200.csv
predict-using-model Time: 0.9876995086669922s
create-pseudo-gt Time: 0.16230463981628418s
get-kg-links-gt_score Time: 0.7046506404876709s
PMSAYLPC.csv
predict-using-model Time: 0.7086181640625s
create-pseudo-gt Time: 0.11059808731079102s
get-kg-links-gt_score Time: 0.41428184509277344s


In [57]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, final_score_column, k=1)

BOXTVP7V.csv
metrics Time: 0.13967609405517578s
metrics Time: 0.09749889373779297s
metrics Time: 0.09590911865234375s
E5SHJSQZ.csv
metrics Time: 0.07848072052001953s
metrics Time: 0.08321404457092285s
DBH21J5D.csv
metrics Time: 0.07933640480041504s
metrics Time: 0.0778956413269043s
84575189_0_6365692015941409487.csv
metrics Time: 0.44889402389526367s
metrics Time: 0.32321691513061523s
3OX1PGQD.csv
metrics Time: 0.08590865135192871s
metrics Time: 0.0857689380645752s
4HYT5D2J.csv
metrics Time: 0.12857675552368164s
metrics Time: 0.1283116340637207s
28086084_0_3127660530989916727.csv
metrics Time: 0.6862027645111084s
metrics Time: 0.7066755294799805s
U4430LA9.csv
metrics Time: 0.2717759609222412s
metrics Time: 0.08355951309204102s
metrics Time: 0.08433294296264648s
50270082_0_444360818941411589.csv
metrics Time: 0.553708553314209s
metrics Time: 0.5519015789031982s
PT0GTLGV.csv
metrics Time: 0.20294666290283203s
metrics Time: 0.14369845390319824s
metrics Time: 0.08022427558898926s
metrics T

In [58]:
metrics_df

Unnamed: 0,k,f1,precision,recall,tag
0,1,1.0,1.0,1.0,BOXTVP7V.csv
0,1,0.55,0.55,0.55,E5SHJSQZ.csv
0,1,1.0,1.0,1.0,DBH21J5D.csv
0,1,0.857143,0.857143,0.857143,84575189_0_6365692015941409487.csv
0,1,1.0,1.0,1.0,3OX1PGQD.csv
0,1,1.0,1.0,1.0,4HYT5D2J.csv
0,1,0.768182,0.768182,0.768182,28086084_0_3127660530989916727.csv
0,1,0.925,0.925,0.925,U4430LA9.csv
0,1,0.886905,0.886905,0.886905,50270082_0_444360818941411589.csv
0,1,0.95,0.95,0.95,PT0GTLGV.csv


In [59]:
metrics_df['precision'].mean()

0.8392687631571489

In [60]:
add_color(dev_predictions_top_k, dev_colorized_path, final_score_column, k=1)

BOXTVP7V.csv
add-color Time: 0.05705690383911133s
E5SHJSQZ.csv
add-color Time: 0.041837215423583984s
DBH21J5D.csv
add-color Time: 0.037858009338378906s
84575189_0_6365692015941409487.csv
add-color Time: 0.0854647159576416s
3OX1PGQD.csv
add-color Time: 0.03836774826049805s
4HYT5D2J.csv
add-color Time: 0.04429912567138672s
28086084_0_3127660530989916727.csv
add-color Time: 0.1522679328918457s
U4430LA9.csv
add-color Time: 0.04514360427856445s
50270082_0_444360818941411589.csv
add-color Time: 0.1192631721496582s
PT0GTLGV.csv
add-color Time: 0.06360697746276855s
FV3PPNAQ.csv
add-color Time: 0.04279303550720215s
JYC6D9MU.csv
add-color Time: 0.06297707557678223s
VADKVBSJ.csv
add-color Time: 0.06454062461853027s
FU7P6GOF.csv
add-color Time: 0.05101275444030762s
6T4QNE30.csv
add-color Time: 0.06225085258483887s
TLAL3B63.csv
add-color Time: 0.0523371696472168s
VNSUNG1M.csv
add-color Time: 0.049198150634765625s
FDOC6GMJ.csv
add-color Time: 0.0635690689086914s
29414811_2_4773219892816395776.csv
ad

In [61]:
metrics_df.to_csv(f"{dev_metrics_path}/metrics_1.csv", index=False)