In [1]:
import glob
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
from tqdm import tqdm
import copy
import shutil
import pickle

I assume that the candidate generation and feature genration has already be run on the training and dev tables

In [2]:
es_url = 'http://ckg07:9200'
es_index = 'wikidatadwd-augmented'

# Input Paths

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-train-canonical/
train_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/t2dv2-train-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-dev-canonical/
dev_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/t2dv2-dev-canonical"

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/ground_truth/Xinting_GT_csv
ground_truth_files = "/home/sriamazingram/USC/Others/ISI/data/t2dv2/round_1"

# can be downloaded from https://github.com/usc-isi-i2/table-linker-pipelines/blob/main/table-linker-full-pipeline/models/weighted_lr.pkl
classifier_model_path = '/home/sriamazingram/USC/Others/ISI/Repos/table-linker-pipelines/table-linker-full-pipeline/models/weighted_lr.pkl'


# OUTPUT PATHS
output_path = "/home/sriamazingram/USC/Others/ISI/data/t2dv2"
train_output_path = f'{output_path}/train1-output'
dev_output_path = f'{output_path}/dev-output'

# increase version to create a new folder for an experiment
VERSION = "8_1"

train_candidate_path = f'{train_output_path}/{VERSION}/candidates'
train_feature_path = f'{train_output_path}/{VERSION}/features'

dev_candidate_path = f'{dev_output_path}/{VERSION}/candidates'
dev_feature_path = f'{dev_output_path}/{VERSION}/features'
dev_output_predictions = f'{dev_output_path}/{VERSION}/dev_predictions'
dev_predictions_top_k = f'{dev_output_path}/{VERSION}/dev_predictions_top_k'
dev_colorized_path = f'{dev_output_path}/{VERSION}/dev_predictions_colorized'
dev_metrics_path = f'{dev_output_path}/{VERSION}/dev_predictions_metrics'

aux_field = 'graph_embedding_complex,class_count,property_count,context'


train_prop_count = f'{train_output_path}/{VERSION}/train_prop_count' 
train_class_count = f'{train_output_path}/{VERSION}/train_class_count'
train_context_path = f'{train_output_path}/{VERSION}/train_context'
train_graph_embedding = f'{train_output_path}/{VERSION}/train_graph_embedding'

dev_prop_count = f'{dev_output_path}/{VERSION}/dev_prop_count'
dev_class_count = f'{dev_output_path}/{VERSION}/dev_class_count'
dev_context_path = f'{dev_output_path}/{VERSION}/dev_context'
dev_graph_embedding = f'{dev_output_path}/{VERSION}/dev_graph_embedding'

temp_dir = f'{output_path}/temp'

pos_output = f'{temp_dir}/training_data/pos_features.pkl'
neg_output = f'{temp_dir}/training_data/neg_features.pkl'
min_max_scaler_path = f'{temp_dir}/training_data/normalization_factor.pkl'

final_score_column = 'siamese_prediction'

model_save_path = f'{dev_output_path}/{VERSION}/saved_models'
best_model_path = ''

In [3]:
!mkdir -p "$temp_dir"

!mkdir -p "$train_prop_count"
!mkdir -p "$dev_prop_count"
!mkdir -p "$train_class_count"
!mkdir -p "$dev_class_count"
!mkdir -p "$train_graph_embedding"
!mkdir -p "$dev_graph_embedding"
!mkdir -p "$train_context_path"
!mkdir -p "$dev_context_path"

!mkdir -p "$train_candidate_path"
!mkdir -p "$dev_candidate_path"

!mkdir -p "$train_feature_path"
!mkdir -p "$dev_feature_path"

!mkdir -p "$temp_dir/training_data"
!mkdir -p "$dev_output_predictions"
!mkdir -p "$model_save_path"
!mkdir -p "$dev_predictions_top_k"
!mkdir -p "$dev_colorized_path"
!mkdir -p "$dev_metrics_path"

In [4]:
features = ['pagerank','retrieval_score','monge_elkan','monge_elkan_aliases','des_cont_jaccard',
            'jaro_winkler','levenshtein','singleton','num_char','num_tokens',
           'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score',
           'lof-graph-embedding-score', 'lof-reciprocal-rank', 'context_score', 'pseudo_gt']

In [5]:
classifier_features = ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']

## Candidate Generation

In [6]:
def candidate_generation(path, gt_path, output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    file_list = glob.glob(path + '/*.csv')
    for i, file in enumerate(file_list):
        st = time.time()
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        gt_file = f"{ground_truth_files}/{filename}"
        output_file = f"{output_path}/{filename}"
        
        !tl clean -c label -o label_clean "$file" / \
        --url $es_url --index $es_index \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        --url $es_url --index $es_index \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder "$temp_dir" / \
        ground-truth-labeler --gt-file "$gt_file" > "$output_file"
        
        for field in aux_field.split(','):
            aux_list = []
            for f in glob.glob(f'{temp_dir}/*{field}.tsv'):
                aux_list.append(pd.read_csv(f, sep='\t', dtype=object))
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            if field == 'class_count':
                class_count_file = f"{class_count_path}/{filename.strip('.csv')}_class_count.tsv"
                aux_df.to_csv(class_count_file, sep='\t', index=False)
            elif field == 'property_count':
                prop_count_file = f"{prop_count_path}/{filename.strip('.csv')}_prop_count.tsv"
                aux_df.to_csv(prop_count_file, sep='\t', index=False)
            elif field == 'context':
                context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
                aux_df.to_csv(context_file, sep='\t', index=False)
            else:
                graph_embedding_file = f"{graph_embedding}/{filename.strip('.csv')}_graph_embedding_complex.tsv"
                aux_df.to_csv(graph_embedding_file, sep='\t', index=False)
        
        print(time.time() - st)


In [7]:
candidate_generation(train_path, ground_truth_files, train_candidate_path, train_class_count, train_prop_count, train_context_path,train_graph_embedding)

37856682_0_6818907050314633217.csv: 1 of 44
clean Time: 0.01623368263244629s
get-fuzzy-augmented-matches Time: 340.1948094367981s
get-exact-matches Time: 29.81961441040039s
ground-truth-labeler Time: 0.8395054340362549s
381.86977076530457
38428277_0_1311643810102462607.csv: 2 of 44
clean Time: 0.01183176040649414s
get-fuzzy-augmented-matches Time: 51.43580222129822s
get-exact-matches Time: 9.95887541770935s
ground-truth-labeler Time: 0.21856999397277832s
71.64451193809509
29414811_12_251152470253168163.csv: 3 of 44
clean Time: 0.001531362533569336s
get-fuzzy-augmented-matches Time: 15.03918743133545s
get-exact-matches Time: 5.983585834503174s
ground-truth-labeler Time: 0.06758475303649902s
25.41498613357544
9834884_0_3871985887467090123.csv: 4 of 44
clean Time: 0.014010906219482422s
get-fuzzy-augmented-matches Time: 103.59203791618347s
get-exact-matches Time: 17.642162799835205s
ground-truth-labeler Time: 0.4782121181488037s
129.64985156059265
21245481_0_8730460088443117515.csv: 5 of 4

ground-truth-labeler Time: 0.2848701477050781s
35.12664556503296
90196673_0_5458330029110291950.csv: 35 of 44
clean Time: 0.018003225326538086s
get-fuzzy-augmented-matches Time: 77.07895374298096s
get-exact-matches Time: 11.618256568908691s
ground-truth-labeler Time: 0.6195883750915527s
96.99213147163391
39173938_0_7916056990138658530.csv: 36 of 44
clean Time: 0.007390499114990234s
get-fuzzy-augmented-matches Time: 22.474761247634888s
get-exact-matches Time: 6.711711645126343s
ground-truth-labeler Time: 0.20262646675109863s
34.99547553062439
10579449_0_1681126353774891032.csv: 37 of 44
clean Time: 0.0022656917572021484s
get-fuzzy-augmented-matches Time: 7.462184429168701s
get-exact-matches Time: 1.1595327854156494s
ground-truth-labeler Time: 0.0486598014831543s
12.971152067184448
1438042989018_40_20150728002309-00067-ip-10-236-191-2_57714692_2.csv: 38 of 44
clean Time: 0.0018749237060546875s
get-fuzzy-augmented-matches Time: 15.469257354736328s
get-exact-matches Time: 2.642426729202270

In [8]:
candidate_generation(dev_path, ground_truth_files, dev_candidate_path, dev_class_count, dev_prop_count, dev_context_path, dev_graph_embedding)

84575189_0_6365692015941409487.csv: 1 of 9
clean Time: 0.012584447860717773s
get-fuzzy-augmented-matches Time: 26.79003620147705s
get-exact-matches Time: 6.977028846740723s
ground-truth-labeler Time: 0.17999911308288574s
39.51597881317139
28086084_0_3127660530989916727.csv: 2 of 9
clean Time: 0.007283210754394531s
get-fuzzy-augmented-matches Time: 33.79224157333374s
get-exact-matches Time: 7.863823175430298s
ground-truth-labeler Time: 0.39851856231689453s
49.422181844711304
50270082_0_444360818941411589.csv: 3 of 9
clean Time: 0.006052732467651367s
get-fuzzy-augmented-matches Time: 45.74336504936218s
get-exact-matches Time: 9.39232325553894s
ground-truth-labeler Time: 0.48946475982666016s
62.08046865463257
29414811_2_4773219892816395776.csv: 4 of 9
clean Time: 0.002878904342651367s
get-fuzzy-augmented-matches Time: 9.171539068222046s
get-exact-matches Time: 6.042067050933838s
ground-truth-labeler Time: 0.05911731719970703s
20.13024115562439
39759273_0_1427898308030295194.csv: 5 of 9
cl

## Feature Generation

In [9]:
def feature_generation(candidate_dir, embedding_dir, class_count_dir, property_count_dir, context_path, output_path):
    file_list = glob.glob(candidate_dir + '/*.csv')
    for i, file in enumerate(file_list):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        embedding_file = f"{embedding_dir}/{filename.strip('.csv')}_graph_embedding_complex.tsv"
        class_count_file = f"{class_count_dir}/{filename.strip('.csv')}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename.strip('.csv')}_prop_count.tsv"
        context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
        output_file = f"{output_path}/{filename}"
        classifier_features_str = ",".join(classifier_features)
        !time tl align-page-rank $file \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaro_winkler -o jaro_winkler \
            / string-similarity -i --method levenshtein -o levenshtein \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / normalize-scores -c des_cont_jaccard / smallest-qnode-number \
            / mosaic-features -c kg_labels --num-char --num-tokens \
            / create-singleton-feature -o singleton \
            / vote-by-classifier  \
            --prob-threshold 0.995 \
            --features $classifier_features_str \
            --model $classifier_model_path \
            / score-using-embedding \
            --column-vector-strategy centroid-of-lof \
            --lof-strategy ems-mv \
            -o lof-graph-embedding-score \
            --embedding-file $embedding_file \
            / generate-reciprocal-rank  \
            -c lof-graph-embedding-score \
            -o lof-reciprocal-rank \
            / compute-tf-idf  \
            --feature-file $class_count_file \
            --feature-name class_count \
            --singleton-column is_lof \
            -o lof_class_count_tf_idf_score \
            / compute-tf-idf \
            --feature-file $property_count_file \
            --feature-name property_count \
            --singleton-column is_lof \
            -o lof_property_count_tf_idf_score \
            / context-match --context-file $context_file \
            -o context_score \
            / create-pseudo-gt \
            --column-thresholds singleton:1,context_score:0.9\
            > $output_file

In [10]:
feature_generation(train_candidate_path, train_graph_embedding, train_class_count, train_prop_count, train_context_path, train_feature_path)

37856682_0_6818907050314633217.csv: 1 of 44
align-page-rank Time: 3.5839428901672363s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 17.59194803237915s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 20.15993309020996s
string-similarity-['jaro_winkler'] Time: 3.8112339973449707s
string-similarity-['levenshtein'] Time: 20.585536241531372s
string-similarity-['jaccard:tokenizer=word'] Time: 0.31127071380615234s
normalize-scores-des_cont_jaccard Time: 0.18901801109313965s
smallest-qnode-number Time: 3.2835514545440674s
mosaic-features Time: 0.0931549072265625s
creat-singleton-feature Time: 1.0386772155761719s
vote-by-classifier Time: 2.6055452823638916s
Qnodes to lookup: 43381
Qnodes from file: 42173
Outlier removal generates 286 lof-voted candidates
score-using-embedding Time: 99.44141983985901s
generate-reciprocal-rank-lof-graph-embedding-score Time: 2.113983154296875s
compute-tf-idf-class_count Time: 108.45605039596558s
compute-tf-idf-property_count Ti

Qnodes to lookup: 13682
Qnodes from file: 13069
Outlier removal generates 40 lof-voted candidates
score-using-embedding Time: 26.349472761154175s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.6712062358856201s
compute-tf-idf-class_count Time: 30.599947929382324s
compute-tf-idf-property_count Time: 31.911883115768433s
context-match Time: 29.3450345993042s
create-pseudo-gt Time: 0.003375530242919922s

real	1m18.980s
user	1m33.149s
sys	0m3.493s
53989675_0_8697482470743954630.csv: 7 of 44
align-page-rank Time: 0.5907995700836182s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.7221124172210693s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 0.9017333984375s
string-similarity-['jaro_winkler'] Time: 0.14286375045776367s
string-similarity-['levenshtein'] Time: 0.7043249607086182s
string-similarity-['jaccard:tokenizer=word'] Time: 0.05025482177734375s
normalize-scores-des_cont_jaccard Time: 0.017475366592407227s
smallest-qnode-number Time: 0.12

smallest-qnode-number Time: 0.2503468990325928s
mosaic-features Time: 0.010849952697753906s
creat-singleton-feature Time: 0.12829923629760742s
vote-by-classifier Time: 0.35686802864074707s
Qnodes to lookup: 3480
Qnodes from file: 3403
Outlier removal generates 53 lof-voted candidates
score-using-embedding Time: 24.771339178085327s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.29511594772338867s
compute-tf-idf-class_count Time: 25.94375228881836s
compute-tf-idf-property_count Time: 26.50812554359436s
context-match Time: 60.51843237876892s
create-pseudo-gt Time: 0.003864765167236328s

real	1m37.830s
user	2m8.009s
sys	0m2.820s
58891288_0_1117541047012405958.csv: 13 of 44
align-page-rank Time: 1.4625704288482666s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.038693428039551s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.524704456329346s
string-similarity-['jaro_winkler'] Time: 0.7399322986602783s
string-similarity-['levenshtein'] Time:

align-page-rank Time: 1.438978910446167s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 2.630197763442993s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.649493455886841s
string-similarity-['jaro_winkler'] Time: 0.5699269771575928s
string-similarity-['levenshtein'] Time: 3.3385543823242188s
string-similarity-['jaccard:tokenizer=word'] Time: 0.09335136413574219s
normalize-scores-des_cont_jaccard Time: 0.03708171844482422s
smallest-qnode-number Time: 0.474776029586792s
mosaic-features Time: 0.015251636505126953s
creat-singleton-feature Time: 0.20422983169555664s
vote-by-classifier Time: 0.34007978439331055s
Qnodes to lookup: 8276
Qnodes from file: 8183
Outlier removal generates 62 lof-voted candidates
score-using-embedding Time: 21.92274236679077s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.45206642150878906s
compute-tf-idf-class_count Time: 23.901442766189575s
compute-tf-idf-property_count Time: 24.10582995414734s
context-match Time: 

Qnodes to lookup: 1922
Qnodes from file: 1861
Outlier removal generates 7 lof-voted candidates
score-using-embedding Time: 6.740741729736328s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.12123370170593262s
compute-tf-idf-class_count Time: 7.863901376724243s
compute-tf-idf-property_count Time: 7.485363006591797s
context-match Time: 0.2155745029449463s
create-pseudo-gt Time: 0.003883838653564453s

real	0m18.603s
user	0m48.136s
sys	0m2.543s
21362676_0_6854186738074119688.csv: 24 of 44
align-page-rank Time: 1.5169978141784668s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 4.22599196434021s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.347371816635132s
string-similarity-['jaro_winkler'] Time: 0.7555856704711914s
string-similarity-['levenshtein'] Time: 4.323064088821411s
string-similarity-['jaccard:tokenizer=word'] Time: 0.19310975074768066s
normalize-scores-des_cont_jaccard Time: 0.04659080505371094s
smallest-qnode-number Time: 0.593973

smallest-qnode-number Time: 5.08288049697876s
mosaic-features Time: 0.07559323310852051s
creat-singleton-feature Time: 1.2987840175628662s
vote-by-classifier Time: 0.40496397018432617s
Qnodes to lookup: 43141
Qnodes from file: 41724
_centroid_of_lof: Missing 1 of 233
Outlier removal generates 139 lof-voted candidates
score-using-embedding Time: 84.75041246414185s
generate-reciprocal-rank-lof-graph-embedding-score Time: 2.496579885482788s
compute-tf-idf-class_count Time: 91.7292697429657s
compute-tf-idf-property_count Time: 94.27704572677612s
context-match Time: 267.2769134044647s
create-pseudo-gt Time: 0.0044460296630859375s

real	6m16.464s
user	6m45.394s
sys	0m6.151s
41480166_0_6681239260286218499.csv: 30 of 44
align-page-rank Time: 3.226585865020752s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 9.361800193786621s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 24.38023352622986s
string-similarity-['jaro_winkler'] Time: 1.962156057357788s
string-si


real	1m51.356s
user	2m22.280s
sys	0m3.406s
90196673_0_5458330029110291950.csv: 35 of 44
align-page-rank Time: 7.00242280960083s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 16.852534294128418s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 18.755080461502075s
string-similarity-['jaro_winkler'] Time: 3.489022970199585s
string-similarity-['levenshtein'] Time: 16.916688442230225s
string-similarity-['jaccard:tokenizer=word'] Time: 0.4146902561187744s
normalize-scores-des_cont_jaccard Time: 0.1455090045928955s
smallest-qnode-number Time: 4.542864561080933s
mosaic-features Time: 0.06823492050170898s
creat-singleton-feature Time: 1.3138232231140137s
vote-by-classifier Time: 0.4051661491394043s
Qnodes to lookup: 22360
Qnodes from file: 21926
Outlier removal generates 144 lof-voted candidates
score-using-embedding Time: 84.75564098358154s
generate-reciprocal-rank-lof-graph-embedding-score Time: 2.7934882640838623s
compute-tf-idf-class_count Time: 90.920879

vote-by-classifier Time: 1.1223773956298828s
Qnodes to lookup: 10760
Qnodes from file: 10387
Outlier removal generates 99 lof-voted candidates
score-using-embedding Time: 31.497501611709595s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.6302452087402344s
compute-tf-idf-class_count Time: 34.024964332580566s
compute-tf-idf-property_count Time: 34.34642481803894s
context-match Time: 53.901068925857544s
create-pseudo-gt Time: 0.003999948501586914s

real	1m40.881s
user	2m10.799s
sys	0m3.196s
50245608_0_871275842592178099.csv: 41 of 44
align-page-rank Time: 4.305098295211792s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 15.580395698547363s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 22.10473656654358s
string-similarity-['jaro_winkler'] Time: 2.3158695697784424s
string-similarity-['levenshtein'] Time: 14.31022024154663s
string-similarity-['jaccard:tokenizer=word'] Time: 0.7259218692779541s
normalize-scores-des_cont_jaccard Time: 0.15855050

In [11]:
feature_generation(dev_candidate_path, dev_graph_embedding, dev_class_count, dev_prop_count, dev_context_path, dev_feature_path)

84575189_0_6365692015941409487.csv: 1 of 9
align-page-rank Time: 1.8409578800201416s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 7.2447426319122314s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 10.854839086532593s
string-similarity-['jaro_winkler'] Time: 0.9745080471038818s
string-similarity-['levenshtein'] Time: 7.9472291469573975s
string-similarity-['jaccard:tokenizer=word'] Time: 0.10293197631835938s
normalize-scores-des_cont_jaccard Time: 0.04551291465759277s
smallest-qnode-number Time: 0.6154012680053711s
mosaic-features Time: 0.02413797378540039s
creat-singleton-feature Time: 0.31090474128723145s
vote-by-classifier Time: 1.2024152278900146s
Qnodes to lookup: 8486
Qnodes from file: 7897
Outlier removal generates 46 lof-voted candidates
score-using-embedding Time: 37.1504967212677s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.6580898761749268s
compute-tf-idf-class_count Time: 40.047500133514404s
compute-tf-idf-property_count Ti

Qnodes to lookup: 2291
Qnodes from file: 2226
Outlier removal generates 9 lof-voted candidates
score-using-embedding Time: 7.152173042297363s
generate-reciprocal-rank-lof-graph-embedding-score Time: 0.13177800178527832s
compute-tf-idf-class_count Time: 8.005523920059204s
compute-tf-idf-property_count Time: 8.181562185287476s
context-match Time: 13.058216333389282s
create-pseudo-gt Time: 0.0032150745391845703s

real	0m32.771s
user	1m2.884s
sys	0m2.764s
14067031_0_559833072073397908.csv: 7 of 9
align-page-rank Time: 0.9596638679504395s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.8410727977752686s
string-similarity-['symmetric_monge_elkan:tokenizer=word'] Time: 3.7526419162750244s
string-similarity-['jaro_winkler'] Time: 0.775482177734375s
string-similarity-['levenshtein'] Time: 4.625465154647827s
string-similarity-['jaccard:tokenizer=word'] Time: 0.22585725784301758s
normalize-scores-des_cont_jaccard Time: 0.03758049011230469s
smallest-qnode-number Time: 0.30246782

### Generate Training Data

In [12]:
def merge_files(args):
    datapath = args.train_path
    eval_file_names = []
    for (dirpath, dirnames, filenames) in os.walk(datapath):
        for fn in filenames:
            if "csv" not in fn:
                continue
            abs_fn = f"{dirpath}/{fn}"
            assert os.path.isfile(abs_fn)
            if os.path.getsize(abs_fn) == 0:
                continue
            eval_file_names.append(abs_fn)
    df_list = []
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df_list.append(df)
    return pd.concat(df_list) 

def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    pickle.dump(scaler, open(min_max_scaler_path, 'wb'))
    return scaler

def generate_train_data(args):
    scaler_path = args.min_max_scaler_path
    scaler = pickle.load(open(scaler_path, 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['evaluation_label']
    print(sfeatures)
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    for i,file in enumerate(glob.glob(args.train_path + '/*.csv')):
        file_name = file.split('/')[-1]
        print(file_name)
        if os.path.getsize(file) == 0:
                continue
        d_sample = pd.read_csv(file)
#         grouped_obj = d_sample.groupby(['row', 'column'])
        grouped_obj = d_sample.groupby(['column', 'row'])
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            pos_features = []
            neg_features = []
            a = cell[1][cell[1]['evaluation_label'] == 1]
            if a.empty:
                continue
            num_rows = 64
            pos_row = a[sfeatures].drop('evaluation_label',axis=1)
            negatives_filtered = cell[1][cell[1]['evaluation_label'] == -1]
            sorted_df = negatives_filtered.sort_values('lof-graph-embedding-score',ascending=False)
            sorted_df = sorted_df[sfeatures]
            if 0 in sorted_df['evaluation_label'].tolist():
                continue
            if sorted_df.empty:
                continue
            neg_list = []
            if num_rows < len(sorted_df):
                sorted_df = sorted_df[sorted_df['evaluation_label'] == -1]
                neg_list.append(sorted_df[:2])
                retrieval_score_df = sorted_df[2:].sort_values('retrieval_score',ascending=False)
                neg_list.append(retrieval_score_df[:2])
                pagerank_score_df = retrieval_score_df[2:].sort_values('pagerank', ascending=False)
                neg_list.append(pagerank_score_df[:2])
                class_count_score_df = pagerank_score_df[2:].sort_values('lof_class_count_tf_idf_score', ascending=False)
                neg_list.append(class_count_score_df[:2])
                prop_count_score_df = class_count_score_df[2:].sort_values('lof_property_count_tf_idf_score', ascending=False)
                neg_list.append(prop_count_score_df[:2])
                monge_elkan_score_df = prop_count_score_df[2:].sort_values('monge_elkan', ascending=False)
                neg_list.append(monge_elkan_score_df[:2])
                monge_elkan_alias_score_df = monge_elkan_score_df[2:].sort_values('monge_elkan_aliases', ascending=False)
                neg_list.append(monge_elkan_alias_score_df[:2])
                
                context_score_df = monge_elkan_alias_score_df[2:].sort_values('context_score', ascending=False)
                neg_list.append(context_score_df[:2])

                jaro_winkler_score_df = monge_elkan_alias_score_df[2:].sort_values('jaro_winkler', ascending=False)
                neg_list.append(jaro_winkler_score_df[:2])
                
                top_sample_df = jaro_winkler_score_df.sample(n=50)
                neg_list.append(top_sample_df)
                top_sample_df = pd.concat(neg_list)
                top_sample_df.drop('evaluation_label', inplace=True, axis=1)
                top_sample_arr = top_sample_df.to_numpy()

            for i in range(len(top_sample_arr)):
                neg_features.append(top_sample_arr[i])
            random.shuffle(neg_features)
            for i in range(len(top_sample_arr)):
                pos_row_sample = pos_row.sample(n=1)
                ar = pos_row_sample.to_numpy()
                for ps_ar in ar:
                    pos_features.append(ps_ar)
            positive_features_final.append(pos_features)
            negative_features_final.append(neg_features)
    print(len(positive_features_final), len(positive_features_final[37]))
    print(len(negative_features_final), len(negative_features_final[37]))
    pickle.dump(positive_features_final,open(args.pos_output,'wb'))
    pickle.dump(negative_features_final,open(args.neg_output,'wb'))


In [13]:
gen_training_data_args = Namespace(train_path=train_feature_path, pos_output=pos_output, neg_output=neg_output, 
                 min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
generate_train_data(gen_training_data_args)


['pagerank', 'retrieval_score', 'monge_elkan', 'monge_elkan_aliases', 'des_cont_jaccard', 'jaro_winkler', 'levenshtein', 'singleton', 'num_char', 'num_tokens', 'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score', 'lof-graph-embedding-score', 'lof-reciprocal-rank', 'context_score', 'pseudo_gt', 'evaluation_label']
37856682_0_6818907050314633217.csv
38428277_0_1311643810102462607.csv
29414811_12_251152470253168163.csv
9834884_0_3871985887467090123.csv
21245481_0_8730460088443117515.csv
39650055_5_7135804139753401681.csv
53989675_0_8697482470743954630.csv
99070098_0_2074872741302696997.csv
46671561_0_6122315295162029872.csv
80588006_0_6965325215443683359.csv
20135078_0_7570343137119682530.csv
39107734_2_2329160387535788734.csv
58891288_0_1117541047012405958.csv
84548468_0_5955155464119382182.csv
16767252_0_2409448375013995751.csv
88523363_0_8180214313099580515.csv
69537082_0_7789694313271016902.csv
33401079_0_9127583903019856402.csv
1438042989043_35_20150728002309-00287-ip-1

### Model Definition

In [14]:
# Dataset
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features
    
    def __len__(self):
        return len(self.pos_features)
    
    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]

# Model
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        #original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc2 = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)
    
    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))
        
        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))
        
        return pos_out, neg_out
    
    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out

# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0
    
    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss

### Training

In [15]:
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))

    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))

    train_dataset = T2DV2Dataset(pos_features_flatten, neg_features_flatten)
    train_dataloader = DataLoader(train_dataset, batch_size=64)
    return train_dataloader

def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model):
    scaler = pickle.load(open(min_max_scaler_path, 'rb'))
    normalize_features = features
    for file in glob.glob(input_table_path + '/*.csv'):
        file_name = file.split('/')[-1]
        if os.path.getsize(file) == 0:
                continue
        if file_name != '52299421_0_4473286348258170200.csv':
            print(file_name)
            d_sample = pd.read_csv(file)
            grouped_obj = d_sample.groupby(['column', 'row'])
            new_df_list = []
            pred = []
            for cell in grouped_obj:
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                sorted_df = cell[1].sort_values('lof-graph-embedding-score',ascending=False)[:64]
                sorted_df_features = sorted_df[normalize_features]
                new_df_list.append(sorted_df)
                arr = sorted_df_features.to_numpy()
                test_inp = []
                for a in arr:
                    test_inp.append(a)
                test_tensor = torch.tensor(test_inp).float()
                scores = model.predict(test_tensor)
                pred.extend(torch.squeeze(scores).tolist())
            test_df = pd.concat(new_df_list)
            test_df[final_score_column] = pred
            test_df.to_csv(f"{output_table_path}/{file_name}", index=False)

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    
    else:
        device = torch.device('cpu')
    train_dataloader = generate_dataloader(args.positive_feat_path, args.negative_feat_path)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(features)).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid, batch in tqdm(enumerate(train_dataloader), position=0, leave=True):
            positive_feat = torch.tensor(batch[0].float())
            negative_feat = torch.tensor(batch[1].float())
            optimizer.zero_grad()
            pos_out, neg_out = model(positive_feat, negative_feat)
            loss = criterion(pos_out, neg_out)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid

        # Evaluation
        model.eval()
        infer_scores(args.min_max_scaler_path, args.dev_path, args.dev_output, model)
        eval_data = merge_eval_files(args.dev_output)
        res, candidate_eval_data = parse_eval_files_stats(eval_data, final_score_column)
        top1_precision = res['num_tasks_with_model_score_top_one_accurate']/res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            top1_max_prec = top1_precision
            model_save_name = 'epoch_{}_loss_{}_top1_{}.pth'.format(epoch, avg_loss, top1_max_prec)
            best_model_path = os.path.join(args.model_save_path, model_save_name)
            torch.save(model.state_dict(), best_model_path)
        
        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision, top1_max_prec))
    return best_model_path

In [16]:
def merge_eval_files(final_score_path):
    eval_file_names = []
    df_list = []
    for (dirpath, dirnames, filenames) in os.walk(final_score_path):
        for fn in filenames:
            if fn != '52299421_0_4473286348258170200.csv':
                if "csv" not in fn:
                    continue
                abs_fn = os.path.join(dirpath, fn)
                assert os.path.isfile(abs_fn)
                if os.path.getsize(abs_fn) == 0:
                    continue
                eval_file_names.append(abs_fn)
    
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        # df = df.fillna('')
        df_list.append(df)
    return pd.concat(df_list)

def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks'] = len(eval_data.groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_gt_in_candidate'] = len(eval_data[eval_data['evaluation_label'] == 1].groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_singleton_candidate'] = len(candidate_eval_data[candidate_eval_data['count'] == 1].groupby(['table_id', 'column', 'row']))
    singleton_eval_data = candidate_eval_data[candidate_eval_data['count'] == 1]
    num_tasks_with_singleton_candidate_with_gt = 0
    for i, row in singleton_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) == 1
        if c_e_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_singleton_candidate_with_gt += 1
    res['num_tasks_with_singleton_candidate_with_gt'] = num_tasks_with_singleton_candidate_with_gt
    num_tasks_with_graph_top_one_accurate = []
    num_tasks_with_graph_top_five_accurate = []
    num_tasks_with_graph_top_ten_accurate = []
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    # candidate_eval_data = candidate_eval_data[:1]
    for i, row in candidate_eval_data.iterrows():
        #print(i)
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)
            
        # handle graph-embedding-score
        s_data = c_e_data.sort_values(by=['lof-graph-embedding-score'], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_graph_top_one_accurate.append(1)
        else:
            num_tasks_with_graph_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_graph_top_five_accurate.append(1)
        else:
            num_tasks_with_graph_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_graph_top_ten_accurate.append(1)
        else:
            num_tasks_with_graph_top_ten_accurate.append(0)
        
        #rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_model_score_top_five_accurate.append(1)
        else:
            num_tasks_with_model_score_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_model_score_top_ten_accurate.append(1)
        else:
            num_tasks_with_model_score_top_ten_accurate.append(0)
            
        cf_e_data = c_e_data.copy()
        cf_e_data['lof-graph-embedding-score'] = cf_e_data['lof-graph-embedding-score'].replace(np.nan, 0)
        cf_e_data[method] = cf_e_data[method].replace(np.nan, 0)

    candidate_eval_data['lof-graph_top_one_accurate'] = num_tasks_with_graph_top_one_accurate
    candidate_eval_data['lof-graph_top_five_accurate'] = num_tasks_with_graph_top_five_accurate
    candidate_eval_data['lof-graph_top_ten_accurate'] = num_tasks_with_graph_top_five_accurate
    candidate_eval_data['model_top_one_accurate'] = num_tasks_with_model_score_top_one_accurate
    candidate_eval_data['model_top_five_accurate'] = num_tasks_with_model_score_top_five_accurate
    candidate_eval_data['model_top_ten_accurate'] = num_tasks_with_model_score_top_ten_accurate
    candidate_eval_data['has_gt'] = has_gt_list
    candidate_eval_data['has_gt_in_candidate'] = has_gt_in_candidate
    res['num_tasks_with_graph_top_one_accurate'] = sum(num_tasks_with_graph_top_one_accurate)
    res['num_tasks_with_graph_top_five_accurate'] = sum(num_tasks_with_graph_top_five_accurate)
    res['num_tasks_with_graph_top_ten_accurate'] = sum(num_tasks_with_graph_top_ten_accurate)
    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    res['num_tasks_with_model_score_top_five_accurate'] = sum(num_tasks_with_model_score_top_five_accurate)
    res['num_tasks_with_model_score_top_ten_accurate'] = sum(num_tasks_with_model_score_top_ten_accurate)
    return res, candidate_eval_data

In [17]:
training_args = Namespace(num_epochs=20, lr=0.001, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=dev_feature_path, dev_output=dev_output_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)

In [18]:
## Call Training
best_model_path = train(training_args)

  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5973it [00:15, 382.01it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
51it [00:00, 505.17it/s]

Epoch 0, Avg Loss is 0.12168200314044952, epoch top1 0.8870056497175142, max top1 0.8870056497175142


5973it [00:10, 543.81it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
54it [00:00, 535.82it/s]

Epoch 1, Avg Loss is 0.09006624668836594, epoch top1 0.9110169491525424, max top1 0.9110169491525424


5973it [00:09, 610.35it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
54it [00:00, 536.86it/s]

Epoch 2, Avg Loss is 0.08523514866828918, epoch top1 0.846045197740113, max top1 0.9110169491525424


5973it [00:09, 613.21it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
51it [00:00, 509.23it/s]

Epoch 3, Avg Loss is 0.08374599367380142, epoch top1 0.8050847457627118, max top1 0.9110169491525424


5973it [00:09, 617.68it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 515.25it/s]

Epoch 4, Avg Loss is 0.08500885963439941, epoch top1 0.8121468926553672, max top1 0.9110169491525424


5973it [00:10, 586.52it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


0it [00:00, ?it/s]

Epoch 5, Avg Loss is 0.0799228698015213, epoch top1 0.7175141242937854, max top1 0.9110169491525424


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5973it [00:09, 613.26it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


0it [00:00, ?it/s]

Epoch 6, Avg Loss is 0.07955721765756607, epoch top1 0.5805084745762712, max top1 0.9110169491525424


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5973it [00:09, 615.90it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


0it [00:00, ?it/s]

Epoch 7, Avg Loss is 0.08696103096008301, epoch top1 0.768361581920904, max top1 0.9110169491525424


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5973it [00:09, 602.25it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 517.20it/s]

Epoch 8, Avg Loss is 0.08515159785747528, epoch top1 0.6793785310734464, max top1 0.9110169491525424


5973it [00:09, 609.80it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
53it [00:00, 523.63it/s]

Epoch 9, Avg Loss is 0.0814913809299469, epoch top1 0.7372881355932204, max top1 0.9110169491525424


5973it [00:09, 615.22it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
53it [00:00, 527.11it/s]

Epoch 10, Avg Loss is 0.0800306499004364, epoch top1 0.7612994350282486, max top1 0.9110169491525424


5973it [00:09, 600.38it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
51it [00:00, 506.43it/s]

Epoch 11, Avg Loss is 0.08220110833644867, epoch top1 0.4533898305084746, max top1 0.9110169491525424


5973it [00:09, 604.63it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 516.07it/s]

Epoch 12, Avg Loss is 0.08605607599020004, epoch top1 0.4788135593220339, max top1 0.9110169491525424


5973it [00:09, 603.30it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
51it [00:00, 508.69it/s]

Epoch 13, Avg Loss is 0.0884942039847374, epoch top1 0.7853107344632768, max top1 0.9110169491525424


5973it [00:09, 604.69it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
53it [00:00, 522.68it/s]

Epoch 14, Avg Loss is 0.08723536878824234, epoch top1 0.7612994350282486, max top1 0.9110169491525424


5973it [00:10, 596.19it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 514.86it/s]

Epoch 15, Avg Loss is 0.08647142350673676, epoch top1 0.6129943502824858, max top1 0.9110169491525424


5973it [00:09, 605.40it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
52it [00:00, 512.45it/s]

Epoch 16, Avg Loss is 0.0812574252486229, epoch top1 0.768361581920904, max top1 0.9110169491525424


5973it [00:09, 607.16it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
50it [00:00, 496.51it/s]

Epoch 17, Avg Loss is 0.08396260440349579, epoch top1 0.6680790960451978, max top1 0.9110169491525424


5973it [00:10, 581.17it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv


0it [00:00, ?it/s]

Epoch 18, Avg Loss is 0.09217099845409393, epoch top1 0.7245762711864406, max top1 0.9110169491525424


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5973it [00:10, 596.30it/s]


84575189_0_6365692015941409487.csv
28086084_0_3127660530989916727.csv
50270082_0_444360818941411589.csv
29414811_2_4773219892816395776.csv
39759273_0_1427898308030295194.csv
14380604_4_3329235705746762392.csv
14067031_0_559833072073397908.csv
45073662_0_3179937335063201739.csv
Epoch 19, Avg Loss is 0.08728277683258057, epoch top1 0.6991525423728814, max top1 0.9110169491525424


In [19]:
best_model_path

'/home/sriamazingram/USC/Others/ISI/data/t2dv2/dev-output/8_1/saved_models/epoch_1_loss_0.09006624668836594_top1_0.9110169491525424.pth'

## Dev Prediction

In [20]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k=5):
    for file in glob.glob(dev_feature_path + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        feature_str =  ",".join(features)
        if os.path.getsize(file) == 0:
                    continue
        # location where the output generated by the predictions wil be stored.
        dev_output = f"{dev_predictions_top_k}/{filename}"
        !tl predict-using-model $file -o $output_column \
            --features $feature_str \
            --ranking-model $saved_model \
            --normalization-factor $min_max_scaler_path \
            / get-kg-links -c $output_column -k $k --k-rows \
            > $dev_output

In [21]:
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename.strip('.csv')}.xlsx"
        !tl add-color $file -c "$score_column,evaluation_label" -k $k --output $dev_color_file

In [22]:
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=5):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $file -k $k -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)

In [23]:
dev_prediction(dev_feature_path, dev_predictions_top_k, best_model_path, final_score_column, min_max_scaler_path, k=200)

84575189_0_6365692015941409487.csv
predict-using-model Time: 6.13994574546814s
get-kg-links-siamese_prediction Time: 1.1168434619903564s
28086084_0_3127660530989916727.csv
predict-using-model Time: 2.1349174976348877s
get-kg-links-siamese_prediction Time: 2.3903934955596924s
50270082_0_444360818941411589.csv
predict-using-model Time: 1.5177969932556152s
get-kg-links-siamese_prediction Time: 1.7700471878051758s
29414811_2_4773219892816395776.csv
predict-using-model Time: 0.48032307624816895s
get-kg-links-siamese_prediction Time: 0.23757243156433105s
39759273_0_1427898308030295194.csv
predict-using-model Time: 1.0865421295166016s
get-kg-links-siamese_prediction Time: 1.0333786010742188s
14380604_4_3329235705746762392.csv
predict-using-model Time: 0.45293116569519043s
get-kg-links-siamese_prediction Time: 0.22068047523498535s
14067031_0_559833072073397908.csv
predict-using-model Time: 0.7280888557434082s
get-kg-links-siamese_prediction Time: 0.6941039562225342s
45073662_0_3179937335063201

In [24]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, final_score_column, k=200)

84575189_0_6365692015941409487.csv
metrics Time: 3.6917383670806885s
28086084_0_3127660530989916727.csv
metrics Time: 8.609846115112305s
50270082_0_444360818941411589.csv
metrics Time: 5.946252822875977s
29414811_2_4773219892816395776.csv
metrics Time: 0.7568254470825195s
39759273_0_1427898308030295194.csv
metrics Time: 3.6701691150665283s
14380604_4_3329235705746762392.csv
metrics Time: 0.7795116901397705s
14067031_0_559833072073397908.csv
metrics Time: 2.5387775897979736s
45073662_0_3179937335063201739.csv
metrics Time: 1.0177044868469238s
52299421_0_4473286348258170200.csv
metrics Time: 5.182516098022461s


In [25]:
metrics_df

Unnamed: 0,k,f1,precision,recall,tag
0,200,0.962963,0.928571,1.0,84575189_0_6365692015941409487.csv
0,200,0.890932,0.831818,0.959091,28086084_0_3127660530989916727.csv
0,200,0.981818,0.964286,1.0,50270082_0_444360818941411589.csv
0,200,0.926829,0.863636,1.0,29414811_2_4773219892816395776.csv
0,200,0.994975,0.99,1.0,39759273_0_1427898308030295194.csv
0,200,0.95,0.95,0.95,14380604_4_3329235705746762392.csv
0,200,0.943396,0.943396,0.943396,14067031_0_559833072073397908.csv
0,200,1.0,1.0,1.0,45073662_0_3179937335063201739.csv
0,200,0.89011,0.89011,0.89011,52299421_0_4473286348258170200.csv


In [26]:
metrics_df['recall'].mean()

0.971399669512877

In [27]:
metrics_df.to_csv(f"{dev_metrics_path}/metrics_200.csv", index=False)

In [28]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, final_score_column, k=1)

84575189_0_6365692015941409487.csv
metrics Time: 3.7166073322296143s
28086084_0_3127660530989916727.csv
metrics Time: 8.19441843032837s
50270082_0_444360818941411589.csv
metrics Time: 5.905001401901245s
29414811_2_4773219892816395776.csv
metrics Time: 0.7426278591156006s
39759273_0_1427898308030295194.csv
metrics Time: 3.6778008937835693s
14380604_4_3329235705746762392.csv
metrics Time: 0.7746427059173584s
14067031_0_559833072073397908.csv
metrics Time: 2.4658446311950684s
45073662_0_3179937335063201739.csv
metrics Time: 1.0018744468688965s
52299421_0_4473286348258170200.csv
metrics Time: 4.926670551300049s


In [29]:
metrics_df

Unnamed: 0,k,f1,precision,recall,tag
0,1,0.928571,0.928571,0.928571,84575189_0_6365692015941409487.csv
0,1,0.831818,0.831818,0.831818,28086084_0_3127660530989916727.csv
0,1,0.964286,0.964286,0.964286,50270082_0_444360818941411589.csv
0,1,0.840295,0.863636,0.818182,29414811_2_4773219892816395776.csv
0,1,0.984975,0.99,0.98,39759273_0_1427898308030295194.csv
0,1,0.95,0.95,0.95,14380604_4_3329235705746762392.csv
0,1,0.943396,0.943396,0.943396,14067031_0_559833072073397908.csv
0,1,1.0,1.0,1.0,45073662_0_3179937335063201739.csv
0,1,0.884581,0.89011,0.879121,52299421_0_4473286348258170200.csv


In [30]:
metrics_df['f1'].mean()

0.9253246964362959

In [31]:
metrics_df.to_csv(f"{dev_metrics_path}/metrics_1.csv", index=False)

In [32]:
add_color(dev_predictions_top_k, dev_colorized_path, final_score_column)

84575189_0_6365692015941409487.csv
add-color Time: 1.8227787017822266s
28086084_0_3127660530989916727.csv
add-color Time: 3.9826228618621826s
50270082_0_444360818941411589.csv
add-color Time: 2.8458807468414307s
29414811_2_4773219892816395776.csv
add-color Time: 0.360825777053833s
39759273_0_1427898308030295194.csv
add-color Time: 1.7265467643737793s
14380604_4_3329235705746762392.csv
add-color Time: 0.35480785369873047s
14067031_0_559833072073397908.csv
add-color Time: 1.1504685878753662s
45073662_0_3179937335063201739.csv
add-color Time: 0.4418649673461914s
52299421_0_4473286348258170200.csv
add-color Time: 2.371635675430298s
