In [1]:
import glob
import time
import os
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import pickle
from argparse import ArgumentParser, Namespace
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from itertools import chain
from tqdm import tqdm
import copy
import shutil
import pickle

I assume that the candidate generation and feature genration has already be run on the training and dev tables

In [2]:
es_url = 'http://ckg07:9200'
es_index = 'wikidatadwd-augmented'

# Input Paths

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-train-canonical/
train_path = f'/Users/amandeep/Github/table-linker/data/t2dv2/t2dv2-train-canonical'

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/canonical-with-context/t2dv2-dev-canonical/
dev_path = f'/Users/amandeep/Github/table-linker/data/t2dv2/t2dv2-dev-canonical'

# GDrive Path: /table-linker-dataset/2019-iswc_challenge_data/t2dv2/ground_truth/Xinting_GT_csv
ground_truth_files = f'/Users/amandeep/Github/table-linker/data/t2dv2/round_1_GT'

# can be downloaded from https://github.com/usc-isi-i2/table-linker-pipelines/blob/main/table-linker-full-pipeline/models/weighted_lr.pkl
classifier_model_path = '/Users/amandeep/Github/table-linker-pipelines/table-linker-full-pipeline/models/weighted_lr.pkl'


# OUTPUT PATHS
output_path = '/Users/amandeep/Github/table-linker/data/t2dv2'
train_output_path = f'{output_path}/train-output'
dev_output_path = f'{output_path}/dev-output'

# increase version to create a new folder for an experiment
VERSION = "2.0"

train_candidate_path = f'{train_output_path}/{VERSION}/candidates'
train_feature_path = f'{train_output_path}/{VERSION}/features'

dev_candidate_path = f'{dev_output_path}/{VERSION}/candidates'
dev_feature_path = f'{dev_output_path}/{VERSION}/features'
dev_output_predictions = f'{dev_output_path}/{VERSION}/dev_predictions'
dev_predictions_top_k = f'{dev_output_path}/{VERSION}/dev_predictions_top_k'
dev_colorized_path = f'{dev_output_path}/{VERSION}/dev_predictions_colorized'
dev_metrics_path = f'{dev_output_path}/{VERSION}/dev_predictions_metrics'

aux_field = 'graph_embedding_complex,class_count,property_count,context'


train_prop_count = f'{train_output_path}/{VERSION}/train_prop_count' 
train_class_count = f'{train_output_path}/{VERSION}/train_class_count'
train_context_path = f'{train_output_path}/{VERSION}/train_context'
train_graph_embedding = f'{train_output_path}/{VERSION}/train_graph_embedding'

dev_prop_count = f'{dev_output_path}/{VERSION}/dev_prop_count'
dev_class_count = f'{dev_output_path}/{VERSION}/dev_class_count'
dev_context_path = f'{dev_output_path}/{VERSION}/dev_context'
dev_graph_embedding = f'{dev_output_path}/{VERSION}/dev_graph_embedding'

temp_dir = f'{output_path}/temp'



pos_output = f'{temp_dir}/training_data/pos_features.pkl'
neg_output = f'{temp_dir}/training_data/neg_features.pkl'
min_max_scaler_path = f'{temp_dir}/training_data/normalization_factor.pkl'

final_score_column = 'siamese_pred'

model_save_path = f'{output_path}/saved_models'
best_model_path = ''

In [3]:
!mkdir -p $temp_dir

!mkdir -p $train_prop_count
!mkdir -p $dev_prop_count
!mkdir -p $train_class_count
!mkdir -p $dev_class_count
!mkdir -p $train_graph_embedding
!mkdir -p $dev_graph_embedding
!mkdir -p $train_context_path
!mkdir -p $dev_context_path

!mkdir -p $train_candidate_path
!mkdir -p $dev_candidate_path

!mkdir -p $train_feature_path
!mkdir -p $dev_feature_path

!mkdir -p $temp_dir/training_data
!mkdir -p $dev_output_predictions
!mkdir -p $model_save_path
!mkdir -p $dev_predictions_top_k
!mkdir -p $dev_colorized_path
!mkdir -p $dev_metrics_path

In [4]:
features = ['pagerank','retrieval_score','monge_elkan','monge_elkan_aliases','des_cont_jaccard',
            'jaro_winkler','levenshtein','singleton','num_char','num_tokens',
           'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score',
           'lof-graph-embedding-score', 'lof-reciprocal-rank']

## Candidate Generation

In [7]:
def candidate_generation(path, gt_path, output_path, class_count_path, prop_count_path, context_path, graph_embedding):
    file_list = glob.glob(path + '/*.csv')
    for i, file in enumerate(file_list):
        st = time.time()
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        gt_file = f"{ground_truth_files}/{filename}"
        output_file = f"{output_path}/{filename}"
        
        !tl clean -c label -o label_clean $file / \
        --url $es_url --index $es_index \
        get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder $temp_dir / \
        --url $es_url --index $es_index \
        get-exact-matches -c label_clean \
        --auxiliary-fields {aux_field} \
        --auxiliary-folder {temp_dir} / \
        ground-truth-labeler --gt-file $gt_file > $output_file
        
        for field in aux_field.split(','):
            aux_list = []
            for f in glob.glob(f'{temp_dir}/*{field}.tsv'):
                aux_list.append(pd.read_csv(f, sep='\t', dtype=object))
            aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
            if field == 'class_count':
                class_count_file = f"{class_count_path}/{filename.strip('.csv')}_class_count.tsv"
                aux_df.to_csv(class_count_file, sep='\t', index=False)
            elif field == 'property_count':
                prop_count_file = f"{prop_count_path}/{filename.strip('.csv')}_prop_count.tsv"
                aux_df.to_csv(prop_count_file, sep='\t', index=False)
            elif field == 'context':
                context_file = f"{context_path}/{filename.strip('.csv')}_context.tsv"
                aux_df.to_csv(context_file, sep='\t', index=False)
            else:
                graph_embedding_file = f"{graph_embedding}/{filename.strip('.csv')}_graph_embedding_complex.tsv"
                aux_df.to_csv(graph_embedding_file, sep='\t', index=False)
        
        print(time.time() - st)


In [8]:
candidate_generation(train_path, ground_truth_files, train_candidate_path, train_class_count, train_prop_count, train_context_path,train_graph_embedding)

58891288_0_1117541047012405958.csv: 1 of 44
16.49327778816223
39173938_0_7916056990138658530.csv: 2 of 44
15.740509033203125
10579449_0_1681126353774891032.csv: 3 of 44
10.438456773757935
33401079_0_9127583903019856402.csv: 4 of 44
15.381871223449707
21362676_0_6854186738074119688.csv: 5 of 44
16.290204763412476
38428277_0_1311643810102462607.csv: 6 of 44
18.293812036514282
91959037_0_7907661684242014480.csv: 7 of 44
33.63757681846619
20135078_0_7570343137119682530.csv: 8 of 44
13.750951051712036
35188621_0_6058553107571275232.csv: 9 of 44
19.511975288391113
54719588_0_8417197176086756912.csv: 10 of 44
30.593504190444946
21245481_0_8730460088443117515.csv: 11 of 44
24.660501956939697
71840765_0_6664391841933033844.csv: 12 of 44
10.718027830123901
8468806_0_4382447409703007384.csv: 13 of 44
18.379832983016968
88523363_0_8180214313099580515.csv: 14 of 44
39.93649196624756
29414811_13_8724394428539174350.csv: 15 of 44
8.251392126083374
99070098_0_2074872741302696997.csv: 16 of 44
18.31780

In [9]:
candidate_generation(dev_path, ground_truth_files, dev_candidate_path, dev_class_count, dev_prop_count, dev_context_path, dev_graph_embedding)

39759273_0_1427898308030295194.csv: 1 of 9
15.287719964981079
45073662_0_3179937335063201739.csv: 2 of 9
9.924513101577759
29414811_2_4773219892816395776.csv: 3 of 9
6.981388092041016
84575189_0_6365692015941409487.csv: 4 of 9
13.720087051391602
14380604_4_3329235705746762392.csv: 5 of 9
11.153507709503174
52299421_0_4473286348258170200.csv: 6 of 9
15.762428045272827
50270082_0_444360818941411589.csv: 7 of 9
24.03650403022766
28086084_0_3127660530989916727.csv: 8 of 9
30.431898832321167
14067031_0_559833072073397908.csv: 9 of 9
14.37183427810669


## Feature Generation

In [10]:
def feature_generation(candidate_dir, embedding_dir, class_count_dir, property_count_dir, output_path):
    file_list = glob.glob(candidate_dir + '/*.csv')
    for i, file in enumerate(file_list):
        filename = file.split('/')[-1]
        print(f"{filename}: {i+1} of {len(file_list)}")
        embedding_file = f"{embedding_dir}/{filename.strip('.csv')}_graph_embedding_complex.tsv"
        class_count_file = f"{class_count_dir}/{filename.strip('.csv')}_class_count.tsv"
        property_count_file = f"{property_count_dir}/{filename.strip('.csv')}_prop_count.tsv"
        output_file = f"{output_path}/{filename}"
        !time tl align-page-rank $file \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaro_winkler -o jaro_winkler \
            / string-similarity -i --method levenshtein -o levenshtein \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / normalize-scores -c des_cont_jaccard / smallest-qnode-number \
            / mosaic-features -c kg_labels --num-char --num-tokens \
            / create-singleton-feature -o singleton \
            / vote-by-classifier  \
            --prob-threshold 0.995 \
            --model $classifier_model_path \
            / score-using-embedding \
            --column-vector-strategy centroid-of-lof \
            --lof-strategy ems-mv \
            -o lof-graph-embedding-score \
            --embedding-file $embedding_file \
            / generate-reciprocal-rank  \
            -c lof-graph-embedding-score \
            -o lof-reciprocal-rank \
            / compute-tf-idf  \
            --feature-file $class_count_file \
            --feature-name class_count \
            --singleton-column singleton \
            -o lof_class_count_tf_idf_score \
            / compute-tf-idf \
            --feature-file $property_count_file \
            --feature-name property_count \
            --singleton-column singleton \
            -o lof_property_count_tf_idf_score \
            > $output_file

In [11]:
feature_generation(train_candidate_path, train_graph_embedding, train_class_count, train_prop_count, train_feature_path)

58891288_0_1117541047012405958.csv: 1 of 44
Qnodes to lookup: 10717
Qnodes from file: 10399
Outlier removal generates 86 lof-voted candidates

real	0m46.753s
user	0m56.399s
sys	0m9.519s
39173938_0_7916056990138658530.csv: 2 of 44
Qnodes to lookup: 9986
Qnodes from file: 9718
Outlier removal generates 74 lof-voted candidates

real	0m38.779s
user	0m51.260s
sys	0m8.490s
10579449_0_1681126353774891032.csv: 3 of 44
Qnodes to lookup: 1706
Qnodes from file: 1652
Command: score-using-embedding
Error Message:  Traceback (most recent call last):
  File "/Users/amandeep/Github/table-linker/tl/cli/score-using-embedding.py", line 74, in run
    vector_transformer.process_vectors()
  File "/Users/amandeep/Github/table-linker/tl/features/external_embedding.py", line 158, in process_vectors
    if not self._centroid_of_lof():
  File "/Users/amandeep/Github/table-linker/tl/features/external_embedding.py", line 330, in _centroid_of_lof
    lof_pred = clf.fit_predict(vectors)
  File "/Users/amandeep/Gith

In [12]:
feature_generation(dev_candidate_path, dev_graph_embedding, dev_class_count, dev_prop_count, dev_feature_path)

39759273_0_1427898308030295194.csv: 1 of 9
Qnodes to lookup: 10448
Qnodes from file: 10120
Outlier removal generates 103 lof-voted candidates

real	0m55.657s
user	1m7.552s
sys	0m8.793s
45073662_0_3179937335063201739.csv: 2 of 9
Qnodes to lookup: 3040
Qnodes from file: 3004
Command: score-using-embedding
Error Message:  Traceback (most recent call last):
  File "/Users/amandeep/Github/table-linker/tl/cli/score-using-embedding.py", line 74, in run
    vector_transformer.process_vectors()
  File "/Users/amandeep/Github/table-linker/tl/features/external_embedding.py", line 158, in process_vectors
    if not self._centroid_of_lof():
  File "/Users/amandeep/Github/table-linker/tl/features/external_embedding.py", line 330, in _centroid_of_lof
    lof_pred = clf.fit_predict(vectors)
  File "/Users/amandeep/Github/table-linker/tl_env/lib/python3.9/site-packages/sklearn/neighbors/_lof.py", line 246, in _fit_predict
    return self.fit(X)._predict()
  File "/Users/amandeep/Github/table-linker/tl_

### Generate Training Data

In [13]:
def merge_files(args):
    datapath = args.train_path
    eval_file_names = []
    for (dirpath, dirnames, filenames) in os.walk(datapath):
        for fn in filenames:
            if "csv" not in fn:
                continue
            abs_fn = f"{dirpath}/{fn}"
            assert os.path.isfile(abs_fn)
            if os.path.getsize(abs_fn) == 0:
                continue
            eval_file_names.append(abs_fn)
    df_list = []
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        df_list.append(df)
    return pd.concat(df_list) 

def compute_normalization_factor(args, all_data):
    min_max_scaler_path = args.min_max_scaler_path
    all_data_features = all_data[features]
    scaler = MinMaxScaler()
    scaler.fit(all_data_features)
    pickle.dump(scaler, open(min_max_scaler_path, 'wb'))
    return scaler

def generate_train_data(args):
    scaler_path = args.min_max_scaler_path
    scaler = pickle.load(open(scaler_path, 'rb'))
    final_list = []
    sfeatures = copy.deepcopy(features) + ['evaluation_label']
    normalize_features = features
    evaluation_label = ['evaluation_label']
    positive_features_final = []
    negative_features_final = []
    for i,file in enumerate(glob.glob(args.train_path + '/*.csv')):
        file_name = file.split('/')[-1]
        print(file_name)
        if os.path.getsize(file) == 0:
                continue
        d_sample = pd.read_csv(file)
#         grouped_obj = d_sample.groupby(['row', 'column'])
        grouped_obj = d_sample.groupby(['column', 'row'])
        for cell in grouped_obj:
            cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
            pos_features = []
            neg_features = []
            a = cell[1][cell[1]['evaluation_label'] == 1]
            if a.empty:
                continue
            num_rows = 64
            pos_row = a[sfeatures].drop('evaluation_label',axis=1)
            negatives_filtered = cell[1][cell[1]['evaluation_label'] == -1]
            sorted_df = negatives_filtered.sort_values('lof-graph-embedding-score',ascending=False)
            sorted_df = sorted_df[sfeatures]
            if 0 in sorted_df['evaluation_label'].tolist():
                continue
            if sorted_df.empty:
                continue
            neg_list = []
            if num_rows < len(sorted_df):
                sorted_df = sorted_df[sorted_df['evaluation_label'] == -1]
                neg_list.append(sorted_df[:2])
                retrieval_score_df = sorted_df[2:].sort_values('retrieval_score',ascending=False)
                neg_list.append(retrieval_score_df[:2])
                pagerank_score_df = retrieval_score_df[2:].sort_values('pagerank', ascending=False)
                neg_list.append(pagerank_score_df[:2])
                class_count_score_df = pagerank_score_df[2:].sort_values('lof_class_count_tf_idf_score', ascending=False)
                neg_list.append(class_count_score_df[:2])
                prop_count_score_df = class_count_score_df[2:].sort_values('lof_property_count_tf_idf_score', ascending=False)
                neg_list.append(prop_count_score_df[:2])
                monge_elkan_score_df = prop_count_score_df[2:].sort_values('monge_elkan', ascending=False)
                neg_list.append(monge_elkan_score_df[:2])
                monge_elkan_alias_score_df = monge_elkan_score_df[2:].sort_values('monge_elkan_aliases', ascending=False)
                neg_list.append(monge_elkan_alias_score_df[:2])

                jaro_winkler_score_df = monge_elkan_alias_score_df[2:].sort_values('jaro_winkler', ascending=False)
                neg_list.append(jaro_winkler_score_df[:2])
                top_sample_df = jaro_winkler_score_df.sample(n=50)
                neg_list.append(top_sample_df)
                top_sample_df = pd.concat(neg_list)
                top_sample_df.drop('evaluation_label', inplace=True, axis=1)
                top_sample_arr = top_sample_df.to_numpy()

            for i in range(len(top_sample_arr)):
                neg_features.append(top_sample_arr[i])
            random.shuffle(neg_features)
            for i in range(len(top_sample_arr)):
                pos_row_sample = pos_row.sample(n=1)
                ar = pos_row_sample.to_numpy()
                for ps_ar in ar:
                    pos_features.append(ps_ar)
            positive_features_final.append(pos_features)
            negative_features_final.append(neg_features)
    print(len(positive_features_final), len(positive_features_final[37]))
    print(len(negative_features_final), len(negative_features_final[37]))
    pickle.dump(positive_features_final,open(args.pos_output,'wb'))
    pickle.dump(negative_features_final,open(args.neg_output,'wb'))


In [14]:
gen_training_data_args = Namespace(train_path=train_feature_path, pos_output=pos_output, neg_output=neg_output, 
                 min_max_scaler_path=min_max_scaler_path)
all_data = merge_files(gen_training_data_args)
scaler = compute_normalization_factor(gen_training_data_args, all_data)
generate_train_data(gen_training_data_args)


58891288_0_1117541047012405958.csv
39173938_0_7916056990138658530.csv
10579449_0_1681126353774891032.csv
33401079_0_9127583903019856402.csv
21362676_0_6854186738074119688.csv
38428277_0_1311643810102462607.csv
91959037_0_7907661684242014480.csv
20135078_0_7570343137119682530.csv
35188621_0_6058553107571275232.csv
54719588_0_8417197176086756912.csv
21245481_0_8730460088443117515.csv
71840765_0_6664391841933033844.csv
8468806_0_4382447409703007384.csv
88523363_0_8180214313099580515.csv
29414811_13_8724394428539174350.csv
99070098_0_2074872741302696997.csv
43237185_1_3636357855502246981.csv
46671561_0_6122315295162029872.csv
53989675_0_8697482470743954630.csv
25404227_0_2240631045609013057.csv
9834884_0_3871985887467090123.csv
63450419_0_8012592961815711786.csv
1438042986423_95_20150728002306-00125-ip-10-236-191-2_88435628_5.csv
22864497_0_8632623712684511496.csv
53822652_0_5767892317858575530.csv
37856682_0_6818907050314633217.csv
26310680_0_5150772059999313798.csv
29414811_12_2511524702

### Model Definition

In [15]:
# Dataset
class T2DV2Dataset(Dataset):
    def __init__(self, pos_features, neg_features):
        self.pos_features = pos_features
        self.neg_features = neg_features
    
    def __len__(self):
        return len(self.pos_features)
    
    def __getitem__(self, idx):
        return self.pos_features[idx], self.neg_features[idx]

# Model
class PairwiseNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        #original 12x24, 24x12, 12x12, 12x1
        self.fc1 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc2 = nn.Linear(2*hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)
    
    def forward(self, pos_features, neg_features):
        # Positive pass
        x = F.relu(self.fc1(pos_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        pos_out = torch.sigmoid(self.fc4(x))
        
        # Negative Pass
        x = F.relu(self.fc1(neg_features))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        neg_out = torch.sigmoid(self.fc4(x))
        
        return pos_out, neg_out
    
    def predict(self, test_feat):
        x = F.relu(self.fc1(test_feat))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        test_out = torch.sigmoid(self.fc4(x))
        return test_out

# Pairwise Loss
class PairwiseLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = 0
    
    def forward(self, pos_out, neg_out):
        distance = (1 - pos_out) + neg_out
        loss = torch.mean(torch.max(torch.tensor(0), distance))
        return loss

### Training

In [16]:
def generate_dataloader(positive_feat_path, negative_feat_path):
    pos_features = pickle.load(open(positive_feat_path, 'rb'))
    neg_features = pickle.load(open(negative_feat_path, 'rb'))

    pos_features_flatten = list(chain.from_iterable(pos_features))
    neg_features_flatten = list(chain.from_iterable(neg_features))

    train_dataset = T2DV2Dataset(pos_features_flatten, neg_features_flatten)
    train_dataloader = DataLoader(train_dataset, batch_size=64)
    return train_dataloader

def infer_scores(min_max_scaler_path, input_table_path, output_table_path, model):
    scaler = pickle.load(open(min_max_scaler_path, 'rb'))
    normalize_features = features
    for file in glob.glob(input_table_path + '/*.csv'):
        file_name = file.split('/')[-1]
        if os.path.getsize(file) == 0:
                continue
        if file_name != '52299421_0_4473286348258170200.csv':
            print(file_name)
            d_sample = pd.read_csv(file)
            grouped_obj = d_sample.groupby(['column', 'row'])
            new_df_list = []
            pred = []
            for cell in grouped_obj:
                cell[1][normalize_features] = scaler.transform(cell[1][normalize_features])
                sorted_df = cell[1].sort_values('lof-graph-embedding-score',ascending=False)[:64]
                sorted_df_features = sorted_df[normalize_features]
                new_df_list.append(sorted_df)
                arr = sorted_df_features.to_numpy()
                test_inp = []
                for a in arr:
                    test_inp.append(a)
                test_tensor = torch.tensor(test_inp).float()
                scores = model.predict(test_tensor)
                pred.extend(torch.squeeze(scores).tolist())
            test_df = pd.concat(new_df_list)
            test_df['siamese_pred'] = pred
            test_df.to_csv(f"{output_table_path}/{file_name}", index=False)

def train(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    
    else:
        device = torch.device('cpu')
    train_dataloader = generate_dataloader(args.positive_feat_path, args.negative_feat_path)
    criterion = PairwiseLoss()
    EPOCHS = args.num_epochs
    model = PairwiseNetwork(len(features)).to(device=device)
    optimizer = Adam(model.parameters(), lr=args.lr)
    top1_max_prec = 0
    for epoch in range(EPOCHS):
        train_epoch_loss = 0
        avg_loss = 0
        model.train()
        for bid, batch in tqdm(enumerate(train_dataloader), position=0, leave=True):
            positive_feat = torch.tensor(batch[0].float())
            negative_feat = torch.tensor(batch[1].float())
            optimizer.zero_grad()
            pos_out, neg_out = model(positive_feat, negative_feat)
            loss = criterion(pos_out, neg_out)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss
        avg_loss = train_epoch_loss / bid

        # Evaluation
        model.eval()
        infer_scores(args.min_max_scaler_path, args.dev_path, args.dev_output, model)
        eval_data = merge_eval_files(args.dev_output)
        res, candidate_eval_data = parse_eval_files_stats(eval_data, 'siamese_pred')
        top1_precision = res['num_tasks_with_model_score_top_one_accurate']/res['num_tasks_with_gt']
        if top1_precision > top1_max_prec:
            top1_max_prec = top1_precision
            model_save_name = 'epoch_{}_loss_{}_top1_{}.pth'.format(epoch, avg_loss, top1_max_prec)
            best_model_path = os.path.join(args.model_save_path, model_save_name)
            torch.save(model.state_dict(), best_model_path)
        
        print("Epoch {}, Avg Loss is {}, epoch top1 {}, max top1 {}".format(epoch, avg_loss, top1_precision, top1_max_prec))
    return best_model_path

In [17]:
def merge_eval_files(final_score_path):
    eval_file_names = []
    df_list = []
    for (dirpath, dirnames, filenames) in os.walk(final_score_path):
        for fn in filenames:
            if fn != '52299421_0_4473286348258170200.csv':
                if "csv" not in fn:
                    continue
                abs_fn = os.path.join(dirpath, fn)
                assert os.path.isfile(abs_fn)
                if os.path.getsize(abs_fn) == 0:
                    continue
                eval_file_names.append(abs_fn)
    
    for fn in eval_file_names:
        fid = fn.split('/')[-1].split('.csv')[0]
        df = pd.read_csv(fn)
        df['table_id'] = fid
        # df = df.fillna('')
        df_list.append(df)
    return pd.concat(df_list)

def parse_eval_files_stats(eval_data, method):
    res = {}
    candidate_eval_data = eval_data.groupby(['table_id', 'column', 'row'])['table_id'].count().reset_index(name="count")
    res['num_tasks'] = len(eval_data.groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_gt'] = len(eval_data[pd.notna(eval_data['GT_kg_id'])].groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_gt_in_candidate'] = len(eval_data[eval_data['evaluation_label'] == 1].groupby(['table_id', 'column', 'row']))
    res['num_tasks_with_singleton_candidate'] = len(candidate_eval_data[candidate_eval_data['count'] == 1].groupby(['table_id', 'column', 'row']))
    singleton_eval_data = candidate_eval_data[candidate_eval_data['count'] == 1]
    num_tasks_with_singleton_candidate_with_gt = 0
    for i, row in singleton_eval_data.iterrows():
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) == 1
        if c_e_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_singleton_candidate_with_gt += 1
    res['num_tasks_with_singleton_candidate_with_gt'] = num_tasks_with_singleton_candidate_with_gt
    num_tasks_with_graph_top_one_accurate = []
    num_tasks_with_graph_top_five_accurate = []
    num_tasks_with_graph_top_ten_accurate = []
    num_tasks_with_model_score_top_one_accurate = []
    num_tasks_with_model_score_top_five_accurate = []
    num_tasks_with_model_score_top_ten_accurate = []
    has_gt_list = []
    has_gt_in_candidate = []
    # candidate_eval_data = candidate_eval_data[:1]
    for i, row in candidate_eval_data.iterrows():
        #print(i)
        table_id, row_idx, col_idx = row['table_id'], row['row'], row['column']
        c_e_data = eval_data[(eval_data['table_id'] == table_id) & (eval_data['row'] == row_idx) & (eval_data['column'] == col_idx)]
        assert len(c_e_data) > 0
        if np.nan not in set(c_e_data['GT_kg_id']):
            has_gt_list.append(1)
        else:
            has_gt_list.append(0)
        if 1 in set(c_e_data['evaluation_label']):
            has_gt_in_candidate.append(1)
        else:
            has_gt_in_candidate.append(0)
            
        # handle graph-embedding-score
        s_data = c_e_data.sort_values(by=['lof-graph-embedding-score'], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_graph_top_one_accurate.append(1)
        else:
            num_tasks_with_graph_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_graph_top_five_accurate.append(1)
        else:
            num_tasks_with_graph_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_graph_top_ten_accurate.append(1)
        else:
            num_tasks_with_graph_top_ten_accurate.append(0)
        
        #rank on model score
        s_data = c_e_data.sort_values(by=[method], ascending=False)
        if s_data.iloc[0]['evaluation_label'] == 1:
            num_tasks_with_model_score_top_one_accurate.append(1)
        else:
            num_tasks_with_model_score_top_one_accurate.append(0)
        if 1 in set(s_data.iloc[0:5]['evaluation_label']):
            num_tasks_with_model_score_top_five_accurate.append(1)
        else:
            num_tasks_with_model_score_top_five_accurate.append(0)
        if 1 in set(s_data.iloc[0:10]['evaluation_label']):
            num_tasks_with_model_score_top_ten_accurate.append(1)
        else:
            num_tasks_with_model_score_top_ten_accurate.append(0)
            
        cf_e_data = c_e_data.copy()
        cf_e_data['lof-graph-embedding-score'] = cf_e_data['lof-graph-embedding-score'].replace(np.nan, 0)
        cf_e_data[method] = cf_e_data[method].replace(np.nan, 0)

    candidate_eval_data['lof-graph_top_one_accurate'] = num_tasks_with_graph_top_one_accurate
    candidate_eval_data['lof-graph_top_five_accurate'] = num_tasks_with_graph_top_five_accurate
    candidate_eval_data['lof-graph_top_ten_accurate'] = num_tasks_with_graph_top_five_accurate
    candidate_eval_data['model_top_one_accurate'] = num_tasks_with_model_score_top_one_accurate
    candidate_eval_data['model_top_five_accurate'] = num_tasks_with_model_score_top_five_accurate
    candidate_eval_data['model_top_ten_accurate'] = num_tasks_with_model_score_top_ten_accurate
    candidate_eval_data['has_gt'] = has_gt_list
    candidate_eval_data['has_gt_in_candidate'] = has_gt_in_candidate
    res['num_tasks_with_graph_top_one_accurate'] = sum(num_tasks_with_graph_top_one_accurate)
    res['num_tasks_with_graph_top_five_accurate'] = sum(num_tasks_with_graph_top_five_accurate)
    res['num_tasks_with_graph_top_ten_accurate'] = sum(num_tasks_with_graph_top_ten_accurate)
    res['num_tasks_with_model_score_top_one_accurate'] = sum(num_tasks_with_model_score_top_one_accurate)
    res['num_tasks_with_model_score_top_five_accurate'] = sum(num_tasks_with_model_score_top_five_accurate)
    res['num_tasks_with_model_score_top_ten_accurate'] = sum(num_tasks_with_model_score_top_ten_accurate)
    return res, candidate_eval_data

In [18]:
training_args = Namespace(num_epochs=20, lr=0.001, positive_feat_path=pos_output, negative_feat_path=neg_output,
                         dev_path=dev_feature_path, dev_output=dev_output_predictions,
                         model_save_path=model_save_path, min_max_scaler_path=min_max_scaler_path)

In [19]:
## Call Training
best_model_path = train(training_args)

  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5753it [00:11, 490.96it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
47it [00:00, 462.12it/s]

Epoch 0, Avg Loss is 0.13126103579998016, epoch top1 0.8370044052863436, max top1 0.8370044052863436


5753it [00:13, 435.45it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
45it [00:00, 448.55it/s]

Epoch 1, Avg Loss is 0.09571801126003265, epoch top1 0.8766519823788547, max top1 0.8766519823788547


5753it [00:13, 438.94it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
45it [00:00, 445.03it/s]

Epoch 2, Avg Loss is 0.09287526458501816, epoch top1 0.8428781204111601, max top1 0.8766519823788547


5753it [00:12, 455.20it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
33it [00:00, 321.23it/s]

Epoch 3, Avg Loss is 0.09020643681287766, epoch top1 0.8370044052863436, max top1 0.8766519823788547


5753it [00:15, 371.33it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


0it [00:00, ?it/s]

Epoch 4, Avg Loss is 0.0908387303352356, epoch top1 0.8281938325991189, max top1 0.8766519823788547


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
5753it [00:12, 444.62it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
48it [00:00, 474.90it/s]

Epoch 5, Avg Loss is 0.08578304201364517, epoch top1 0.8311306901615272, max top1 0.8766519823788547


5753it [00:10, 524.94it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
48it [00:00, 472.39it/s]

Epoch 6, Avg Loss is 0.08764868229627609, epoch top1 0.8502202643171806, max top1 0.8766519823788547


5753it [00:11, 483.71it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
44it [00:00, 431.27it/s]

Epoch 7, Avg Loss is 0.08847170323133469, epoch top1 0.8208516886930984, max top1 0.8766519823788547


5753it [00:14, 410.18it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
29it [00:00, 286.15it/s]

Epoch 8, Avg Loss is 0.08743066340684891, epoch top1 0.8120411160058737, max top1 0.8766519823788547


5753it [00:13, 413.09it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
47it [00:00, 461.70it/s]

Epoch 9, Avg Loss is 0.08541082590818405, epoch top1 0.7430249632892805, max top1 0.8766519823788547


5753it [00:11, 482.39it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
99it [00:00, 493.82it/s]

Epoch 10, Avg Loss is 0.08799580484628677, epoch top1 0.7797356828193832, max top1 0.8766519823788547


5753it [00:12, 454.16it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
54it [00:00, 536.15it/s]

Epoch 11, Avg Loss is 0.08625132590532303, epoch top1 0.762114537444934, max top1 0.8766519823788547


5753it [00:10, 539.99it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
47it [00:00, 464.25it/s]

Epoch 12, Avg Loss is 0.0853491723537445, epoch top1 0.788546255506608, max top1 0.8766519823788547


5753it [00:11, 508.78it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
44it [00:00, 433.15it/s]

Epoch 13, Avg Loss is 0.08756887167692184, epoch top1 0.7400881057268722, max top1 0.8766519823788547


5753it [00:13, 423.24it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
47it [00:00, 468.77it/s]

Epoch 14, Avg Loss is 0.08279433101415634, epoch top1 0.7415565345080763, max top1 0.8766519823788547


5753it [00:14, 403.79it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
40it [00:00, 397.35it/s]

Epoch 15, Avg Loss is 0.08727917820215225, epoch top1 0.6960352422907489, max top1 0.8766519823788547


5753it [00:13, 437.05it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
46it [00:00, 454.74it/s]

Epoch 16, Avg Loss is 0.08266548812389374, epoch top1 0.73568281938326, max top1 0.8766519823788547


5753it [00:12, 465.87it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
47it [00:00, 466.38it/s]

Epoch 17, Avg Loss is 0.08023349940776825, epoch top1 0.7944199706314243, max top1 0.8766519823788547


5753it [00:13, 440.91it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


  positive_feat = torch.tensor(batch[0].float())
  negative_feat = torch.tensor(batch[1].float())
40it [00:00, 395.84it/s]

Epoch 18, Avg Loss is 0.09169851243495941, epoch top1 0.7474302496328928, max top1 0.8766519823788547


5753it [00:12, 458.33it/s]


39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv
Epoch 19, Avg Loss is 0.08621712774038315, epoch top1 0.7900146842878121, max top1 0.8766519823788547


## Dev Prediction

In [20]:
def dev_prediction(dev_feature_path, dev_predictions_top_k, saved_model, output_column, min_max_scaler_path, k=5):
    for file in glob.glob(dev_feature_path + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
        # location where the output generated by the predictions wil be stored.
        dev_output = f"{dev_predictions_top_k}/{filename}"
        !tl predict-using-model $file -o $output_column \
            --features {",".join(features)} \
            --ranking-model $saved_model \
            --normalization-factor $min_max_scaler_path \
            / get-kg-links -c $output_column -k $k --k-rows \
            > $dev_output

In [21]:
def add_color(dev_predictions_top_k, dev_colorized_path, score_column, k=5):
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_color_file = f"{dev_colorized_path}/{filename.strip('.csv')}.xlsx"
        !tl add-color $file -c $score_column -k $k --output $dev_color_file

In [22]:
def compute_metrics(dev_predictions_top_k, dev_predictions_metrics, score_column, k=5):
    df_list = []
    for file in glob.glob(dev_predictions_top_k + '/*.csv'):
        filename = file.split("/")[-1]
        print(filename)
        if os.path.getsize(file) == 0:
                    continue
                
        dev_metrics_file = f"{dev_predictions_metrics}/{filename}"
        !tl metrics $file -k $k -c $score_column --tag $filename> $dev_metrics_file
        df_list.append(pd.read_csv(dev_metrics_file))
    return pd.concat(df_list)

In [23]:
dev_prediction(dev_feature_path, dev_predictions_top_k, best_model_path, final_score_column, min_max_scaler_path)

39759273_0_1427898308030295194.csv
45073662_0_3179937335063201739.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
52299421_0_4473286348258170200.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


In [24]:
metrics_df = compute_metrics(dev_predictions_top_k, dev_metrics_path, final_score_column, k=1)

39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
52299421_0_4473286348258170200.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv


In [25]:
metrics_df

Unnamed: 0,k,f1,precision,recall,tag
0,1,0.959583,0.98,0.94,39759273_0_1427898308030295194.csv
0,1,0.772727,0.772727,0.772727,29414811_2_4773219892816395776.csv
0,1,0.887755,0.887755,0.887755,84575189_0_6365692015941409487.csv
0,1,0.95,0.95,0.95,14380604_4_3329235705746762392.csv
0,1,0.846154,0.846154,0.846154,52299421_0_4473286348258170200.csv
0,1,0.946429,0.946429,0.946429,50270082_0_444360818941411589.csv
0,1,0.840909,0.840909,0.840909,28086084_0_3127660530989916727.csv
0,1,0.943396,0.943396,0.943396,14067031_0_559833072073397908.csv


In [26]:
add_color(dev_predictions_top_k, dev_colorized_path, final_score_column)

39759273_0_1427898308030295194.csv
29414811_2_4773219892816395776.csv
84575189_0_6365692015941409487.csv
14380604_4_3329235705746762392.csv
52299421_0_4473286348258170200.csv
50270082_0_444360818941411589.csv
28086084_0_3127660530989916727.csv
14067031_0_559833072073397908.csv
