## Train weighted LR model on t2dv2 data
- Features used in this model: 'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'monge_elkan_aliases', 'des_cont_jaccard_normalized'
- The trained model is saved to disk so that it can be loaded later
- This part is not included in the actual table linker pipeline

## Dataset 
- Train: https://drive.google.com/drive/u/2/folders/1bUDGEI4gNEw6VY4Bquy0zy10ga-7Mb2M 
- Dev: https://drive.google.com/drive/u/2/folders/1RgxmFI7QGVh53Y_2OwvBk1Uyzu0Y2ftX

In [31]:
import numpy as np
import pandas as pd
from glob import glob
import os
import pickle

In [32]:
# Root of the SemTab2020 data tree. The original location stays as the default;
# set the TABLE_LINKER_DATA_DIR environment variable to run on another machine.
HOME_DIR = os.environ.get(
    'TABLE_LINKER_DATA_DIR',
    '/Users/amandeep/Github/table-linker/data/SemTab2020',
)

# Training split: candidate CSVs (input), feature CSVs (output) and the pickled model.
training_candidates_dir = f'{HOME_DIR}/table-linker/train1-output/1_0/candidates'
training_features_dir = f'{HOME_DIR}/table-linker/train1-output/1_0/features-voting/train'
trained_model_file = f'{HOME_DIR}/table-linker/train1-output/1_0/weighted_lr.pkl'

# Dev split: candidate CSVs (input) and feature CSVs (output).
dev_candidates_dir = f'{HOME_DIR}/table-linker/dev-output/1_0/candidates'
dev_features_dir = f'{HOME_DIR}/table-linker/dev-output/1_0/features-voting/dev'

In [33]:
# Create the feature output directories, including missing parents; existing
# directories are left alone. Using os.makedirs instead of `!mkdir -p $var`
# keeps paths containing spaces or shell metacharacters from being split or
# interpreted by the shell.
os.makedirs(training_features_dir, exist_ok=True)
os.makedirs(dev_features_dir, exist_ok=True)

In [34]:
# helper method we use
def clean_dataset(df, features=None):
    """Drop rows whose feature values are NaN or +/-infinity.

    Args:
        df: pandas DataFrame containing at least the columns named in
            ``features``.
        features: optional list of column names to check. When omitted, the
            four columns this helper has always checked are used:
            'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan' and
            'des_cont_jaccard_normalized'. Pass the full training feature
            list to validate additional columns.

    Returns:
        The rows of ``df`` (all columns kept, original index kept) whose
        checked columns contain neither NaN nor +/-inf.

    Raises:
        AssertionError: if ``df`` is not a pandas DataFrame.
    """
    # Validate first so a wrong argument fails with the intended message
    # instead of whatever len() happens to do with it.
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    if features is None:
        features = ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']

    print(f"length of df: {len(df)}")
    # axis is passed by keyword: positional arguments to DataFrame.any are
    # deprecated in pandas 1.x and rejected by pandas 2.x.
    has_invalid_value = df.loc[:, features].isin([np.nan, np.inf, -np.inf]).any(axis=1)
    indices_to_keep = ~has_invalid_value
    print(f"length of indices_to_keep: {sum(indices_to_keep)}")
    return df[indices_to_keep]

# merge all eval files in one df
def merge_df(file_names: list):
    """Read every CSV in ``file_names`` and stack them into one DataFrame.

    Each frame gets a ``table_id`` column holding the last '/'-separated
    component of its path, cut at the first occurrence of '.csv'. Frames are
    concatenated in the order given and keep their own row indices.
    """
    def _load_with_table_id(path):
        # Derive the id from the file name, then tag every row with it.
        table_id = path.rsplit('/', 1)[-1].partition('.csv')[0]
        frame = pd.read_csv(path)
        frame['table_id'] = table_id
        return frame

    return pd.concat([_load_with_table_id(path) for path in file_names])

#### Generate training data with the 5 required features

In [60]:
# Feature columns given to the classifier; the same list (and order) is used
# to select columns for both training and scoring.
training_features = [
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'monge_elkan_aliases',
    'des_cont_jaccard_normalized',
]

In [36]:
def compute_features(candidates_path, features_path):
        """Run the table-linker (`tl`) feature pipeline over a directory of candidate CSVs.

        Every `*.csv` directly under ``candidates_path`` that is not zero bytes
        is piped through the `tl` commands below, and the result is written to
        ``features_path`` under the same file name. One progress line is
        printed per processed file.

        Must run under IPython/Jupyter: the pipeline is launched with the `!`
        shell escape, which is not plain Python.

        Args:
            candidates_path: directory holding the candidate CSV files.
            features_path: directory the feature CSVs are written to. It is
                not created here.

        Raises:
            AssertionError: if a globbed path is not a regular file, or if the
                expected output file does not exist after the pipeline ran.
        """
        file_list = glob(candidates_path + '/*.csv')
        # The total includes zero-byte files that are skipped below, so the
        # printed "i of n" counter can have gaps.
        files_num = len(file_list)
        for i, file in enumerate(file_list):
            assert os.path.isfile(file)
            # Empty candidate files are skipped; no feature file is written for them.
            if os.path.getsize(file) == 0:
                continue
            filename = file.split("/")[-1]
            
            print(f"{filename}: {i+1} of {files_num}")
            
            feature_f = f'{features_path}/{filename}'
            
            # IPython shell escape: $file and $feature_f are filled in from the
            # Python variables above. The three string-similarity steps name their
            # output columns via -o (monge_elkan, monge_elkan_aliases,
            # des_cont_jaccard); normalize-scores is then applied to des_cont_jaccard.
            # NOTE(review): the paths are interpolated unquoted, so a path with
            # spaces would presumably be split by the shell -- confirm.
            !tl smallest-qnode-number $file \
            / align-page-rank \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
            / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -c label_clean kg_aliases -o monge_elkan_aliases \
            / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
            / normalize-scores -c des_cont_jaccard \
            > $feature_f
    
    
            # NOTE(review): the `>` redirection presumably creates the output file
            # even when `tl` fails, so this check may not catch a failed pipeline
            # (the result could be an empty file) -- confirm.
            assert os.path.isfile(feature_f), f"Something wrong with training feature result: {i + 1}th file: {filename}"

In [37]:
# Generate the feature files for the training split.
compute_features(
    candidates_path=training_candidates_dir,
    features_path=training_features_dir,
)

ZX8GERJC.csv: 1 of 100
8ZD74BO9.csv: 2 of 100
W0ZNF869.csv: 3 of 100
AM1UELOJ.csv: 4 of 100
5IXA0RAI.csv: 5 of 100
8EFC5XVR.csv: 6 of 100
DPUA686B.csv: 7 of 100
UMMA6HQO.csv: 8 of 100
ERPSWFMM.csv: 9 of 100
ZDAZ5PQ5.csv: 10 of 100
XF412HIL.csv: 11 of 100
BQ36GYQE.csv: 12 of 100
CKRLO13X.csv: 13 of 100
L5LFLQIN.csv: 14 of 100
J6SSKET3.csv: 15 of 100
T8SL8HGK.csv: 16 of 100
JUFYSXYP.csv: 17 of 100
CYYO69JB.csv: 18 of 100
YMHERMQV.csv: 19 of 100
6XCOGRWM.csv: 20 of 100
WNKF57RH.csv: 21 of 100
OMJX8TT6.csv: 22 of 100
IUBTQXYO.csv: 23 of 100
0XXGVKA8.csv: 24 of 100
57681CMM.csv: 25 of 100
VE3T1LHT.csv: 26 of 100
4KGRZFTI.csv: 27 of 100
UU8Q91MG.csv: 28 of 100
75MLA4XJ.csv: 29 of 100
U5L8U1OL.csv: 30 of 100
QDJ86U5I.csv: 31 of 100
384SR1N3.csv: 32 of 100
PG0TP6O0.csv: 33 of 100
VFVMRNF9.csv: 34 of 100
CCCNRESE.csv: 35 of 100
7ZQB5C2O.csv: 36 of 100
LTZQIN2R.csv: 37 of 100
QID3PSI3.csv: 38 of 100
NXBVTACX.csv: 39 of 100
DKRE7U28.csv: 40 of 100
U8BHYWZ7.csv: 41 of 100
2389HYHH.csv: 42 of 100
2

In [40]:
# Sanity check: number of training feature CSVs on disk. Counting with glob
# (the same pattern used to load the files) avoids interpolating an unquoted
# path into a shell command and ignores non-CSV entries in the directory.
len(glob(f'{training_features_dir}/*.csv'))

     100


In [43]:
# Load every training feature CSV into one frame and preview the first rows
all_data = merge_df(glob(f'{training_features_dir}/*.csv'))
all_data.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,GT_kg_id,GT_kg_label,evaluation_label,smallest_qnode_number,aligned_pagerank,monge_elkan,monge_elkan_aliases,des_cont_jaccard,des_cont_jaccard_normalized,table_id
0,0,0,Mauretania Casariensis,Fossatum Africae|ancient Rome,Mauretania Casariensis,,,,exact-match,,...,Q734505,Mauretania Caesariensis,-1,0,0.0,0.0,0.0,0.0,0.0,ZX8GERJC
1,0,0,Mauretania Casariensis,Fossatum Africae|ancient Rome,Mauretania Casariensis,Q734505,Mauretania Caesariensis|Maurétanie césarienne|...,Mauretanie cesarienne,fuzzy-augmented,province of the Roman Empire in northwest Africa,...,Q734505,Mauretania Caesariensis,1,0,0.0,0.964646,0.821775,0.0,0.0,ZX8GERJC
2,0,0,Mauretania Casariensis,Fossatum Africae|ancient Rome,Mauretania Casariensis,Q309272,Mauritânia romana|Mauritania|Mauretania|Maurét...,Mauritania romana|Konigreich Mauretanien|Maure...,fuzzy-augmented,Kingdom in the ancient Maghreb,...,Q734505,Mauretania Caesariensis,-1,0,0.0,0.90101,0.90101,0.166667,0.333333,ZX8GERJC
3,0,0,Mauretania Casariensis,Fossatum Africae|ancient Rome,Mauretania Casariensis,Q1427468,Flavie Césarienne|Flávia Cesariense|Flavia Cae...,Flavia Cesariense|Flavie Cesarienne|Flavia Cae...,fuzzy-augmented,Roman province,...,Q734505,Mauretania Caesariensis,-1,0,0.0,0.739646,0.739646,0.0,0.0,ZX8GERJC
4,0,0,Mauretania Casariensis,Fossatum Africae|ancient Rome,Mauretania Casariensis,Q1034889,diocesi di Capra|Capra,,fuzzy-augmented,,...,Q734505,Mauretania Caesariensis,-1,0,0.0,0.636742,0.0,0.0,0.0,ZX8GERJC


In [44]:
# Total number of candidate rows across all training tables
print(all_data.shape[0])

514782


#### Prepare features for training

In [47]:
# Keep only the model features plus the label column
eval_features = [*training_features, 'evaluation_label']
training_data = all_data.loc[:, eval_features]
# Binarize the label: anything that is not 1 is treated as a negative (-1)
not_positive = training_data['evaluation_label'].astype(int) != 1
training_data.loc[not_positive, 'evaluation_label'] = -1
# Remove rows with NaN / infinite feature values
training_data = clean_dataset(training_data)

length of df: 514782
length of df: 514782 after dropping na
length of indices_to_keep: 514782


In [48]:
# Balance the positive / negative cases: keep every positive candidate and a
# random subset of the negatives.
N_NEGATIVE_SAMPLES = 9500  # number of negative rows to draw
SAMPLING_SEED = 42  # fixed so Restart & Run All draws the same negatives

positive_rows = training_data[training_data['evaluation_label'] == 1]
negative_rows = training_data[training_data['evaluation_label'] != 1]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. Capping n keeps sample() from raising
# when fewer negatives than requested are available.
balanced_training_data = pd.concat([
    positive_rows,
    negative_rows.sample(
        n=min(N_NEGATIVE_SAMPLES, len(negative_rows)),
        random_state=SAMPLING_SEED,
    ),
])

In [61]:
# Split the balanced sample into the model inputs and the target column.
# The labels are kept as a one-column DataFrame.
label_columns = ['evaluation_label']
training_features_data = balanced_training_data.loc[:, training_features]
training_labels = balanced_training_data.loc[:, label_columns]

#### Train classifier using training data

In [62]:
# train model
from sklearn.linear_model import LogisticRegression

# Class weights: positives (label 1) count 0.7, negatives (label -1) count 0.3.
lr = LogisticRegression(class_weight={-1:0.3, 1:0.7})
# Train our classifier.
# scikit-learn expects y with shape (n_samples,); training_labels is a
# one-column DataFrame, which triggers a DataConversionWarning, so it is
# flattened to 1-D before fitting.
model = lr.fit(training_features_data, np.ravel(training_labels))

  return f(*args, **kwargs)


In [63]:
# Persist the fitted classifier so it can be reloaded without retraining
with open(trained_model_file, 'wb') as model_out:
    pickle.dump(model, model_out)

In [64]:
# Read the classifier back from trained_model_file.
# Unpickling can execute arbitrary code, so only load model files you trust.
with open(trained_model_file, 'rb') as model_in:
    model_loaded = pickle.load(model_in)

#### Test using dev dataset

In [65]:
# Generate the feature files for the dev split.
compute_features(
    candidates_path=dev_candidates_dir,
    features_path=dev_features_dir,
)

V1MLK9TP.csv: 1 of 49
RCL5LZUM.csv: 2 of 49
FU7P6GOF.csv: 3 of 49
3OX1PGQD.csv: 4 of 49
XXYFPD8I.csv: 5 of 49
093BPOP2.csv: 6 of 49
JYC6D9MU.csv: 7 of 49
DBH21J5D.csv: 8 of 49
OYFD9B7F.csv: 9 of 49
6T4QNE30.csv: 10 of 49
TLAL3B63.csv: 11 of 49
JTWZYYBU.csv: 12 of 49
MBCHQ4TC.csv: 13 of 49
NE9XVY42.csv: 14 of 49
VNSUNG1M.csv: 15 of 49
RWEJTWBK.csv: 16 of 49
NBYU3S9Y.csv: 17 of 49
VEKB4XZC.csv: 18 of 49
PMSAYLPC.csv: 19 of 49
9V2P69CI.csv: 20 of 49
YV0V8O3A.csv: 21 of 49
DKR353LM.csv: 22 of 49
4HYT5D2J.csv: 23 of 49
J5WTHYK6.csv: 24 of 49
RF6RSJ5W.csv: 25 of 49
VB0WL533.csv: 26 of 49
RPS3P53T.csv: 27 of 49
EJMFROMS.csv: 28 of 49
JAV53EZQ.csv: 29 of 49
U4430LA9.csv: 30 of 49
VADKVBSJ.csv: 31 of 49
8N4ZTXDV.csv: 32 of 49
54SEC9F3.csv: 33 of 49
U7PSL9LZ.csv: 34 of 49
PT0GTLGV.csv: 35 of 49
SYRX0I75.csv: 36 of 49
CR0Q0GDE.csv: 37 of 49
OEMDOUBY.csv: 38 of 49
1UEUW7EP.csv: 39 of 49
FDOC6GMJ.csv: 40 of 49
BOXTVP7V.csv: 41 of 49
FV3PPNAQ.csv: 42 of 49
KL3RUA2V.csv: 43 of 49
E5SHJSQZ.csv: 44 of 

In [66]:
# Collect the non-empty dev feature CSVs and the table id of each one.
eval_file_names = []
eval_file_ids = []

for (dirpath, dirnames, filenames) in os.walk(dev_features_dir):
    for fn in filenames:
        # Match on the extension: a substring test would also accept names such
        # as 'csv_notes.txt' or 'table.csv.bak', which are not feature files.
        if not fn.endswith('.csv'):
            continue
        abs_fn = dirpath + '/'+ fn
        assert os.path.isfile(abs_fn)
        # Zero-byte files hold no rows (pd.read_csv raises on them); leave them out.
        if os.path.getsize(abs_fn) == 0:
            continue
        eval_file_names.append(abs_fn)
        # Same derivation merge_df uses for table_id, so lookups by id line up.
        eval_file_ids.append(fn.split('.csv')[0])
len(eval_file_names), eval_file_ids[:3]

(49, ['V1MLK9TP', 'RCL5LZUM', 'FU7P6GOF'])

In [67]:
# Display the feature columns (and their order) that are selected for training and scoring
training_features

['aligned_pagerank',
 'smallest_qnode_number',
 'monge_elkan',
 'monge_elkan_aliases',
 'des_cont_jaccard_normalized']

In [68]:
# Build the dev-set frame: merge the feature files, binarize the label, drop invalid rows
testing_data = merge_df(eval_file_names)
not_positive_dev = testing_data['evaluation_label'].astype(int) != 1
testing_data.loc[not_positive_dev, 'evaluation_label'] = -1
testing_data = clean_dataset(testing_data)

# Feature matrix and one-column label frame for the whole dev set
label_only = ['evaluation_label']
testing_features = testing_data.loc[:, training_features]
testing_labels = testing_data.loc[:, label_only]

length of df: 237404
length of df: 237404 after dropping na
length of indices_to_keep: 237404


In [69]:
# Score each dev table separately; res maps table id -> that table's rows
# extended with the predicted label and the two class probabilities.
res = {}
for tid in eval_file_ids:
    test_data = testing_data[testing_data['table_id'] == tid]
    test_features = test_data.loc[:, training_features]
    test_labels = test_data.loc[:, ['evaluation_label']]

    preds = model_loaded.predict(test_features)
    assert len(preds) ==  len(test_labels)

    # predict_proba yields one column per class, ordered as model_loaded.classes_;
    # column 0 goes to prob_0 and column 1 to prob_1.
    prob = model_loaded.predict_proba(test_features)

    # assign() returns a new frame, leaving the slice of testing_data untouched
    test_df = test_data.assign(pred=preds, prob_0=prob[:, 0], prob_1=prob[:, 1])
    res[tid] = test_df

In [70]:
# Inspect the scored candidates (pred, prob_0, prob_1 columns) for one dev table
res['IYEDUUIU']

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,smallest_qnode_number,aligned_pagerank,monge_elkan,monge_elkan_aliases,des_cont_jaccard,des_cont_jaccard_normalized,table_id,pred,prob_0,prob_1
0,0,0,Gray short-tailed opossum,15|5.1|Short-tailed opossum,Gray short-tailed opossum,Q194472,Monodelphis domestica|Gray short-tailed opossu...,Opossum gris|Opossum-musaraigne commun|Opossum...,exact-match,species of mammal,...,0,4.704356e-09,1.000000,0.768519,0.0,0.0,IYEDUUIU,1,0.105729,0.894271
1,0,0,Gray short-tailed opossum,15|5.1|Short-tailed opossum,Gray short-tailed opossum,Q194472,Haus-Spitzmausbeutelratte|Gray short-tailed op...,Opossum sud-americain|Catita|Opossum-soricidé ...,fuzzy-augmented,species of mammal,...,0,0.000000e+00,1.000000,0.768519,0.0,0.0,IYEDUUIU,1,0.105729,0.894271
2,0,0,Gray short-tailed opossum,15|5.1|Short-tailed opossum,Gray short-tailed opossum,Q1425981,Monodelphis|Short-tailed opossum|Spitzmausbeut...,Monodelphis|Opossum à queue courte|Spitzmausbe...,fuzzy-augmented,genus of mammals,...,0,0.000000e+00,0.907407,0.668210,0.0,0.0,IYEDUUIU,1,0.409124,0.590876
3,0,0,Gray short-tailed opossum,15|5.1|Short-tailed opossum,Gray short-tailed opossum,Q196005,Sepia short-tailed opossum|Monodelphis adusta,Monodelphis adusta,fuzzy-augmented,species of mammal,...,0,0.000000e+00,0.827778,0.525583,0.0,0.0,IYEDUUIU,-1,0.768784,0.231216
4,0,0,Gray short-tailed opossum,15|5.1|Short-tailed opossum,Gray short-tailed opossum,Q21314200,Monodelphis gardneri|Gardners Spitzmausbeutelr...,Monodelphis gardneri|Gardner\s short-tailed op...,fuzzy-augmented,species of mammal,...,0,0.000000e+00,0.548313,0.861111,0.0,0.0,IYEDUUIU,-1,0.997482,0.002518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,3,19,American short-tailed shrew,Northern Short-tailed Shrew|20|2.2,American short-tailed shrew,Q49459301,Short-tailed Swallowtail|Papilio brevicauda br...,,fuzzy-augmented,,...,0,0.000000e+00,0.741667,0.000000,0.0,0.0,IYEDUUIU,-1,0.961609,0.038391
3773,3,19,American short-tailed shrew,Northern Short-tailed Shrew|20|2.2,American short-tailed shrew,Q1344842,Python curtus|Píton-sangüíneo|Python malais|Su...,Piton-sanguineo|Python a queue courte|Python c...,fuzzy-augmented,species of reptile,...,0,0.000000e+00,0.504167,0.598900,0.0,0.0,IYEDUUIU,-1,0.999099,0.000901
3774,3,19,American short-tailed shrew,Northern Short-tailed Shrew|20|2.2,American short-tailed shrew,Q1515902,Pestratte|Short-tailed bandicoot rat|Rat à que...,Rat a queue courte|Nesokia indica,fuzzy-augmented,species of mammal,...,0,0.000000e+00,0.706944,0.600637,0.0,0.0,IYEDUUIU,-1,0.965417,0.034583
3775,3,19,American short-tailed shrew,Northern Short-tailed Shrew|20|2.2,American short-tailed shrew,Q893958,Borneo-Kurzschwanzpython|Python breitensteini|...,Python breitensteini|Python curtus breitensteini,fuzzy-augmented,species of reptile,...,0,0.000000e+00,0.513319,0.513319,0.0,0.0,IYEDUUIU,-1,0.999012,0.000988
