In [1]:
import pandas as pd
import numpy as np
from time import time
from lr.text_processing.util import pre_process_nli_df
from lr.text_processing.transformations.wordnetsyn import p_h_transformation_noun_minimal_edition
from lr.text_processing.transformations.wordnetsyn import parallelize, path_base_transormation

### Loading data

In [2]:
# Input CSVs for the two splits, paired with the files that the transformed
# versions of each split are written to later in the notebook.
train_path, train_path_mod = "data/toy/train.csv", "data/toy/train_mod.csv"
dev_path, dev_path_mod = "data/toy/dev.csv", "data/toy/dev_mod.csv"

# Read both splits first, then pre-process each one.
train, dev = pd.read_csv(train_path), pd.read_csv(dev_path)

# The return value of pre_process_nli_df is discarded, so it presumably edits
# the frame in place -- TODO confirm against lr.text_processing.util.
pre_process_nli_df(train)
pre_process_nli_df(dev)

### Transforming data

In [3]:
n_cores = 2


def transformation(df):
    """Run ``p_h_transformation_noun_minimal_edition`` over ``df`` via ``parallelize``.

    A named function is used instead of a lambda bound to a name so that
    tracebacks and reprs show ``transformation`` rather than ``<lambda>``.

    Parameters
    ----------
    df
        Frame to transform; in this notebook the pre-processed ``train`` and
        ``dev`` splits are passed in.

    Returns
    -------
    Whatever ``parallelize`` returns. The notebook later calls ``.to_csv`` on
    the result, so it is presumably a DataFrame -- TODO confirm.

    Notes
    -----
    ``n_cores`` is read from the notebook namespace at call time, so changing
    it in a later cell affects subsequent calls. Presumably it is the number
    of worker processes used by ``parallelize`` -- verify in
    ``lr.text_processing.transformations.wordnetsyn``.
    """
    return parallelize(df,
                       func=p_h_transformation_noun_minimal_edition,
                       n_cores=n_cores)

In [4]:
# Transform the dev split and report how long it took (wall-clock seconds).
init = time()
dev_t = transformation(dev)
elapsed = time() - init
print(elapsed)

2.738999366760254


In [5]:
# Transform the train split and report how long it took (wall-clock seconds).
init = time()
train_t = transformation(train)
elapsed = time() - init
print(elapsed)

5.955520391464233


### Saving transformations

In [6]:
# Persist the transformed splits (dev first, then train) without the index
# column; these files are the ones referenced by dev_path_mod / train_path_mod.
for frame, destination in ((dev_t, dev_path_mod), (train_t, train_path_mod)):
    frame.to_csv(destination, index=False)

### Running M tests

In [7]:
from lr.training.util import get_ternary_label
from lr.stats.h_testing import DGP
from lr.stats.h_testing import  LIMts_test
from lr.training.language_representation import Tfidf
from lr.models.logistic_regression import LRWrapper

In [8]:
def train_trans(df):
    """Return ``path_base_transormation(df, train_path_mod)``.

    ``train_path_mod`` is the CSV the transformed train split was saved to
    earlier in the notebook. How ``df`` and the saved file are combined is
    decided by ``path_base_transormation`` and is not visible here --
    presumably the transformed rows are looked up from the file; verify.

    Defined with ``def`` rather than as a lambda bound to a name so the
    callable carries a meaningful ``__name__``.
    """
    return path_base_transormation(df, train_path_mod)


def dev_trans(df):
    """Return ``path_base_transormation(df, dev_path_mod)``.

    Counterpart of ``train_trans`` for the dev split; ``dev_path_mod`` is the
    CSV the transformed dev split was saved to earlier in the notebook.
    """
    return path_base_transormation(df, dev_path_mod)

In [9]:
# Stand-alone settings; rho is passed to LIMts_test directly, the other two
# are packed into the hyperparameter mapping below.
rho = 0.76
max_features = 500
label_translation = get_ternary_label

# Same mapping as a dict literal, spelled with keyword arguments.
# penalty / C / solver look like logistic-regression options that LRWrapper
# forwards to its underlying estimator -- presumably; confirm in
# lr.models.logistic_regression.
hyperparams = dict(
    RepresentationFunction=Tfidf,
    max_features=max_features,
    label_translation=label_translation,
    penalty="l2",
    C=1,
    solver="lbfgs",
)

In [13]:
# M, E and S are forwarded unchanged to LIMts_test. In the recorded output the
# verbose log shows m running from 1 to 5 with e = 1, and the result table has
# columns boot_t_1 .. boot_t_1000, which lines up with these three values.
M = 5
E = 1
S = 1000

# NOTE(review): no random seed is set in this notebook; if LIMts_test draws
# random samples, the numbers below will differ between runs -- confirm.
tests1 = LIMts_test(
    train=train,
    dev=dev,
    train_transformation=train_trans,
    dev_transformation=dev_trans,
    rho=rho,
    Model=LRWrapper,
    hyperparams=hyperparams,
    M=M,
    E=E,
    S=S,
    verbose=True,
)

m = 1 | e = 1 | time: 2.31 sec
m = 2 | e = 1 | time: 2.33 sec
m = 3 | e = 1 | time: 2.23 sec
m = 4 | e = 1 | time: 2.24 sec
m = 5 | e = 1 | time: 2.26 sec


In [14]:
# Preview only the first three result rows; the table is very wide
# (one boot_t_* column per bootstrap statistic).
tests1.head(n=3)

Unnamed: 0,m,e,validation_accuracy,transformed_validation_accuracy,observable_t_stats,p_value,transformation_time,training_time,test_time,boot_t_1,...,boot_t_991,boot_t_992,boot_t_993,boot_t_994,boot_t_995,boot_t_996,boot_t_997,boot_t_998,boot_t_999,boot_t_1000
0,1,1,0.265,0.23,1.046351,0.278,0.010079,0.063981,2.314187,-2.948839,...,1.913524,1.096497,1.516196,0.64957,-0.603572,1.053361,0.823387,1.783315,0.590281,1.158569
1,2,1,0.345,0.2,4.068998,0.0,0.005354,0.050927,2.330168,0.356147,...,0.0,1.922685,-0.566139,0.397517,0.262658,-0.235735,0.617802,-0.142864,-0.577832,-0.134846
2,3,1,0.245,0.21,1.023727,0.286,0.004595,0.054964,2.231004,2.002252,...,-0.982559,-1.318478,1.614943,-0.277403,-0.316307,0.428768,-0.457735,0.142864,-0.294948,-0.137367
