In [1]:
from lr.models.transformers.processor import clean_df
from lr.models.transformers.train_functions import set_seed
from lr.models.transformers.BertWrapper import BertWrapper
from lr.text_processing.transformations.wordnet import path_base_transformation
from lr.stats.h_testing import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from time import time
import shutil
import os

## Variables

In [2]:
# Name of the dataset; it selects the ``data/<folder>/`` directory read below.
folder = "snli"

# Label recorded in ``hyperparams`` for the transformation under test.
transformation_name = "wordnet sin tranformation p and h"

# Directory prefix under which the result CSV is written at the end.
result_folder = "results/snli/bert/sin_p_h/"

## Data

In [3]:
# Load each split, keep only its first 1000 rows, and pass the truncated
# frame through ``clean_df``.
train = clean_df(
    pd.read_csv("data/{}/train.csv".format(folder)).head(1000),
    n_cores=8,
)
dev_o = clean_df(
    pd.read_csv("data/{}/dev.csv".format(folder)).head(1000),
    n_cores=8,
)

## Transformations

In [4]:
# CSV paths handed to ``path_base_transformation`` for each split.
train_path_mod = "data/{}/train_p_h_syn_noun.csv".format(folder)
dev_path_mod = "data/{}/dev_p_h_syn_noun.csv".format(folder)


def train_trans(df):
    """Return ``path_base_transformation(df, train_path_mod)``.

    NOTE(review): presumably the CSV at ``train_path_mod`` holds the
    transformed version of the train split -- confirm against
    ``lr.text_processing.transformations.wordnet``.
    """
    return path_base_transformation(df, train_path_mod)


def dev_trans(df):
    """Return ``path_base_transformation(df, dev_path_mod)``.

    Same as ``train_trans`` but bound to the dev-split CSV path.
    """
    return path_base_transformation(df, dev_path_mod)

## Validation set transformation

In [5]:
# Transformed counterpart of the dev set; passed to the test as ``df_dev_t``.
dev_t = dev_trans(dev_o)

## Hyperparams

In [6]:
# Single configuration dict handed to ``h_test_transformer`` (and read back
# below for ``dgp_seed``, ``rho`` and ``random_state``).
# NOTE(review): the small ``max_steps`` / ``save_steps`` values and the
# "bert_draft" output directory suggest a quick draft run, not a full
# training configuration -- confirm before reusing for real experiments.
hyperparams = {"local_rank": -1,
               "max_seq_length": 200,
               "overwrite_cache": False,
               "num_train_epochs": 1.0,
               "per_gpu_train_batch_size": 32,
               "per_gpu_eval_batch_size": 50,
               "gradient_accumulation_steps": 1,
               "learning_rate": 5e-5,
               "weight_decay": 0.0,
               "adam_epsilon": 1e-8,
               "max_grad_norm": 1.0,
               "max_steps": 4,
               "warmup_steps": 0,
               "save_steps": 3,
               "no_cuda": False,
               "n_gpu": 1,
               # Metadata about the experiment; values come from the
               # variables cell.
               "data_set_name": folder,
               "transformation_name": transformation_name,
               "number_of_simulations": 1000,
               "rho": 0.7,
               "model_name_or_path": "bert",
               "output_dir": "bert_draft",
               # Seeds: ``dgp_seed`` is used for the DGP sampling step,
               # ``random_state`` only appears in the result file name here.
               "random_state": 42,
               "dgp_seed": 123,
               "fp16": False,
               # Mixed-precision optimisation levels are spelled with the
               # letter "O" ("O0".."O3"), not the digit zero. The previous
               # value "01" would be rejected as soon as ``fp16`` is enabled.
               "fp16_opt_level": "O1",
               "device": "cpu",
               "verbose": True,
               "model_type": "bert",
               "pad_on_left": False,
               "pad_token": 0,
               "n_cores": 7,
               "eval_sample_size": 100,
               "pad_token_segment_id": 0,
               "mask_padding_with_zero": True,
               # Prefix for cached feature files for this dataset.
               "base_path": "data/{}/cached_".format(folder)}

## Sampling one training set from the DGP

In [7]:
# Pull the sampling settings out of the shared configuration dict.
rho = hyperparams["rho"]
dgp_seed = hyperparams["dgp_seed"]
rs = hyperparams["random_state"]

# Seed first so that the draw made by ``sample_transform`` is repeatable.
# NOTE(review): the second argument (0) is presumably the GPU count --
# confirm against ``set_seed`` in ``lr.models.transformers.train_functions``.
set_seed(dgp_seed, 0)
dgp = DGP(train, train_trans, rho=rho)
train_ = dgp.sample_transform()

## Testing

In [8]:
# Run the test on the sampled training set, using the original and the
# transformed dev sets; the result is inspected and saved below.
test_results = h_test_transformer(
    df_train=train_,
    df_dev=dev_o,
    df_dev_t=dev_t,
    ModelWrapper=BertWrapper,
    hyperparams=hyperparams,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/29 [00:00<?, ?it/s][A
Iteration:   3%|▎         | 1/29 [00:12<05:49, 12.46s/it][A
Iteration:   7%|▋         | 2/29 [00:24<05:36, 12.47s/it][A

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Evaluating:  50%|█████     | 1/2 [00:05<00:05,  5.77s/it][A[A

Evaluating: 100%|██████████| 2/2 [00:11<00:00,  5.80s/it][A[A


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s][A[A

Evaluating:  50%|█████     | 1/2 [00:05<00:05,  5.86s/it][A[A

Evaluating: 100%|██████████| 2/2 [00:11<00:00,  5.84s/it][A[A

Iteration:  10%|█         | 3/29 [01:01<08:29, 19.61s/it][A
Iteration:  14%|█▍        | 4/29 [01:26<08:58, 21.56s/it][A
Epoch:   0%|          | 0/1 [01:26<?, ?it/s]
Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

eval path =  data/snli/cached_test_200


Evaluating: 100%|██████████| 20/20 [01:54<00:00,  5.72s/it]
Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

eval path =  data/snli/cached_test_t_200


Evaluating: 100%|██████████| 20/20 [01:54<00:00,  5.74s/it]


In [9]:
# Display the frame returned by ``h_test_transformer`` for a visual check.
test_results

Unnamed: 0,data,model,transformation,rho,dgp_seed,random_state,number_of_simulations,validation_accuracy,transformed_validation_accuracy,observable_t_stats,...,boot_t_991,boot_t_992,boot_t_993,boot_t_994,boot_t_995,boot_t_996,boot_t_997,boot_t_998,boot_t_999,boot_t_1000
0,snli,bert,wordnet sin tranformation p and h,0.7,123,42,1000,0.347959,0.35,-0.471458,...,0.0,-0.471458,-0.894792,-0.774834,1.635219,-1.342875,0.0,2.693193,-0.5346,0.277361


In [12]:
# Build the file name separately so that the "." -> "p" substitution only
# touches the formatted numbers (e.g. "0.7" -> "0p7") and can never rewrite
# a dot that happens to be part of ``result_folder``.
result_name = "rho_{:.1f}_dgp_seed_{}_random_state_{}".format(rho, dgp_seed, rs)
result_name = result_name.replace(".", "p") + ".csv"
result_path = result_folder + result_name

# ``to_csv`` does not create missing directories, so make sure the target
# directory exists before writing (a no-op when it is already there).
result_dir = os.path.dirname(result_path)
if result_dir:
    os.makedirs(result_dir, exist_ok=True)

test_results.to_csv(result_path, index=False)