In [10]:
import numpy as np
import pandas as pd
import csv
from my_paths import *

In [2]:
# Notebook-level location constants. The lowercase names are not defined in
# this notebook; presumably they come from the star import of my_paths.
XLS_PATH, DATA_DIR = xls_path, data_dir
OUTPUT_DIR, PICKLE_DIR = output_dir, pickle_dir

In [3]:
# Load the corpus and keep only rows that are usable for classification:
# texts with enough words, belonging to labels with enough samples.
MIN_WORDS = 10    # minimum number of whitespace-separated tokens per text
MIN_SAMPLES = 10  # minimum number of rows a label needs to be kept

df = pd.read_excel(XLS_PATH)

# Replace tabs inside the text and trim surrounding whitespace
# (presumably so the text cannot collide with the tab delimiter of the TSV
# files written later in the notebook).
df['STE'] = df['STE'].str.replace("\t", " ", regex=False).str.strip()

# Word count per text. The vectorised .str accessor yields NaN for missing or
# non-string cells instead of raising, and NaN >= MIN_WORDS is False, so such
# rows are simply dropped by the filter below.
df['len'] = df['STE'].str.split().str.len()
# Bracket access: the column is named after a builtin, so attribute access is fragile.
df = df[df['len'] >= MIN_WORDS]

# get categories with at least MIN_SAMPLES counts
# (counts non-null TIMID values per label, exactly as before, but without
# aggregating every other column first)
counts = df.groupby('TEXT_TYPE')['TIMID'].count()
cols = counts[counts >= MIN_SAMPLES].index

print('num classes before: {}\nnum classes after:  {}'
      .format(len(df['TEXT_TYPE'].unique()), len(cols)))

df = df[df['TEXT_TYPE'].isin(cols)]  # drop labels with too few samples
# make sure there are no NaN in target labels
assert df['TEXT_TYPE'].isna().sum() == 0, 'NaN found in TEXT_TYPE after filtering'

num classes before: 20
num classes after:  14


In [4]:
# Materialise texts and labels as two aligned NumPy arrays.
X = np.array(df['STE'])
y = np.array(df['TEXT_TYPE'])

# shuffle data
# One shared permutation is applied to both arrays so text/label pairs stay
# aligned. The generator is seeded so the order (and therefore every split
# derived from it) is the same on each run.
SHUFFLE_SEED = 42
indices = np.random.RandomState(SHUFFLE_SEED).permutation(len(X))
X = X[indices]
y = y[indices]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as a dev set. random_state pins the split so the
# written files are identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# Write each split as tab-separated "<label>\t<text>" rows.
for split_name, labels, texts in (('train', y_train, X_train),
                                  ('dev', y_test, X_test)):
    with open(DATA_DIR+'/'+split_name+'.tsv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t', quotechar='\"')
        writer.writerows(zip(labels, texts))

In [6]:
from sklearn.model_selection import KFold
import subprocess
import sys

# 3-fold cross-validation: for every fold, write the train/dev TSV files and
# launch run_classifier.py as a child process with a fold-specific output dir.
kf = KFold(n_splits=3)

for i, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Overwrite train.tsv / dev.tsv in DATA_DIR with this fold's data
    # (presumably these are the files run_classifier.py reads from --data_dir).
    for split_name, labels, texts in (('train', y_train, X_train),
                                      ('dev', y_test, X_test)):
        with open(DATA_DIR+'/'+split_name+'.tsv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter='\t', quotechar='\"')
            writer.writerows(zip(labels, texts))

    # Build the argument vector directly instead of splitting a string, so
    # paths containing whitespace stay intact. sys.executable runs the script
    # with the same interpreter as this kernel rather than whatever `python`
    # happens to be first on PATH.
    cmd = [
        sys.executable, 'run_classifier.py',
        f'--data_dir={DATA_DIR}',
        '--bert_model=bert-base-uncased',
        '--task_name=ste',
        f'--output_dir={OUTPUT_DIR}{i}',
        '--gradient_accumulation_steps=2',
        '--do_lower_case',
        '--train_batch_size=32',
        '--learning_rate=2e-5',
        '--num_train_epochs=3',
        '--max_seq_length=128',
        '--do_train',
        '--do_eval',
    ]

    # Capture both streams; previously stderr was never piped, so the
    # `error` value was always None and failures went unnoticed.
    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output = process.stdout.decode(errors='replace')
    error = process.stderr.decode(errors='replace')
    print(output)

    # Stop immediately on a failed run instead of reporting the fold as finished.
    if process.returncode != 0:
        raise RuntimeError(
            f'run_classifier.py failed on CV iteration {i} '
            f'(exit code {process.returncode}):\n{error[-2000:]}')

    print(f'FINISHED CV ITERATION: {i}')

b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 1
b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 2
b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 3


In [None]:
# Collect the 'eval_accuracy' line(s) from each fold's eval_results.txt.
# The fold count is taken from the KFold splitter used for the CV runs, so it
# cannot drift from the number of output directories actually produced
# (a hard-coded 5 would look for folds that a 3-split run never wrote).
lines = []
for i in range(1, kf.get_n_splits() + 1):
    with open(f'{OUTPUT_DIR}{i}/eval_results.txt', 'r') as f:
        # Iterate the file lazily; no need to load it fully with readlines().
        lines.extend(line for line in f if 'eval_accuracy' in line)

In [None]:
# Mean accuracy over the collected lines; the value is taken from the last
# whitespace-separated token of each line.
sum(
    float(entry.replace('\n', '').split()[-1])
    for entry in lines
) / len(lines)

In [None]:
# Display the raw 'eval_accuracy' lines collected from the eval_results.txt files.
lines

In [None]:
# Parse the trailing token of every collected line into a float.
accs = [
    float(entry.replace('\n', '').split()[-1])
    for entry in lines
]

In [None]:
# Display the parsed accuracy values.
accs

In [None]:
# Arithmetic mean of the parsed accuracies (raises ZeroDivisionError if accs is empty).
sum(accs) / len(accs)

In [None]:
import pickle

# Persist the parsed accuracies. The context manager guarantees the file is
# flushed and closed; the bare open() used before left the handle dangling.
# NOTE(review): the filename says "50-epochs" but the CV cell in this notebook
# passes --num_train_epochs=3 — confirm which one is intended.
with open(PICKLE_DIR+'/BERT-cv-50-epochs-128-max-seq-len.p', 'wb') as f:
    pickle.dump(accs, f)