In [2]:
import numpy as np
import pandas as pd
import csv
from my_paths import *

In [2]:
# Module-level path constants used by the cells below.
# NOTE(review): the lowercase names are not defined in this notebook; presumably they
# are provided by `from my_paths import *` — confirm against my_paths.
XLS_PATH = xls_path
DATA_DIR = data_dir
OUTPUT_DIR = output_dir
PICKLE_DIR = pickle_dir

In [4]:
OUTPUT_DIR

'D:/bert-folder/CV_output'

In [5]:
# Load the spreadsheet, normalise the text column and drop rows/classes that
# are too small to be useful.
MIN_WORDS = 10              # keep texts with at least this many whitespace-separated tokens
MIN_SAMPLES_PER_CLASS = 10  # keep TEXT_TYPE labels with at least this many rows

df = pd.read_excel(XLS_PATH)

# Replace literal tab characters with spaces and trim surrounding whitespace.
df['STE'] = df['STE'].str.replace("\t", " ", regex=False).str.strip()

# Token count per text. The vectorised form yields NaN for missing texts
# instead of raising, and the comparison below then drops those rows.
df['len'] = df['STE'].str.split().str.len()
df = df[df['len'] >= MIN_WORDS].astype({'len': int})

# get categories with at least MIN_SAMPLES_PER_CLASS rows
# (size() counts rows, independent of missing values in any other column)
counts = df.groupby('TEXT_TYPE').size()
cols = counts[counts >= MIN_SAMPLES_PER_CLASS].index

print('num classes before: {}\nnum classes after:  {}'
      .format(len(df['TEXT_TYPE'].unique()), len(cols)))

df = df[df['TEXT_TYPE'].isin(cols)]  # drop labels with too few samples
# make sure there are no NaN in target labels
assert df['TEXT_TYPE'].notna().all(), 'NaN found in target labels'

num classes before: 20
num classes after:  14


In [16]:
# Display the row(s) whose TIMID equals 6146.
df[df.TIMID == 6146]

Unnamed: 0,TIMID,TITLE,PRODUCT,PRODUCTGROUP,PRODUCTLINE,TEXT_TYPE,STATUS,M_COMMENT,FILEADR,STE,len
2,6146,Product description_one model_battery,Battery/Cordless,Chainsaws,Handheld,Introduction,Released,,DAT0056\00069991.xml,Product description Work is constantly in prog...,23


### Using all classes (i.e. when the class-filtering code above has not been run)

In [4]:
# Build the text/label arrays and shuffle them with a fixed seed so that the
# cross-validation folds derived from them are reproducible across kernel restarts.
SEED = 42

X = np.array(df.STE)
y = np.array(df.TEXT_TYPE)

# shuffle data: one shared permutation keeps every text aligned with its label
indices = np.random.RandomState(SEED).permutation(len(X))
X = X[indices]
y = y[indices]

In [5]:
DATA_DIR

'D:/bert-folder/data'

In [6]:
OUTPUT_DIR

'D:/bert-folder/CV_output'

In [7]:
from sklearn.model_selection import KFold
import subprocess

N_SPLITS = 4


def write_tsv(path, labels, texts):
    """Write (label, text) pairs to `path` as a tab-separated file, one pair per row."""
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t', quotechar='"')
        writer.writerows(zip(labels, texts))


kf = KFold(n_splits=N_SPLITS)

# Folds are numbered from 1; each fold overwrites train.tsv / dev.tsv in DATA_DIR
# and trains into its own output directory.
for i, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    write_tsv(DATA_DIR+'/train.tsv', y_train, X_train)
    write_tsv(DATA_DIR+'/dev.tsv', y_test, X_test)

    # Argument list instead of splitting one big string, so paths containing
    # spaces survive intact.
    cmd = [
        'python', 'run_classifier.py',
        f'--data_dir={DATA_DIR}',
        '--bert_model=bert-base-uncased',
        '--task_name=ste',
        # "<OUTPUT_DIR>-<fold>" is the directory layout the results cell reads
        # ("CV_output-1" ... "CV_output-4").
        f'--output_dir={OUTPUT_DIR}-{i}',
        '--gradient_accumulation_steps=16',
        '--do_lower_case',
        '--train_batch_size=32',
        '--learning_rate=2e-5',
        '--num_train_epochs=10',
        '--max_seq_length=512',
        '--do_train',
        '--do_eval',
    ]

    # Only stdout is captured; stderr still goes wherever the kernel's stderr goes.
    result = subprocess.run(cmd, stdout=subprocess.PIPE)
    print(result.stdout.decode(errors='replace'))
    # Stop here if training failed rather than reporting the fold as finished.
    result.check_returncode()
    print(f'FINISHED CV ITERATION: {i}')

b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 1
b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 2
b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 3
b'Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\r\n'
FINISHED CV ITERATION: 4


In [8]:
# Collect the "eval_accuracy" line from every fold's eval_results.txt.
# Fold directories are "<OUTPUT_DIR>-<fold>" for folds 1..4 (KFold(n_splits=4));
# building the path from OUTPUT_DIR avoids repeating the absolute path here.
lines = []
for i in range(1, 5):
    with open(f'{OUTPUT_DIR}-{i}/eval_results.txt', 'r') as f:
        lines.extend(line for line in f if 'eval_accuracy' in line)

In [9]:
# Mean eval accuracy over the collected folds: the score is the last
# whitespace-separated token of each "eval_accuracy = <value>" line.
sum(float(entry.replace('\n', '').split()[-1]) for entry in lines) / len(lines)

0.8663982430453879

In [10]:
lines

['eval_accuracy = 0.8755490483162518\n',
 'eval_accuracy = 0.8609077598828697\n',
 'eval_accuracy = 0.862371888726208\n',
 'eval_accuracy = 0.8667642752562226\n']

In [11]:
# Parse the per-fold accuracy (last token of each collected line) into floats.
accs = [
    float(entry.replace('\n', '').split()[-1])
    for entry in lines
]

In [12]:
accs

[0.8755490483162518, 0.8609077598828697, 0.862371888726208, 0.8667642752562226]

In [13]:
# Arithmetic mean of the values in `accs`.
sum(accs) / len(accs)

0.8663982430453879

In [None]:
# pickle CV-results for 512 seq length running 10 epochs
import pickle
pickle.dump(accs, open(PICKLE_DIR+'/BERT-cv-10-epochs-512-max-seq-len.p', 'wb'))

In [None]:
import pickle
# NOTE(review): this saves whatever `accs` currently holds under the
# 50-epochs / 128-max-seq-len name; make sure `accs` comes from that run
# before executing this cell.
# Context manager so the file is flushed and closed as soon as the dump finishes.
with open(PICKLE_DIR+'/BERT-cv-50-epochs-128-max-seq-len.p', 'wb') as f:
    pickle.dump(accs, f)

In [6]:
import pickle
# Load previously pickled per-fold accuracies (this overwrites `accs`) and show their mean.
# The context manager closes the file handle instead of leaving it to the garbage collector.
with open('D:/thesis-etteplan/pickled-results/BERT-cv-12-epochs-128-max-seq-len.p', 'rb') as f:
    accs = pickle.load(f)
sum(accs) / len(accs)

0.8923665201121848

In [7]:
# Load previously pickled per-fold accuracies (this overwrites `accs`) and show their mean.
# The context manager closes the file handle instead of leaving it to the garbage collector.
with open('D:/thesis-etteplan/pickled-results/BERT-cv-20-epochs-192-max-seq-len.p', 'rb') as f:
    accs = pickle.load(f)
sum(accs) / len(accs)

0.88579686209744

In [8]:
# Load previously pickled per-fold accuracies (this overwrites `accs`) and show their mean.
# The context manager closes the file handle instead of leaving it to the garbage collector.
with open('D:/thesis-etteplan/pickled-results/BERT-cv-50-epochs-128-max-seq-len.p', 'rb') as f:
    accs = pickle.load(f)
sum(accs) / len(accs)

0.8827274012245111