In [1]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

def read_tsv(filename, quotechar=None):
    """Read a tab-separated file into a list of rows.

    Args:
        filename: path of the TSV file; it is decoded as UTF-8.
        quotechar: forwarded unchanged to ``csv.reader``.

    Returns:
        A list holding one list of string fields per record. No row is
        skipped, so a header line (if the file has one) is the first entry.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it a "\r\n" embedded in a quoted field
    # is rewritten to "\n" before the reader ever sees it.
    with open(filename, "r", encoding='utf-8', newline='') as f:
        return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list."""
    flat = []
    for chunk in l:
        # extend() appends each item of the chunk in order.
        flat.extend(chunk)
    return flat


In [3]:
"""
SST-2 train data size: 67349 
SST-2 dev data size: 872 
"""
DATADIR = 'xxx'

def get_sst_data(train_file=DATADIR + '/train.txt',
                 dev_file=DATADIR + '/dev.txt'):
    """Load the SST-2 train and dev splits.

    Both files are read as tab-separated UTF-8 text with default NA parsing
    disabled, and their columns are renamed to ``text`` and ``label``. The
    size of each split is printed as it is loaded.

    Args:
        train_file: path of the training file.
        dev_file: path of the dev file.

    Returns:
        ``(train, dev, label_list)`` where ``label_list`` is the sorted array
        of distinct labels found in the training split.
    """
    def _load_split(path):
        # One tab-separated file -> DataFrame with the two canonical columns.
        frame = pd.read_csv(path, sep='\t', encoding='utf8', keep_default_na=False)
        frame.columns = ['text', 'label']
        return frame

    train = _load_split(train_file)
    print("SST-2 train data size: %d "%(len(train)))

    dev = _load_split(dev_file)
    print("SST-2 dev data size: %d "%(len(dev)))

    return train, dev, np.unique(train['label'])

# Load both SST-2 splits from the default paths under DATADIR, then preview
# the first rows of the training frame.
train, dev, label_list = get_sst_data()
train.head()

SST-2 train data size: 67349 
SST-2 dev data size: 872 


Unnamed: 0,text,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


In [4]:
# Draw a reproducible random subset of n training rows.
n = 1000
train = train.sample(n=n, random_state=42)
X_train, y_train = train['text'], train['label']

# The dev split serves as the test set.
test = dev
X_test, y_test = test['text'], test['label']

In [5]:
# Create the classifier with max_seq_length=64 and train_batch_size=16; every
# other option keeps its default. The bare `model` expression displays the
# resulting configuration as the cell output.
model = BertClassifier(max_seq_length=64, train_batch_size=16)
model

Building sklearn text classifier...


BertClassifier(bert_config_json=None, bert_model='bert-base-uncased',
        bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
        fp16=False, from_tf=False, gradient_accumulation_steps=1,
        ignore_label=None, label_list=None, learning_rate=2e-05,
        local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
        max_seq_length=64, num_mlp_hiddens=500, num_mlp_layers=0,
        random_state=42, restore_file=None, train_batch_size=16,
        use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)

In [6]:
%%time
# Fine-tune on the subsampled training data; the return value of fit() is
# re-bound to `model`.
model = model.fit(X_train, y_train)

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████| 57/57 [00:15<00:00,  4.17it/s, loss=0.529]
Validating: 100%|██████████| 13/13 [00:00<00:00, 22.70it/s]

Epoch 1, Train loss: 0.5295, Val loss: 0.4408, Val accy: 81.00%



Training  : 100%|██████████| 57/57 [00:15<00:00,  4.16it/s, loss=0.167]
Validating: 100%|██████████| 13/13 [00:00<00:00, 22.89it/s]

Epoch 2, Train loss: 0.1668, Val loss: 0.4380, Val accy: 87.00%



Training  : 100%|██████████| 57/57 [00:15<00:00,  4.16it/s, loss=0.0434]
Validating: 100%|██████████| 13/13 [00:00<00:00, 22.74it/s]

Epoch 3, Train loss: 0.0434, Val loss: 0.5512, Val accy: 86.00%
CPU times: user 36.1 s, sys: 16.5 s, total: 52.6 s
Wall time: 54.1 s





In [7]:
from tqdm import tqdm

# Score the fine-tuned model on the test set; the result is kept in `accy`.
accy = model.score(X_test, y_test)

# Class probability estimates for every test example.
y_prob = model.predict_proba(X_test)
print("class prob estimates:\n", y_prob)

# Label predictions. accuracy_score is called as (y_true, y_pred), sklearn's
# documented argument order and the same order used for classification_report
# below.
y_pred = model.predict(X_test)
print("Accuracy: %0.2f%%"%(metrics.accuracy_score(y_test, y_pred) * 100))

# NOTE(review): assumes label 0 is the negative class and 1 the positive one,
# so the names line up with the report rows — confirm against the dataset.
target_names = ['negative', 'positive']
print(classification_report(y_test, y_pred, target_names=target_names))

Testing: 100%|██████████| 109/109 [00:03<00:00, 27.99it/s]



Loss: 0.3717, Accuracy: 88.07%


Predicting: 100%|██████████| 109/109 [00:03<00:00, 27.94it/s]

class prob estimates:
 [[0.00176739 0.9982326 ]
 [0.978774   0.02122599]
 [0.00462427 0.99537575]
 ...
 [0.96313787 0.03686218]
 [0.1856012  0.81439877]
 [0.00501524 0.99498475]]



Predicting: 100%|██████████| 109/109 [00:03<00:00, 27.85it/s]

Accuracy: 88.07%
              precision    recall  f1-score   support

    negative       0.88      0.87      0.88       428
    positive       0.88      0.89      0.88       444

   micro avg       0.88      0.88      0.88       872
   macro avg       0.88      0.88      0.88       872
weighted avg       0.88      0.88      0.88       872






In [8]:
# Save the fine-tuned model to disk.
# NOTE(review): hard-coded absolute path; presumably /data must exist and be
# writable on the machine running the notebook — confirm.
savefile = '/data/test.bin'
model.save(savefile)

# Load the saved model back from disk into a separate object.
new_model = load_model(savefile)

# Score the reloaded model on the test set; this re-binds `accy`.
accy = new_model.score(X_test, y_test)

Loading model from /data/test.bin...
Defaulting to linear classifier/regressor
Building sklearn text classifier...


Testing: 100%|██████████| 109/109 [00:03<00:00, 27.95it/s]


Loss: 0.3717, Accuracy: 88.07%





In [9]:
%%time
scores = []; 
for seed in [4, 27, 33]:
    model.random_state = seed
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████| 57/57 [00:16<00:00,  4.04it/s, loss=0.502]
Validating: 100%|██████████| 13/13 [00:00<00:00, 22.24it/s]

Epoch 1, Train loss: 0.5023, Val loss: 0.5222, Val accy: 81.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  4.10it/s, loss=0.184]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.74it/s]

Epoch 2, Train loss: 0.1842, Val loss: 0.4111, Val accy: 87.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  3.99it/s, loss=0.0377]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.17it/s]

Epoch 3, Train loss: 0.0377, Val loss: 0.5228, Val accy: 88.00%



Testing: 100%|██████████| 109/109 [00:04<00:00, 28.08it/s]



Loss: 0.3578, Accuracy: 88.07%
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████| 57/57 [00:16<00:00,  3.73it/s, loss=0.533]
Validating: 100%|██████████| 13/13 [00:00<00:00, 20.84it/s]

Epoch 1, Train loss: 0.5332, Val loss: 0.2840, Val accy: 86.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  3.94it/s, loss=0.204]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.30it/s]

Epoch 2, Train loss: 0.2042, Val loss: 0.2330, Val accy: 91.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  3.98it/s, loss=0.0656]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.87it/s]

Epoch 3, Train loss: 0.0656, Val loss: 0.2525, Val accy: 91.00%



Testing: 100%|██████████| 109/109 [00:04<00:00, 27.26it/s]



Loss: 0.4295, Accuracy: 85.89%
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████| 57/57 [00:16<00:00,  4.10it/s, loss=0.585]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.12it/s]

Epoch 1, Train loss: 0.5846, Val loss: 0.4408, Val accy: 76.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  3.94it/s, loss=0.291]
Validating: 100%|██████████| 13/13 [00:00<00:00, 20.98it/s]

Epoch 2, Train loss: 0.2907, Val loss: 0.2925, Val accy: 90.00%



Training  : 100%|██████████| 57/57 [00:17<00:00,  3.86it/s, loss=0.0896]
Validating: 100%|██████████| 13/13 [00:00<00:00, 20.87it/s]

Epoch 3, Train loss: 0.0896, Val loss: 0.2609, Val accy: 91.00%



Testing: 100%|██████████| 109/109 [00:04<00:00, 22.76it/s]


Loss: 0.4628, Accuracy: 84.75%
CPU times: user 1min 58s, sys: 55.7 s, total: 2min 54s
Wall time: 2min 58s





In [10]:
# Let's add the accy from our earlier run as well, which used the default
# seed=42. The combined array gets its own name and `scores` is left as the
# plain list of seeded runs, so re-running this cell gives the same result
# (re-binding `scores` to the array would make a second run broadcast-add
# `accy` onto every element instead of appending it).
all_scores = np.array(list(scores) + [accy])
print(all_scores)
# Mean score, followed by twice the sample standard deviation.
print("%0.2f%% (+/-%0.03f)"% (stats.mean(all_scores), stats.stdev(all_scores) * 2))

[88.0733945  85.89449541 84.74770642 88.0733945 ]
86.70% (+/-3.313)


In [12]:
"""
QQP train data size: 363849 
QQP dev data size: 40430 
"""

# Root folder of the GLUE data; the QQP loader defaults defined below read
# DATADIR + '/QQP/train.tsv' and DATADIR + '/QQP/dev.tsv'. This re-binds the
# DATADIR that was set for the SST-2 loader earlier in the notebook.
DATADIR = './glue_examples/glue_data'

def get_quora_df(filename):
    """Load one QQP TSV file as a DataFrame of question pairs.

    The first row returned by ``read_tsv`` supplies the column names. Only
    ``question1``, ``question2`` and ``is_duplicate`` are kept, rows whose
    ``is_duplicate`` is null are dropped, and the remaining columns are
    renamed to ``text_a``, ``text_b`` and ``label``.
    """
    rows = read_tsv(filename)
    header, records = rows[0], rows[1:]
    pairs = pd.DataFrame(records, columns=header)
    pairs = pairs[['question1', 'question2', 'is_duplicate']]
    # Keep only the rows that actually carry a label.
    has_label = pairs['is_duplicate'].notnull()
    pairs = pairs[has_label]
    pairs.columns = ['text_a', 'text_b', 'label']
    return pairs

def get_quora_data(train_file=DATADIR+'/QQP/train.tsv', 
                   dev_file=DATADIR+'/QQP/dev.tsv'):
    """Load the QQP train and dev splits via ``get_quora_df``.

    The size of each split is printed as it is loaded.

    Returns:
        ``(train, dev, label_list)`` where ``label_list`` is the sorted array
        of distinct label values found in the training split.
    """
    train = get_quora_df(train_file)
    n_train = len(train)
    print("QQP train data size: %d "%(n_train))

    dev = get_quora_df(dev_file)
    n_dev = len(dev)
    print("QQP dev data size: %d "%(n_dev))

    train_labels = train['label'].values
    return train, dev, np.unique(train_labels)

# Load both QQP splits from the default paths under DATADIR (this re-binds
# train, dev and label_list), then preview the first training rows.
train, dev, label_list = get_quora_data()
train.head()

QQP train data size: 363849 
QQP dev data size: 40430 


Unnamed: 0,text_a,text_b,label
0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,How do I control my horny emotions?,How do you control your horniness?,1
2,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,What can one do after MBBS?,What do i do after my MBBS ?,1
4,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [13]:
# Draw a reproducible random subset of n rows from each split.
n = 1000
train = train.sample(n=n, random_state=42)
dev = dev.sample(n=n, random_state=42)

# Each example is a pair of questions, so the features are two text columns.
X_train, y_train = train[['text_a', 'text_b']], train['label']

# The subsampled dev split serves as the test set.
test = dev
X_test, y_test = test[['text_a', 'text_b']], test['label']

In [14]:
%%time
# define model
model = BertClassifier(max_seq_length=64, train_batch_size=16)

# finetune model
model.fit(X_train, y_train)

# score model
model.score(X_test, y_test)

# make predictions
y_pred = model.predict(X_test)
print("Accuracy: %0.2f%%"%(metrics.accuracy_score(y_pred, y_test) * 100))

target_names = ['not duplicate', 'is duplicate']
print(classification_report(y_test, y_pred, target_names=target_names))

Building sklearn text classifier...
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 900, validation data size: 100


Training  : 100%|██████████| 57/57 [00:15<00:00,  4.18it/s, loss=0.643]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.79it/s]

Epoch 1, Train loss: 0.6428, Val loss: 0.5923, Val accy: 65.00%



Training  : 100%|██████████| 57/57 [00:16<00:00,  4.10it/s, loss=0.424]
Validating: 100%|██████████| 13/13 [00:00<00:00, 21.50it/s]

Epoch 2, Train loss: 0.4240, Val loss: 0.6151, Val accy: 64.00%



Training  : 100%|██████████| 57/57 [00:17<00:00,  3.72it/s, loss=0.225]
Validating: 100%|██████████| 13/13 [00:00<00:00, 14.36it/s]

Epoch 3, Train loss: 0.2246, Val loss: 0.7069, Val accy: 64.00%



Testing: 100%|██████████| 125/125 [00:05<00:00, 22.48it/s]


Loss: 0.5384, Accuracy: 74.90%



Predicting: 100%|██████████| 125/125 [00:05<00:00, 21.51it/s]

Accuracy: 74.90%
               precision    recall  f1-score   support

not duplicate       0.85      0.72      0.78       617
 is duplicate       0.64      0.80      0.71       383

    micro avg       0.75      0.75      0.75      1000
    macro avg       0.74      0.76      0.74      1000
 weighted avg       0.77      0.75      0.75      1000

CPU times: user 44 s, sys: 21.1 s, total: 1min 5s
Wall time: 1min 6s



