In [1]:
# !pip3 install -r requirements.txt
# !pip install transformers

In [2]:
import pandas as pd
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt
#import ipdb


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig

import os

## Parameters

In [3]:
## BERT parameters
bert_tokenizer_model_id = 'bert-base-uncased'
# bert_pretrained_model_id = 'google/bert_uncased_L-2_H-128_A-2' # tiny
# bert_pretrained_model_id = 'google/bert_uncased_L-4_H-256_A-4' # mini
# bert_pretrained_model_id = 'google/bert_uncased_L-4_H-512_A-8' # small
# bert_pretrained_model_id = 'google/bert_uncased_L-8_H-512_A-8' # medium
bert_pretrained_model_id = 'google/bert_uncased_L-12_H-768_A-12' # base

## other training parameters
max_doc_length = 256   # max in train data is 62 in main and 258 in extra data
clip = 0.25            #gradient clipping
lr = 0.00003           #initial learning rate
wdecay=1.2e-6          #weight decay applied to all weights
epochs = 30            #maximum number of epochs
batch_size = 4         #batch size
save = 'model.pt'      #path to save the final model
use_extra_data = True  #if extra data should be used

train_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored
inference_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored

## log parameters
log_interval = 100     #log interval during training
log_interval_val = 100 #log interval during validation

In [4]:
# Check PyTorch GPU capabilities:

print("\nPyTorch:")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('%d GPU(s) available.' % torch.cuda.device_count())
    print('GPU-Name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


PyTorch:
1 GPU(s) available.
GPU-Name: A100-PCIE-40GB


## Preparing Data

### Loading Captions

### Reading Data, tokenization, and loading into Tensors

In [5]:
labelcaptions = {"negative": 0, "neutral": 1, "positive": 2}

df = pd.read_csv('./all-data.csv', header=None, sep=",", encoding='ISO-8859-1', names=["label","text"])
df["label"] = df["label"].apply(lambda x: labelcaptions[x])
df["sentence_id"] = np.array(list(range(len(df))))

df_extra = pd.read_csv('./public-test-set.csv', header=None, sep=",", encoding='UTF-8', names=["label","text"])
df_extra["label"] = df_extra["label"].apply(lambda x: {"negative": 0, "neutral": 1, "positive": 2}[x])
df_extra["sentence_id"] = np.array(list(range(len(df),len(df)+len(df_extra))))

#### Split into train, val, test

In [6]:
train_size = 0.8
val_size = 0.2
test_size = 0
assert train_size + val_size + test_size == 1.

train_ind = slice(0,int(len(df)*train_size))
val_ind = slice(int(len(df)*train_size),int(len(df)*train_size)+int(len(df)*val_size))
test_ind = slice(int(len(df)*train_size)+int(len(df)*val_size),int(len(df)*train_size)+int(len(df)*val_size)++int(len(df)*test_size))

# Shuffle
df_shuffle = df.sample(frac=1)

# Split
df_train_, df_val, df_test = df_shuffle[train_ind], df_shuffle[val_ind], df_shuffle[test_ind]
df_train_["label"] = df_train_["label"].to_numpy().astype(np.int64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [7]:
def get_extended_df(df1, df2):
    labels = np.array(list(df1["label"].to_numpy()) + list(df2["label"].to_numpy()))
    texts = np.array(list(df1["text"].to_numpy()) + list(df2["text"].to_numpy()))
    df = pd.DataFrame({"label":labels, "text":texts})
    df["sentence_id"] = np.array(list(range(len(df))))
    df = df.sample(frac=1)
    return df

In [8]:
if use_extra_data: df_train = get_extended_df(df_train_, df_extra)
else: df_train = df_train_
df_train, df_train["text"].apply(lambda x: len(x.split(" "))).max()

(      label                                               text  sentence_id
 8205      2  $TSLA  i honestly figured she would trade inde...         8205
 7076      2                                          $PHIO big         7076
 4014      1               $AAPL y’all see that 6000 sell. Ouch         4014
 8976      2  $ZSAN alright fam in at 68 with 3k shares, let...         8976
 2521      2  Operating profit of Kauppalehti group rose to ...         2521
 ...     ...                                                ...          ...
 6976      1                                    $OPK hell yeah!         6976
 5521      2  @Californiamaster  Unlike most internet compan...         5521
 7673      2  $SRNE WILL BE OVER 20.00 SHORTLY AFTER PR DROP...         7673
 7217      2  $SE twice it touch 160, the 3rd time will be b...         7217
 7121      2  $PSTI President Trump Will Be Trumpeting Thera...         7121
 
 [8994 rows x 3 columns],
 258)

#### Bert stuff

In [9]:
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_model_id, do_lower_case=True)

# sample subword tokenization
_sample_text = 'From a deceptively simple premise , this deeply moving French drama develops a startling story that works both as a detailed personal portrait and as a rather frightening examination of modern times .'
print (tokenizer.tokenize(_sample_text))


['from', 'a', 'dec', '##eptive', '##ly', 'simple', 'premise', ',', 'this', 'deeply', 'moving', 'french', 'drama', 'develops', 'a', 'startling', 'story', 'that', 'works', 'both', 'as', 'a', 'detailed', 'personal', 'portrait', 'and', 'as', 'a', 'rather', 'frightening', 'examination', 'of', 'modern', 'times', '.']


In [10]:
{id:word for word, id in tokenizer.get_vocab().items()}[1300]

'ي'

In [11]:
def convert_text_bertids_tensor(texts):
    #### complete the code (1 point) - START ####
    # do tokenization here and return a dictionary, containing tensors of 'input_ids', 'token_type_ids', and 'attention_mask'
    # the result looks like the following dictionary for the samples below
    #{'input_ids': tensor([[  101,  2013,  1037, 11703, 22048,  2135,  3722, 18458,  1010,  2023,
    #      6171,  3048,  2413,  3689, 11791,  1037, 19828,  2466,  2008,  2573,
    #      2119,  2004,  1037,  6851,  3167,  6533,  1998,  2004,  1037,  2738,
    #     17115,  7749,  1997,  2715,  2335,  1012,   102],
    #    [  101,  2009,  1005,  1055,  1037, 13012, 21031,  1997,  1037,  3185,
    #      1010,  2007,  1037,  2261, 11680,  4193,  2019,  4895, 28578, 17007,
    #      3085,  3730,  2415,  1012,   102,     0,     0,     0,     0,     0,
    #         0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    #     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    #     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    #     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    #    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    #     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
    
    
    #### complete the code - END ####

    return tokenizer(texts, padding=True, return_tensors="pt", truncation=True, max_length=max_doc_length)

# sample
_sample_text1 = 'From a deceptively simple premise , this deeply moving French drama develops a startling story that works both as a detailed personal portrait and as a rather frightening examination of modern times .'
_sample_text2 = "It 's a trifle of a movie , with a few laughs surrounding an unremarkable soft center ."
print (convert_text_bertids_tensor([_sample_text1, _sample_text2]))



{'input_ids': tensor([[  101,  2013,  1037, 11703, 22048,  2135,  3722, 18458,  1010,  2023,
          6171,  3048,  2413,  3689, 11791,  1037, 19828,  2466,  2008,  2573,
          2119,  2004,  1037,  6851,  3167,  6533,  1998,  2004,  1037,  2738,
         17115,  7749,  1997,  2715,  2335,  1012,   102],
        [  101,  2009,  1005,  1055,  1037, 13012, 21031,  1997,  1037,  3185,
          1010,  2007,  1037,  2261, 11680,  4193,  2019,  4895, 28578, 17007,
          3085,  3730,  2415,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1

In [12]:
def get_document_label_tensor(df):
    documents = []
    for x in df['text'].values:
        documents.append(x.strip())
    labels = np.array(list(df['label'].values))
    data = convert_text_bertids_tensor(documents)
    
    return data, labels

x_train, y_train = get_document_label_tensor(df_train)
print ('Train data items:', str(x_train['input_ids'].shape), str(y_train.shape))

x_val, y_val = get_document_label_tensor(df_val)
print ('Validation data items:', str(x_val['input_ids'].shape), str(y_val.shape))

# x_test, y_test = get_document_label_tensor(df_test)
# print ('Test data items:', str(x_test['input_ids'].shape), str(y_test.shape))

Train data items: torch.Size([8994, 256]) (8994,)
Validation data items: torch.Size([969, 80]) (969,)


In [13]:
# dataloaders 
dataset_train = TensorDataset(x_train['input_ids'], x_train['token_type_ids'], x_train['attention_mask'], torch.LongTensor(y_train))
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=7, drop_last=True)

dataset_val = TensorDataset(x_val['input_ids'], x_val['token_type_ids'], x_val['attention_mask'], torch.LongTensor(y_val))
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True, num_workers=7, drop_last=True)

dataset_test = dataset_val #TensorDataset(x_test['input_ids'], x_test['token_type_ids'], x_test['attention_mask'], torch.LongTensor(y_test))
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True, num_workers=7, drop_last=True)

### Practical Functions

In [14]:
def model_save(fn, model, criterion, optimizer):
    with open(fn, 'wb') as f:
        torch.save([model.state_dict(), criterion.state_dict(), optimizer.state_dict()], f)

def model_load(fn):
    with open(fn, 'rb') as f:
        model_state, criterion_state, optimizer_state = torch.load(f)
    return model_state, criterion_state, optimizer_state

### Model

In [15]:
class BERTClassifierModel(torch.nn.Module):
    '''
    Classification model with BERT
    '''
    def __init__(self, bert, nout):
        super(BERTClassifierModel, self).__init__()
        
        self.bert = bert
        self.embedding_size = self.bert.config.hidden_size 

        self.output_projection_layer = torch.nn.Linear(self.embedding_size, nout)

    '''
    input format: seq_len, batch
    '''
    def forward(self, input_batch):       
        #### complete the code (2 points) - START ####
        # use the contents of `input_batch`, and pass them as parameters to self.bert
        # use the output of BERT together with self.output_projection_layer to provide predictions 
        # the final results should consist of two variables:
        #   `log_probs` -> tensor of logarithms of the predicted probabilities for classes 
        #   `final_representations` -> tensor of the output BERT vectors, based on which the prediction is done (for visualization purposes) 
        
        _out = self.bert.forward(input_ids=input_batch["input_ids"], attention_mask=input_batch["attention_mask"], token_type_ids=input_batch["token_type_ids"])
        
        final_representations = _out["last_hidden_state"][:,0,:]
        logits = self.output_projection_layer(final_representations)
        #ipdb.set_trace()
        
        log_probs = torch.nn.LogSoftmax(dim=1)(logits)

        #### complete the code - END ####
        
        return log_probs, final_representations


In [16]:
## DUMMY TEST
bert = BertModel.from_pretrained(bert_pretrained_model_id, cache_dir="cache")
_model = BERTClassifierModel(bert=bert, nout=4)
_input_ids = torch.LongTensor(np.random.randint(low=0, high=1000, size=(max_doc_length, batch_size)))
_token_type_ids = torch.LongTensor(np.zeros(shape=(max_doc_length, batch_size)))
_attention_mask = torch.LongTensor(np.ones(shape=(max_doc_length, batch_size)))
print ('input_ids shape: %s' % str(_input_ids.shape))
_input_batch = {'input_ids': _input_ids, 'token_type_ids': _token_type_ids, 'attention_mask': _attention_mask}
_output, _representations = _model.forward(_input_batch)
print ('output shape: %s' % str(_output.shape))
print ('representations shape: %s' % str(_representations.shape))
print ('done!')

input_ids shape: torch.Size([256, 4])
output shape: torch.Size([256, 4])
representations shape: torch.Size([256, 768])
done!


### Instantiating Model

In [17]:
bert = BertModel.from_pretrained(bert_pretrained_model_id, cache_dir="cache", output_attentions=True)

model = BERTClassifierModel(bert=bert, nout=len(labelcaptions.keys()))
model.to(device)
print('Model:', model)

criterion = torch.nn.NLLLoss()

params = list(model.parameters())

optimizer = torch.optim.Adam(params, lr=lr, weight_decay=wdecay)

stored_res = 0

Model: BERTClassifierModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Training ...

In [18]:
def softmax(X): return [np.exp(x) / np.sum(np.exp(x)) for x in X]

def predict(dataloader, model):
    
    model.to(device)
    # Turn on evaluation mode which disables dropout.
    model.eval()
    _predictions = []
    _outputs = []
    _labels = []
    _representations = []
    for i, batch in enumerate(dataloader):
        _input_ids, _token_type_ids, _attention_mask, _label = [e.to(device) for e in batch]
        _input_batch = {'input_ids': _input_ids,
                        'token_type_ids': _token_type_ids, 
                        'attention_mask': _attention_mask}
        
        with torch.no_grad():
            _output, _batch_representations = model.forward(_input_batch)
            _batch_predictions = torch.argmax(_output, dim=1)
        _outputs.extend(softmax(_output.cpu().numpy()))
        _predictions.extend(_batch_predictions.cpu().numpy())
        _labels.extend(_label.cpu().numpy())
        _representations.extend(_batch_representations.cpu().numpy())
        
        if i % log_interval_val == 0 and i > 0:
            print('Prediction | %5d batches | %5d data |' % (i, i*batch_size))
            
        if (i > inference_max_number_batches) and (inference_max_number_batches != -1):
            break
            
    return _predictions, _outputs, _labels, np.array(_representations)

def train(dataloader, model, criterion, optimizer):
    model.to(device)
    # Turn on training mode which enables dropout.
    model.train()
    start_time = time.time()
    log_interval_loss = 0
    for i, batch in enumerate(dataloader):
        _input_ids, _token_type_ids, _attention_mask, _label = [e.to(device) for e in batch]
        _input_batch = {'input_ids': _input_ids,
                        'token_type_ids': _token_type_ids, 
                        'attention_mask': _attention_mask}
        
        #### complete the code (2 points) - START ####
        # here the actual training happens. Required steps:
        #  forward pass
        #  calculating loss
        #  back-propagation
        #  updating parameters
        
        optimizer.zero_grad()
        
        _output, _ = model.forward(_input_batch)
        
        loss = criterion(_output, _label)
        loss.backward()
        
        if clip:
            torch.nn.utils.clip_grad_norm_(params, clip)
            
        optimizer.step()
        
        #### complete the code - END ####
        
        log_interval_loss += loss.item()
        if i % log_interval == 0 and i > 0:
            cur_loss = log_interval_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d} batches | {:5d} data | ms/batch {:5.2f} | loss {:5.3f}'.
                  format(epoch, i, i*batch_size, elapsed * 1000 / log_interval, cur_loss))
            log_interval_loss = 0
            start_time = time.time()
            
        if (i > train_max_number_batches) and (train_max_number_batches != -1):
            break

In [19]:
print('=' * 89)
print('Start training')

EVAL_MEASURE = 'accuracy'


# Loop over epochs.
best_val_res = []
try: stored_res = stored_res
except: stored_res = 0
for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    train(dataloader_train, model, criterion, optimizer)

    print('Epoch %d validation' % epoch)
    yhat, _, y, _ = predict(dataloader_val, model)
    val_results = classification_report(y, yhat, output_dict=True)
    val_res = val_results[EVAL_MEASURE]
    
    print('-' * 89)
    print('| end of epoch %3d | time: %5.2fs | validation %s %.3f | ' % 
          (epoch, (time.time() - epoch_start_time), EVAL_MEASURE, val_res))
    print('-' * 89)

    if val_res > stored_res:
        save = f"models/BERT_valacc={val_res*100:.2f}%_bs={batch_size}_doc_len={max_doc_length}_epoch={epoch}_lr={lr}_valsize={val_size*100:.0f}_{bert_pretrained_model_id.split('/')[-1]}"
        model_save(save, model, criterion, optimizer)
        print('Saving model (new best validation)')
        stored_res = val_res

    best_val_res.append(val_res)
print('End of training')

Start training
| epoch   1 |   100 batches |   400 data | ms/batch 73.89 | loss 1.004
| epoch   1 |   200 batches |   800 data | ms/batch 58.16 | loss 0.840
| epoch   1 |   300 batches |  1200 data | ms/batch 58.28 | loss 0.857
| epoch   1 |   400 batches |  1600 data | ms/batch 58.00 | loss 0.771
| epoch   1 |   500 batches |  2000 data | ms/batch 54.73 | loss 0.707
| epoch   1 |   600 batches |  2400 data | ms/batch 57.39 | loss 0.776
| epoch   1 |   700 batches |  2800 data | ms/batch 56.44 | loss 0.616
| epoch   1 |   800 batches |  3200 data | ms/batch 55.66 | loss 0.671
| epoch   1 |   900 batches |  3600 data | ms/batch 57.16 | loss 0.689
| epoch   1 |  1000 batches |  4000 data | ms/batch 55.33 | loss 0.666
| epoch   1 |  1100 batches |  4400 data | ms/batch 56.18 | loss 0.664
| epoch   1 |  1200 batches |  4800 data | ms/batch 57.67 | loss 0.717
| epoch   1 |  1300 batches |  5200 data | ms/batch 55.13 | loss 0.644
| epoch   1 |  1400 batches |  5600 data | ms/batch 53.88 | lo

RuntimeError: [enforce fail at inline_container.cc:274] . unexpected pos 63883968 vs 63883856

### Test set Evaluation

In [None]:
# Load the best saved model.
model_state, criterion_state, optimizer_state = model_load(save)
model.load_state_dict(model_state)
criterion.load_state_dict(criterion_state)
optimizer.load_state_dict(optimizer_state)

# Run on test data.
yhat, yhat_proba, y, x_test_representations = predict(dataloader_test, model)
results = classification_report(y, yhat, output_dict=True)
print (classification_report(y, yhat))    

print('=' * 89)
print('| End of testing | test %s %.3f ' % (EVAL_MEASURE, results[EVAL_MEASURE]))
print('=' * 89)

### Plots

In [None]:
# Can somehow distinguish between positive and negative, but not very good
scores = (yhat_proba * np.array([-1,0,1])).sum(axis=1)
plt.figure(figsize=(15,10))
plt.hist(scores[np.array(y)==0], color="red", alpha=0.5,label="negative", bins=30)
plt.hist(scores[np.array(y)==1], color="gray", alpha=0.5,label="neutral", bins=30)
plt.hist(scores[np.array(y)==2], color="green", alpha=0.5,label="positive", bins=30)
plt.legend()
plt.show()

In [None]:
#code from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    #print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range (cm.shape[0]):
        for j in range (cm.shape[1]):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

tuples=[(key, value) for key, value in labelcaptions.items()]
tuples.sort(key=lambda x: x[1])
labelcaptions_inorder = [t[0] for t in tuples]

# Compute confusion matrix
cnf_matrix = confusion_matrix(y, yhat)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
#plt.figure(figsize=(8,6))
#plot_confusion_matrix(cnf_matrix, classes=labelcaptions_inorder,
#                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(8,6))
plot_confusion_matrix(cnf_matrix, classes=labelcaptions_inorder, normalize=True,
                      title='Normalized confusion matrix')

plt.show()