In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt
# import ipdb


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig

import os

In [2]:
labelcaptions = {"negative": 0, "neutral": 1, "positive": 2}

df = pd.read_csv('./all-data.csv', header=None, sep=",", encoding='ISO-8859-1', names=["label","text"])
df["label"] = df["label"].apply(lambda x: labelcaptions[x])
df["sentence_id"] = np.array(list(range(len(df))))

In [3]:
df_test = pd.read_csv('./private-test-set.csv', sep=",", header=0, encoding='UTF-8', names=["ID","text"])[:-1]
df_test

Unnamed: 0,ID,text
0,0,$AEZS BOOM !!
1,1,Discord https://t.co/VJCTdWxjI6 To get advise...
2,2,$TGTX few hours negative action making people ...
3,3,$gaxy $spy $vvpr $nio $xspa $opti $fb $idex $i...
4,4,"Today Top Flow in S&amp;P 500 #SP500, Buy Flow..."
...,...,...
11994,11994,Trimmed $UPWK 💯% gain (still holding 50% of or...
11995,11995,$SPG where’s the guys braving that they sold a...
11996,11996,$TRHC Results of Tabula Rasa HealthCare’s Firs...
11997,11997,$Mist is looking good


In [4]:
## BERT parameters
bert_tokenizer_model_id = 'bert-base-uncased'
bert_pretrained_model_id = 'google/bert_uncased_L-12_H-768_A-12'

## other training parameters
max_doc_length = 256   # max in train data is 62 in main and 258 in extra data
clip = 0.25            #gradient clipping
lr = 0.00003           #initial learning rate
wdecay=1.2e-6          #weight decay applied to all weights
epochs = 30            #maximum number of epochs
batch_size = 64         #batch size
save = 'model.pt'      #path to save the final model
use_extra_data = True  #if extra data should be used

train_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored
inference_max_number_batches = -1 # only for the sake of debugging. Set to -1 to be ignored

## log parameters
log_interval = 100     #log interval during training
log_interval_val = 100 #log interval during validation

In [5]:
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_model_id, do_lower_case=True)

def convert_text_bertids_tensor(texts): return tokenizer(texts, padding=True, return_tensors="pt", truncation=True, max_length=max_doc_length)

def get_document_label_tensor(df, test=False):
    documents = []
    for x in df['text'].values:
        documents.append(x.strip())
    data = convert_text_bertids_tensor(documents)
    
    if test:
        ID = df['ID'].values.astype(np.int64)
        return ID, data
    else:
        labels = np.array(list(df['label'].values))
        return data, labels

x, y = get_document_label_tensor(df)
print ('Data items:', str(x['input_ids'].shape), str(y.shape))

# dataloaders 
dataset = TensorDataset(x['input_ids'], x['token_type_ids'], x['attention_mask'], torch.LongTensor(y))
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=7, drop_last=True)

Data items: torch.Size([4846, 150]) (4846,)


In [6]:
### TEST
ID_test, x_test = get_document_label_tensor(df_test, test=True)
dataset_test = TensorDataset(x_test['input_ids'], x_test['token_type_ids'], x_test['attention_mask'], torch.LongTensor(ID_test))
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=7, drop_last=False)

In [7]:
path = 'models/' + 'BERT_valacc=86.54%_bs=6_doc_len=256_epoch=1_lr=3e-05_valsize=20_bert_uncased_L-12_H-768_A-12'

In [8]:
# Check PyTorch GPU capabilities:

print("\nPyTorch:")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('%d GPU(s) available.' % torch.cuda.device_count())
    print('GPU-Name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


PyTorch:
1 GPU(s) available.
GPU-Name: A100-PCIE-40GB


In [9]:
class BERTClassifierModel(torch.nn.Module):
    '''
    Classification model with BERT
    '''
    def __init__(self, bert, nout):
        super(BERTClassifierModel, self).__init__()
        
        self.bert = bert
        self.embedding_size = self.bert.config.hidden_size 

        self.output_projection_layer = torch.nn.Linear(self.embedding_size, nout)

    '''
    input format: seq_len, batch
    '''
    def forward(self, input_batch):       
        #### complete the code (2 points) - START ####
        # use the contents of `input_batch`, and pass them as parameters to self.bert
        # use the output of BERT together with self.output_projection_layer to provide predictions 
        # the final results should consist of two variables:
        #   `log_probs` -> tensor of logarithms of the predicted probabilities for classes 
        #   `final_representations` -> tensor of the output BERT vectors, based on which the prediction is done (for visualization purposes) 
        
        _out = self.bert.forward(input_ids=input_batch["input_ids"], attention_mask=input_batch["attention_mask"], token_type_ids=input_batch["token_type_ids"])
        
        final_representations = _out["last_hidden_state"][:,0,:]
        logits = self.output_projection_layer(final_representations)
        #ipdb.set_trace()
        
        log_probs = torch.nn.LogSoftmax(dim=1)(logits)

        #### complete the code - END ####
        
        return log_probs, final_representations


In [10]:
bert = BertModel.from_pretrained(bert_pretrained_model_id, cache_dir="cache", output_attentions=True)

model = BERTClassifierModel(bert=bert, nout=len(labelcaptions.keys()))
model.to(device)
print('Model:', model)

criterion = torch.nn.NLLLoss()

params = list(model.parameters())

optimizer = torch.optim.Adam(params, lr=lr, weight_decay=wdecay)

stored_res = 0

Model: BERTClassifierModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [11]:
def model_load(fn):
    with open(fn, 'rb') as f:
        model_state, criterion_state, optimizer_state = torch.load(f)
    return model_state, criterion_state, optimizer_state

In [12]:
def softmax(X): return [np.exp(x) / np.sum(np.exp(x)) for x in X]

def predict(dataloader, model):
    
    model.to(device)
    # Turn on evaluation mode which disables dropout.
    model.eval()
    _predictions = []
    _outputs = []
    _labels = []
    _representations = []
    for i, batch in enumerate(dataloader):
        _input_ids, _token_type_ids, _attention_mask, _label = [e.to(device) for e in batch]
        _input_batch = {'input_ids': _input_ids,
                        'token_type_ids': _token_type_ids, 
                        'attention_mask': _attention_mask}
        
        with torch.no_grad():
            _output, _batch_representations = model.forward(_input_batch)
            _batch_predictions = torch.argmax(_output, dim=1)
        _outputs.extend(softmax(_output.cpu().numpy()))
        _predictions.extend(_batch_predictions.cpu().numpy())
        _labels.extend(_label.cpu().numpy())
        _representations.extend(_batch_representations.cpu().numpy())
        
        if i % log_interval_val == 0 and i > 0:
            print('Prediction | %5d batches | %5d data |' % (i, i*batch_size))
            
        if (i > inference_max_number_batches) and (inference_max_number_batches != -1):
            break
            
    return _predictions, _outputs, _labels, np.array(_representations)

In [13]:
EVAL_MEASURE = 'accuracy'

# Load the best saved model.
model_state, criterion_state, optimizer_state = model_load(path)
model.load_state_dict(model_state)
criterion.load_state_dict(criterion_state)
optimizer.load_state_dict(optimizer_state)

# Run on test data.
yhat, yhat_proba, y, x_test_representations = predict(dataloader, model)
results = classification_report(y, yhat, output_dict=True)
print (classification_report(y, yhat))    

print('=' * 89)
print('| End of testing | test %s %.3f ' % (EVAL_MEASURE, results[EVAL_MEASURE]))
print('=' * 89)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89       596
           1       0.95      0.92      0.93      2854
           2       0.88      0.89      0.88      1350

    accuracy                           0.91      4800
   macro avg       0.89      0.92      0.90      4800
weighted avg       0.92      0.91      0.91      4800

| End of testing | test accuracy 0.914 


In [14]:
########### TEST ##################
def predict_test(dataloader, model):
    
    model.to(device)
    # Turn on evaluation mode which disables dropout.
    model.eval()
    _predictions = []
    _outputs = []
    _labels = []
    _representations = []
    IDS = []
    for i, batch in enumerate(dataloader):
        _input_ids, _token_type_ids, _attention_mask, ID = [e.to(device) for e in batch]
        _input_batch = {'input_ids': _input_ids,
                        'token_type_ids': _token_type_ids, 
                        'attention_mask': _attention_mask}
        
        with torch.no_grad():
            _output, _batch_representations = model.forward(_input_batch)
            _batch_predictions = torch.argmax(_output, dim=1)
        _outputs.extend(softmax(_output.cpu().numpy()))
        _predictions.extend(_batch_predictions.cpu().numpy())
        _representations.extend(_batch_representations.cpu().numpy())
        IDS.extend(ID.cpu().numpy())
        
        if i % log_interval_val == 0 and i > 0:
            print('Prediction | %5d batches | %5d data |' % (i, i*batch_size))
            
        if (i > inference_max_number_batches) and (inference_max_number_batches != -1):
            break
            
    return IDS, _predictions, _outputs, np.array(_representations)

In [15]:
############# TEST ###############################
IDS, yhat, yhat_proba, x_test_representations = predict_test(dataloader_test, model)

Prediction |   100 batches |  6400 data |


In [16]:
########### TEST #######################
import csv
def save_results(yhat, IDS):
    preds = [{val:key for key,val in labelcaptions.items()}[e] for e in yhat]
    df = pd.DataFrame({'ID': IDS, 'Text': preds})
    df.to_csv("submission_test1.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)
    return df

In [17]:
preds = save_results(yhat, IDS)
preds

Unnamed: 0,ID,Text
0,0,positive
1,1,neutral
2,2,positive
3,3,positive
4,4,positive
...,...,...
11994,11994,positive
11995,11995,negative
11996,11996,neutral
11997,11997,positive
