In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [2]:
# pip install transformers

In [3]:
# !nvidia-smi # double check you're in a GPU runtime


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

import random
import os
import io

from torch import nn
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import (DistilBertTokenizer, BertConfig, AdamW, BertForSequenceClassification,
                          DistilBertConfig,
                          get_linear_schedule_with_warmup, DistilBertForSequenceClassification)

In [5]:
from tqdm import tqdm, trange,tnrange,tqdm_notebook
from torch.optim import Adam

In [6]:
# identify and specify the GPU as the device, later in training loop we will load data into device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == torch.device("cuda"):
    torch.cuda.manual_seed_all(SEED)

In [7]:
device

device(type='cuda')

In [8]:
data = pd.read_csv("train_data.csv")

In [9]:
data.shape

(2999, 2)

In [10]:
data = data.dropna()

In [11]:
data.shape

(2999, 2)

In [12]:
data.head()

Unnamed: 0,label,text
0,0,"Such evidence-based consumer facing findings, ..."
1,0,"Soon after the result was declared, Nath chall..."
2,1,Need detail breakout of commodity and transpor...
3,1,Send emails to Christi for further assistance ...
4,1,Please let me know what you find out from Gerald.


In [13]:
# count = 0
# for i in data.label:
#     if i=="ACTION":
#         data.label[count] = 1
#     else:
#         data.label[count] = 0
#     count+=1

In [14]:
data.label = list(map(int, data.label))

In [15]:
data['label'].value_counts()

0    1667
1    1332
Name: label, dtype: int64

In [16]:
# data.sample(frac = 1, random_state=42)

In [17]:
# data.to_csv("train_data.csv",index=False)

In [18]:
from sklearn.utils import shuffle
data = shuffle(data)
data.shape
data = data[:6000]
# data.tail(20)


In [19]:
data['label'].value_counts()

0    1667
1    1332
Name: label, dtype: int64

In [20]:
# data['level'].value_counts()

In [21]:
# array of sentences
sentences =  data.text.values

In [22]:
batch_size = 32 
# batch_size = 1065
MAX_LEN = 128

In [23]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [24]:
# tokenizer

In [25]:
# tokenizer.save_pretrained("distilbert-tokenizer")

In [26]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [label for label in df['label']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = MAX_LEN, truncation=True,
                                return_tensors="pt") for text in df['text']]
#         i=0
#         for text in df['response_post']:
#             print(text)
#             i+=1
#             print(i)
#         self.texts = [tokenizer(df, 
#                                padding='max_length', max_length = MAX_LEN, truncation=True,
#                                 return_tensors="pt")]
            

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [27]:
# data_6k = data[:4001]

In [28]:
# type(data_6k.response_post[1000])

In [29]:
np.random.seed(112)
df_train, df_val, df_test = np.split(data.sample(frac=1, random_state=42), 
                                     [int(.8*len(data)), int(.9*len(data))])

print(len(df_train),len(df_val), len(df_test))

2399 300 300


In [30]:
# df_train.iloc[453]

In [31]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):
        
        super(BertClassifier, self).__init__()
#         config = DistilBertConfig(n_layers=1,num_labels=768)
        self.bert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=768)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()
#         self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):

        pooled_output = self.bert(input_ids= input_id, attention_mask=mask).logits
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
#         final_layer_1 = self.sigmoid(final_layer)

        return final_layer

In [32]:
model=BertClassifier()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [33]:
# type(model)

In [34]:
# model
# deleteEncodingLayers(model, 3)

# Train main model

In [35]:
def train(model, train_data, val_data, learning_rate, epochs, adam_epsilon = 1e-8):

    train, val = Dataset(train_data), Dataset(val_data)
#     print(train, val)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)
    
#     print(len(train_dataloader))
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    #optimizer = Adam(model.parameters(), lr= learning_rate)
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
#             print(train_dataloader)
#             for i in train_dataloader:
#                 print(i)
            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)
#                 print(mask.shape, input_id.shape)  ### [132,1,32]   [132,32]
#                 print(type(mask[0][0][0]))    ### tensor

                output = model(input_id, mask)
#                 print(output)
#                 print(train_label.size())
                batch_loss = criterion(output, train_label)
#                 print(batch_loss)
            
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
#                 print(output.argmax(dim=1))
#                 print(train_label)
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                
                # Clip the norm of the gradients to 1.0
                # Gradient clipping is not in AdamW anymore
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                
                # Update parameters and take a step using the computed gradient
                optimizer.step()
                # Update learning rate schedule
                scheduler.step()
                # Clear the previous accumulated gradients
                optimizer.zero_grad()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)
                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')


EPOCHS = 5
# model = BertClassifier()
# LR = 1e-6
# LR = 0.9
LR = 2e-6 #
num_warmup_steps = 0
num_training_steps = len(df_train)*EPOCHS
adam_epsilon = 1e-8
train(model, df_train, df_val, LR, EPOCHS, adam_epsilon)

  1%|▌                                           | 1/75 [00:00<00:53,  1.38it/s]


RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 5.80 GiB total capacity; 4.16 GiB already allocated; 36.19 MiB free; 4.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [69]:
predictions = []
def evaluate(model, test_data):

    test = Dataset(test_data)
#     print(test)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
#             print(len(output[0]))
#             print(test_label)
            predictions.append({'Actual':test_label,'Pred':output.argmax(dim=1)})
            acc = (output.argmax(dim=1) == test_label).sum()
#             print(acc)
            total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return predictions
    
predictions = evaluate(model, df_test)
# print(predictions)

Test Accuracy:  0.981


In [70]:
df_metric = pd.DataFrame()

actuals = []
preds = []
for res in predictions:
  act = res['Actual']
  pre = res['Pred']
  actuals.extend(act.tolist())
  preds.extend(pre.tolist())

print(classification_report(actuals, preds))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       143
           1       0.97      0.99      0.98       127

    accuracy                           0.98       270
   macro avg       0.98      0.98      0.98       270
weighted avg       0.98      0.98      0.98       270



In [39]:
from sklearn.metrics import confusion_matrix
actuals = []
preds = []
for res in predictions:
  act = res['Actual']
  pre = res['Pred']
  actuals.extend(act.tolist())
  preds.extend(pre.tolist())
    
print(confusion_matrix(actuals, preds))

[[144   4]
 [  1 112]]


In [10]:
input_text = "my machine is not working"

padded_sequence = tokenizer(input_text, padding='max_length', max_length=128, truncation=True, return_tensors="pt")

input_id = padded_sequence['input_ids'].squeeze(1).to(device)
mask = padded_sequence['attention_mask'].to(device)

predictions = []

output = model(input_id, mask)
print(output)
output1 = output.argmax(dim=1)

if output1 == 1:
    # print(1)
    if output[0][1]>4.6:
        output1 = 1#'Empathy'
    else:
        # print(0)
        output1 = 0#'Not Empathy'
else:
    # print(0)
    output1 = 0#'Not Empathy'
predictions.append(output1)
print(predictions)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [72]:
output_model = 'action_item.pth'

In [73]:
model.state_dict

<bound method Module.state_dict of BertClassifier(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_featu

In [74]:
torch.save({'model_state_dict': model.state_dict()}, output_model)