In [None]:
import pandas as pd
import numpy as np
from torch import nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
import torch

In [None]:
df = pd.read_csv('./raw_data/fulltrain.csv')

In [None]:
df.head()

Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [None]:
df.columns = ["label", "text"]
label_counts = df['label'].value_counts()
df_len = np.arange(len(df))
label_1_idx = df_len[df.iloc[df_len]['label'] == 1]
label_1 = np.random.choice(label_1_idx, 150, replace=False)
label_2_idx = df_len[df.iloc[df_len]['label'] == 2]
label_2 = np.random.choice(label_2_idx, 150, replace=False)
label_3_idx = df_len[df.iloc[df_len]['label'] == 3]
label_3 = np.random.choice(label_3_idx, 150, replace=False)
label_4_idx = df_len[df.iloc[df_len]['label'] == 4]
label_4 = np.random.choice(label_4_idx, 150, replace=False)

new_texts = np.append(label_1, label_2)
new_texts = np.append(new_texts, label_3)
new_texts = np.append(new_texts, label_4)

new_data = {
    "label": [df.iloc[i]["label"] for i in new_texts],
    "text": [df.iloc[i]['text'] for i in new_texts]
}

new_df = pd.DataFrame(new_data)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(new_df.index.values,
                                                    new_df.label.values,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=new_df.label.values)

In [None]:
new_df['data_type'] = ['none'] * new_df.shape[0]

new_df.loc[X_train, 'data_type'] = 'train'
new_df.loc[X_val, 'data_type'] = 'val'

new_df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,data_type,Unnamed: 2_level_1
1,train,120
1,val,30
2,train,120
2,val,30
3,train,120
3,val,30
4,train,120
4,val,30


In [None]:
model_path = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_path)

class Dataset(torch.utils.data.Dataset):
  def __init__(self, texts_idx, labels, dataframe=new_df):
    self.labels = labels
    self.texts = [tokenizer(dataframe.iloc[idx]['text'],
                            padding='max_length',
                            max_length=512,
                            truncation=True,
                            return_tensors='pt')
                  for idx in texts_idx]  
  def classes(self):
      return self.labels

  def __len__(self):
      return len(self.labels)

  def get_batch_labels(self, idx):
      # Fetch a batch of labels
      return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
      # Fetch a batch of inputs
      return self.texts[idx]

  def __getitem__(self, idx):

      batch_texts = self.get_batch_texts(idx)
      batch_y = self.get_batch_labels(idx)

      return batch_texts, batch_y 

In [35]:
class BertClassifier(nn.Module):
  def __init__(self, dropout=0.5, bert=BertModel.from_pretrained(model_path)):
    super(BertClassifier, self).__init__()

    self.bert = bert
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 5)
    self.relu = nn.ReLU()
  
  def forward(self, input_id, mask):
    _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)
    
    return final_layer

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
from tqdm import tqdm

def train(model, xtrain, ytrain, xtest, ytest, learning_rate, epochs):

    train, val = Dataset(xtrain, ytrain), Dataset(xtest, ytest)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(xtrain): .3f} \
                | Train Accuracy: {total_acc_train / len(xtrain): .3f} \
                | Val Loss: {total_loss_val / len(xtest): .3f} \
                | Val Accuracy: {total_acc_val / len(xtest): .3f}')
                            
            model.bert.save_pretrained('./raw_data/model_bert_mid1.pth')
                  
EPOCHS = 2
model = BertClassifier()
LR = 1e-5
train(model, X_train, y_train, X_val, y_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 480/480 [33:02<00:00,  4.13s/it]


Epochs: 1 | Train Loss:  1.162                 | Train Accuracy:  0.573                 | Val Loss:  0.732                 | Val Accuracy:  0.792


100%|██████████| 480/480 [39:42<00:00,  4.96s/it]


Epochs: 2 | Train Loss:  0.526                 | Train Accuracy:  0.877                 | Val Loss:  0.293                 | Val Accuracy:  0.950


In [31]:
df_test = pd.read_csv('./raw_data/balancedtest.csv')

df_test_len = np.arange(len(df_test))
label_1_idx_test = df_test_len[df_test.iloc[df_test_len]['label'] == 1]
label_1_test = np.random.choice(label_1_idx_test, 150, replace=False)
label_2_idx_test = df_test_len[df_test.iloc[df_test_len]['label'] == 2]
label_2_test = np.random.choice(label_2_idx_test, 150, replace=False)
label_3_idx_test = df_test_len[df_test.iloc[df_test_len]['label'] == 3]
label_3_test = np.random.choice(label_3_idx_test, 150, replace=False)
label_4_idx_test = df_test_len[df_test.iloc[df_test_len]['label'] == 4]
label_4_test = np.random.choice(label_4_idx_test, 150, replace=False)

new_texts_test = np.append(label_1_test, label_2_test)
new_texts_test = np.append(new_texts_test, label_3_test)
new_texts_test = np.append(new_texts_test, label_4_test)

new_test_data = {
    "label": [df_test.iloc[i]["label"] for i in new_texts_test],
    "text": [df_test.iloc[i]['text'] for i in new_texts_test]
}

new_test_df = pd.DataFrame(new_test_data)

In [33]:
from sklearn.metrics import classification_report


def evaluate(model, xdata, ydata, df):
    y_true = []
    y_output = []
    test = Dataset(xdata, ydata, df)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in tqdm(test_dataloader):

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)
              y_true.append(test_label)
              y_output.append(output.argmax(dim=1))
              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(xdata): .3f}')
    return y_true, y_output

test_labels, test_res = evaluate(model, new_test_df.index.values, new_test_df.label.values, new_test_df)

100%|██████████| 600/600 [15:18<00:00,  1.53s/it]

Test Accuracy:  0.447





In [34]:
print(classification_report(list(map(lambda x: x.sum().item(), test_labels)), list(map(lambda x: x.sum().item(), test_res))))

              precision    recall  f1-score   support

           1       0.74      0.55      0.63       150
           2       0.32      0.16      0.21       150
           3       0.23      0.29      0.26       150
           4       0.53      0.78      0.63       150

    accuracy                           0.45       600
   macro avg       0.45      0.45      0.43       600
weighted avg       0.45      0.45      0.43       600

