In [1]:
import torch
from transformers import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import textwrap
import math
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import T5EncoderModel, T5Config, T5Tokenizer



clear_output()

In [2]:
# Load the two CSV files of the Amazon dataset; note the test frame is read from val_10k.csv.
# NOTE(review): absolute, machine-specific paths — consider a configurable data directory.
df_train = pd.read_csv('/home/m_nsu/ICLR/Datasets/Amazon/train_40k.csv')
df_test = pd.read_csv('/home/m_nsu/ICLR/Datasets/Amazon/val_10k.csv')

In [3]:
# Hold out 20% of the training frame as a validation split (fixed random_state for a repeatable split).
# NOTE(review): this rebinds df_train, so re-running the cell splits the already-reduced frame again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

In [4]:
# Keep only the text column and the three category columns in every split.
df_train, df_val, df_test = [
    frame[['Text','Cat1','Cat2','Cat3']]
    for frame in (df_train, df_val, df_test)
]

In [5]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,Text,Cat1,Cat2,Cat3
14307,"The concept of this toy is good. However, if y...",pet supplies,dogs,toys
17812,"This dryer ruined my hair!!! At first, after I...",beauty,hair care,styling tools
11020,Much to my surprise after a year of waiting th...,toys games,novelty gag toys,miniatures
15158,The tree is beautiful but upon arrival when I ...,grocery gourmet food,fresh flowers live indoor plants,live indoor plants
24990,Watchmaker offered to install a new battery in...,health personal care,household supplies,unknown


In [6]:
# Remove every row whose Cat3 value is the "unknown" placeholder, in all three splits.
df_train, df_val, df_test = [
    frame[frame['Cat3'] != 'unknown']
    for frame in (df_train, df_val, df_test)
]


In [7]:
# Stack the three splits row-wise into one frame; the original row indices are kept as-is.
df = pd.concat([df_train, df_val, df_test], axis=0)
df

Unnamed: 0,Text,Cat1,Cat2,Cat3
14307,"The concept of this toy is good. However, if y...",pet supplies,dogs,toys
17812,"This dryer ruined my hair!!! At first, after I...",beauty,hair care,styling tools
11020,Much to my surprise after a year of waiting th...,toys games,novelty gag toys,miniatures
15158,The tree is beautiful but upon arrival when I ...,grocery gourmet food,fresh flowers live indoor plants,live indoor plants
5980,HI MY NAME IS SHARON AND I JUST LOVE IT!!!!!!!...,toys games,action toy figures,playsets
...,...,...,...,...
9995,Stays on continuously without shutting off! It...,health personal care,health care,pain relievers
9996,these look great in our 10 gallon tank- colors...,pet supplies,fish aquatic pets,aquarium d cor
9997,"This works great, but needs a better way to at...",pet supplies,dogs,carriers travel products
9998,she absolutely LOVES this thing. I dice up gre...,pet supplies,dogs,toys


In [8]:
# Label-encode Cat1: one string -> integer mapping is built over the combined
# frame so the codes agree between the train, validation and test splits.
# (The uniques are bound to `cat1_labels`; the previous name `map` shadowed the builtin.)
cat1_codes, cat1_labels = pd.factorize(df['Cat1'])
df['Cat1-map'] = cat1_codes
cat1_map = {label: code for code, label in enumerate(cat1_labels)}   # label -> code
map_cat1 = {code: label for label, code in cat1_map.items()}         # code -> label


def _encode_cat1(frame):
  """Return a copy of `frame` whose Cat1 labels are replaced by integer codes.

  Args:
    frame: DataFrame with a `Cat1` column holding keys of `cat1_map`.

  Returns:
    A new DataFrame; `frame` itself is not modified.

  Raises:
    KeyError: if any Cat1 value has no entry in `cat1_map` (for instance when
      the cell is re-run on frames that were already encoded).
  """
  codes = frame['Cat1'].map(cat1_map)
  missing = codes.isna()
  if missing.any():
    unknown = frame.loc[missing, 'Cat1'].unique().tolist()
    raise KeyError(f'Cat1 values missing from cat1_map: {unknown}')
  # `assign` builds a new frame instead of writing into a filtered slice,
  # which is what used to trigger pandas' SettingWithCopyWarning here.
  return frame.assign(Cat1=codes.astype('int64'))


df_train = _encode_cat1(df_train)
df_val = _encode_cat1(df_val)
df_test = _encode_cat1(df_test)
clear_output()

In [9]:
# Checkpoint name passed to the tokenizer, config and encoder `from_pretrained` calls.
PRE_TRAINED_MODEL_NAME = 't5-large'

In [10]:
# Load the tokenizer and the model configuration for the chosen checkpoint.
tokenizer = T5Tokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
config = T5Config.from_pretrained(PRE_TRAINED_MODEL_NAME)
clear_output()  # wipe whatever the loading calls printed into the cell output

In [11]:
MAX_LEN = 200      # max_length handed to the tokenizer (padding/truncation target)
RANDOM_SEED = 42

# Seed NumPy and PyTorch so the randomly initialised classifier head is
# repeatable across kernel restarts (RANDOM_SEED was previously never used).
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Keep the original choice of the second GPU when it exists, but fall back to
# the first GPU or the CPU instead of failing on machines with fewer devices.
if torch.cuda.device_count() > 1:
  device = torch.device('cuda:1')
elif torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

In [12]:
# Show which device was selected.
print(device)

cuda:1


In [13]:
class IMDBDataset(Dataset):
  """Dataset pairing each text with its Cat1 label, tokenized on access."""

  def __init__(self, texts, cats1, tokenizer, max_len):
    """Keep the parallel text/label sequences and the tokenizer settings."""
    self.texts, self.cats1 = texts, cats1
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of examples, taken from the text sequence."""
    return len(self.texts)

  def __getitem__(self, item):
    """Tokenize example `item` and bundle it with its label.

    Returns:
      dict with keys 'text' (the example cast to str), 'input_ids' and
      'attention_mask' (the tokenizer tensors flattened to 1-D) and 'cat1'
      (the label as a torch.long tensor).
    """
    raw_text = str(self.texts[item])
    label = self.cats1[item]

    # Pad/truncate to `max_len` and ask the tokenizer for PyTorch tensors.
    tokenizer_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

    sample = {'text': raw_text}
    for field in ('input_ids', 'attention_mask'):
      sample[field] = encoded[field].flatten()
    sample['cat1'] = torch.tensor(label, dtype=torch.long)
    return sample

In [14]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=8):
  """Wrap a DataFrame split in an IMDBDataset and return a DataLoader over it.

  Args:
    df: DataFrame providing a `Text` column and an (already encoded) `Cat1` column.
    tokenizer: tokenizer forwarded to IMDBDataset.
    max_len: token length forwarded to IMDBDataset.
    batch_size: number of examples per batch.
    shuffle: whether the loader reshuffles the data every epoch. Defaults to
      False, which matches the previous behaviour; pass True for training data.
    num_workers: number of loader worker processes (previously fixed at 8).

  Returns:
    torch.utils.data.DataLoader yielding the dict batches built by IMDBDataset.
  """
  ds = IMDBDataset(
    texts=df.Text.to_numpy(),
    cats1=df['Cat1'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [15]:
BATCH_SIZE = 32

# One loader per split, all built with the same tokenizer, sequence length and batch size.
# NOTE(review): the training loader is created exactly like the evaluation
# loaders, i.e. without requesting shuffling between epochs.
train_data_loader, val_data_loader, test_data_loader = [
    create_data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_train, df_val, df_test)
]

In [16]:
# Pull a single batch from the training loader as a sanity check and list its fields.
data = next(iter(train_data_loader))
data.keys()

dict_keys(['text', 'input_ids', 'attention_mask', 'cat1'])

In [17]:
class IMDBClassifier(nn.Module):
  """T5 encoder followed by a single bias-free linear classification head.

  The encoder is executed under `torch.no_grad()`, so only the linear head
  receives gradients. The attribute names `bert` and `FC` are kept because
  they determine the parameter names used elsewhere in the notebook.
  """

  def __init__(self, n_classes):
    """Load the pretrained encoder and create a head with `n_classes` outputs.

    Args:
      n_classes: number of output logits. This argument used to be ignored
        (the head was hard-coded to 6 outputs); it is now honoured.
    """
    super(IMDBClassifier, self).__init__()
    self.bert = T5EncoderModel.from_pretrained(PRE_TRAINED_MODEL_NAME,config=config)

    self.FC = nn.Linear(config.hidden_size, n_classes, bias=False)


  def forward(self, input_ids, attention_mask):
    """Return one row of class logits per input sequence.

    Args:
      input_ids: token ids, passed straight to the encoder.
      attention_mask: attention mask, passed straight to the encoder.

    Returns:
      Output of the linear head applied to the mean of the encoder's
      last-layer embeddings.
    """
    with torch.no_grad():
      encoder_outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict = False
      )
    # Average the last-layer token embeddings over dim 1 (the sequence axis).
    # NOTE(review): the mean covers every position, padded ones included; a
    # mask-weighted mean may be preferable but would change the model's outputs.
    pooled_output = torch.mean(encoder_outputs[0], dim=1)

    binary_out = self.FC(pooled_output)

    return binary_out

In [18]:
# Derive the number of classes from the Cat1 label mapping instead of a magic 6.
model = IMDBClassifier(len(cat1_map))
model = model.to(device)
clear_output()

In [19]:
# Freeze the encoder: every parameter whose name starts with 'bert' stops
# tracking gradients; all other parameters are left untouched.
for param_name, weight in model.named_parameters():
    if not param_name.startswith('bert'):
        continue
    weight.requires_grad = False

In [20]:
#for name, param in model.named_parameters():
#    print(name, param.requires_grad)

In [21]:
# Move the sample batch's token ids and attention mask onto the selected device.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)


print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([32, 200])
torch.Size([32, 200])


In [22]:
#del test
# Release cached GPU memory that PyTorch's allocator holds but is not currently using.
torch.cuda.empty_cache()

In [23]:
# Smoke-test the forward pass on the sample batch and display the raw logits.
outs = model(input_ids, attention_mask)
outs

tensor([[ 0.0249,  0.0193,  0.0329, -0.1503,  0.1006,  0.0638],
        [ 0.0591, -0.0151, -0.0414, -0.0412,  0.0110,  0.0158],
        [ 0.0961,  0.0309,  0.0557, -0.1691,  0.0725,  0.0697],
        [ 0.0705,  0.0090,  0.0810, -0.2398,  0.1624,  0.0076],
        [ 0.0837, -0.0442,  0.0184, -0.1942,  0.1155, -0.0371],
        [ 0.0668,  0.0120,  0.0150, -0.1447,  0.0347, -0.0112],
        [ 0.0523, -0.0279, -0.0150, -0.0679,  0.0882,  0.0108],
        [ 0.1011,  0.0453,  0.1575, -0.1325,  0.1529,  0.0866],
        [ 0.1294,  0.0150,  0.1329, -0.1760,  0.1204,  0.1360],
        [ 0.0766,  0.0692,  0.1277, -0.1734,  0.1420,  0.0151],
        [ 0.0326,  0.0131, -0.0589, -0.1034,  0.0782,  0.0776],
        [ 0.1268,  0.0259,  0.0868, -0.1840,  0.1307,  0.0224],
        [ 0.0893,  0.0555, -0.0041, -0.1224,  0.1298,  0.0171],
        [ 0.0446,  0.0081,  0.0749, -0.2381,  0.0906,  0.0702],
        [ 0.0317,  0.0408,  0.0046, -0.0554,  0.0434,  0.0122],
        [ 0.0358,  0.0134, -0.0512,  0.0

In [24]:
EPOCHS = 8

# The AdamW class exported by `transformers` is deprecated in favour of the
# PyTorch implementation. `eps` and `weight_decay` are set explicitly to match
# the defaults of the transformers version (torch.optim.AdamW would otherwise
# use eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(
  model.parameters(),
  lr=0.001,
  eps=1e-6,
  weight_decay=0.0,
)
total_steps = len(train_data_loader) * EPOCHS

# Linear warm-up over the first fifth of all optimisation steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=math.floor((1./5)*total_steps),
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)
clear_output()

In [25]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)` and
      returning one row of logits per example.
    data_loader: iterable of dict batches with the keys "input_ids",
      "attention_mask" and "cat1".
    loss_fn: callable taking `(logits, targets)` and returning a scalar loss.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch, after the optimizer.
    n_examples: denominator used for the accuracy.

  Returns:
    `(accuracy, mean_loss)` where `accuracy` is a 0-d float64 tensor
    (correct predictions / n_examples) and `mean_loss` is a NumPy float64
    (NaN when the loader yields no batches).
  """
  model = model.train()

  losses = []
  # A tensor accumulator keeps `.double()` below valid even for an empty loader
  # (a plain int 0 would raise AttributeError).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    cat1 = d["cat1"].to(device)

    # Clear gradients before the backward pass so that nothing left over from
    # an earlier backward call leaks into this batch's update.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, cat1)

    correct_predictions += torch.sum(preds == cat1)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  mean_loss = np.mean(losses) if losses else np.float64('nan')
  return correct_predictions.double() / n_examples, mean_loss

In [26]:
def eval_model(model, data_loader, loss_fn, device, n_examples, on_new=False):
  """Evaluate `model` on `data_loader` without updating any weights.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)` and
      returning one row of logits per example.
    data_loader: iterable of dict batches with the keys "input_ids",
      "attention_mask" and "cat1".
    loss_fn: callable taking `(logits, targets)` and returning a scalar loss.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.
    on_new: accepted for backward compatibility; not used by this function.

  Returns:
    `(accuracy, mean_loss)` where `accuracy` is a 0-d float64 tensor
    (correct predictions / n_examples) and `mean_loss` is a NumPy float64
    (NaN when the loader yields no batches).
  """
  model = model.eval()

  losses = []
  # A tensor accumulator keeps `.double()` below valid even for an empty loader
  # (a plain int 0 would raise AttributeError).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      cat1 = d["cat1"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, cat1)

      correct_predictions += torch.sum(preds == cat1)
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else np.float64('nan')
  return correct_predictions.double() / n_examples, mean_loss

In [27]:
%%time

# Per-epoch history of training/validation accuracy and loss, plus the best
# validation accuracy seen so far (used to decide when to checkpoint).
train_a = []
train_l = []
val_a = []
val_l = []
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training loader.
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the current weights on the validation loader.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  train_a.append(train_acc)
  train_l.append(train_loss)
  val_a.append(val_acc)
  val_l.append(val_loss)

  # Save the full state dict whenever validation accuracy improves on the best so far.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'baseline_t5_best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/8
----------
Train loss 1.6205975795727645 accuracy 0.3933969769291965
Val   loss 1.400238669874296 accuracy 0.5027741083223249

Epoch 2/8
----------
Train loss 1.2036537059038592 accuracy 0.622149297268629
Val   loss 1.0362986182864709 accuracy 0.6672391017173052

Epoch 3/8
----------
Train loss 0.9555213156838686 accuracy 0.7087311058074781
Val   loss 0.9022605610296193 accuracy 0.7083223249669749

Epoch 4/8
----------
Train loss 0.8553834918447921 accuracy 0.7347852028639619
Val   loss 0.8388205477457006 accuracy 0.7256274768824307

Epoch 5/8
----------
Train loss 0.8022850884928556 accuracy 0.7478122513922036
Val   loss 0.8067228704816681 accuracy 0.7334214002642008

Epoch 6/8
----------
Train loss 0.7706339136793151 accuracy 0.7565632458233891
Val   loss 0.7877737359155582 accuracy 0.7373844121532365

Epoch 7/8
----------
Train loss 0.7567904820371407 accuracy 0.758419517369398
Val   loss 0.7781156912634645 accuracy 0.7393659180977543

Epoch 8/8
----------
Train loss 0.747

In [28]:
# Convert the recorded metrics to plain Python floats for plotting.
# float() accepts 0-d tensors, NumPy scalars and Python floats alike, so this
# cell can be re-run safely (the earlier `.item()` calls raised AttributeError
# once the lists already held floats).
train_a = [float(value) for value in train_a]
train_l = [float(value) for value in train_l]
val_a = [float(value) for value in val_a]
val_l = [float(value) for value in val_l]

In [29]:
# Accuracy per epoch for both splits, drawn on one axis fixed to the 0-1 range.
fig, ax = plt.subplots()
ax.plot(train_a, label='train accuracy')
ax.plot(val_a, label='validation accuracy')
ax.set(title='Training history', ylabel='Accuracy', xlabel='Epoch')
ax.legend()
ax.set_ylim([0, 1]);

Accuracy of Cat1 on Test Set

In [30]:
# Accuracy on the test loader; the returned loss is discarded.
# NOTE(review): this scores the weights currently held by `model`; the checkpoint
# saved as 'baseline_t5_best_model_state.bin' is not reloaded before evaluating.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

test_acc.item()

0.6867444549563755

In [33]:
def get_predictions(model, data_loader, device=None):
  """Collect texts, predicted classes, class probabilities and labels.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)` and
      returning one row of logits per example.
    data_loader: iterable of dict batches with the keys "text", "input_ids",
      "attention_mask" and "cat1".
    device: device the inputs are moved to. When omitted, the device of the
      model's first parameter is used (CPU if the model has no parameters),
      so the function no longer depends on a global `device`.

  Returns:
    `(review, predictions, prediction_probs, real_values)`:
    the list of texts, a 1-D tensor of predicted class indices, a 2-D tensor
    of softmax probabilities (one row per example) and a 1-D tensor of the
    true labels. All tensors are on the CPU; for an empty loader they are empty.
  """
  model = model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  review = []
  # Whole batches are collected and concatenated once at the end, instead of
  # stacking one 0-d tensor per example.
  pred_batches = []
  prob_batches = []
  label_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
      )
      _, preds = torch.max(outputs, dim=1)
      probs = F.softmax(outputs, dim=1)

      review.extend(d["text"])
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      label_batches.append(d["cat1"].cpu())

  if not pred_batches:
    # torch.cat/stack reject empty lists; return well-typed empty tensors instead.
    empty_ids = torch.empty(0, dtype=torch.long)
    return review, empty_ids, torch.empty(0, 0), empty_ids.clone()

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(label_batches)

  return review, predictions, prediction_probs, real_values

In [34]:
# Gather texts, predicted classes, class probabilities and true labels for the test loader.
y_review_texts, y_pred, y_pred_probs, y_test= get_predictions(
  model,
  test_data_loader
)

In [35]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Cat1 Classification Report

In [37]:
# cat1_map was filled in code order (0, 1, 2, ...), so its key order lines up with the integer labels.
# NOTE(review): target_names presumably must match the labels that actually occur in y_test/y_pred — confirm every class is present.
class_names = list(cat1_map.keys())
print(classification_report(y_test, y_pred, target_names=class_names))

                      precision    recall  f1-score   support

        pet supplies       0.79      0.62      0.69      1576
              beauty       0.77      0.65      0.71      2027
          toys games       0.66      0.83      0.73      1533
grocery gourmet food       0.67      0.65      0.66       811
health personal care       0.67      0.72      0.70      2936
       baby products       0.45      0.51      0.48       630

            accuracy                           0.69      9513
           macro avg       0.67      0.66      0.66      9513
        weighted avg       0.70      0.69      0.69      9513

