#Deep Learning Experiments for Text Dataset

In [1]:
!pip install -q transformers torch pymongo

## Loading Data from Database

In [2]:
from pymongo import MongoClient
import os
from google.colab import userdata

In [3]:
CONN_STR = userdata.get('CONN_STR')

In [4]:
client = MongoClient(CONN_STR)
db = client['PAI-Project']
table = db['TextDataCleaned']

In [5]:
records = table.find({})
records = list(records)

In [6]:
import pandas as pd

In [7]:
df = pd.DataFrame(records)

In [8]:
df.head()

Unnamed: 0,_id,target,input
0,676d86a5bd348b6b203f39ce,1,"Regular check-in post, with information about ..."
1,676d86a5bd348b6b203f39cf,1,Our most-broken and least-understood rules is ...
2,676d86a5bd348b6b203f39d0,1,"I haven’t been touched, or even hugged, in so ..."
3,676d86a5bd348b6b203f39d1,1,Being Depressed is Embarrassing\n\nI’m just so...
4,676d86a5bd348b6b203f39d2,1,I'm desperate for a friend and to feel loved b...


# Custom Dataset Class

In [9]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [10]:
class TextDataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [11]:
from sklearn.model_selection import train_test_split

X = df['input'].values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
train_ds = TextDataset(X_train, y_train)
test_ds = TextDataset(X_test, y_test)

In [13]:
train_ds_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
test_ds_loader = DataLoader(test_ds, batch_size=16, shuffle=True)

# Custom Model and Tokenizer

In [14]:
from transformers import AutoTokenizer, AutoModel

In [15]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
class CustomBERT(nn.Module):
  def __init__(self, model_name, num_classes):
    super(CustomBERT, self).__init__()
    self.bert = AutoModel.from_pretrained(model_name)
    self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, x):
    outputs = self.bert(**x)
    pooled_output = outputs.pooler_output
    logits = self.fc(pooled_output)
    return logits

In [26]:
torch.set_float32_matmul_precision('high')

In [23]:
m = CustomBERT(model_name, 5)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
m.to(device);

In [24]:
opt = torch.optim.Adam(m.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

## Training Loop

In [25]:
def evaluate(m):
  m.eval()
  train_accs = 0
  c = 0
  for batch in train_ds_loader:
    if c == 50:
      break
    x, y = batch
    x = tokenizer(x, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    x = x.to(device)
    y = y.to(device)

    with torch.no_grad():
      with torch.autocast(device_type=device, dtype=torch.float16):
        logits = m(x)
    y_preds = torch.argmax(logits, dim=1)
    train_acc = (y_preds == y).float().mean()
    train_accs += train_acc
    c += 1

  avg_train_acc = train_accs / c
  print(f'Train Dataset Accuracy: {avg_train_acc}')

  test_accs = 0
  c= 0
  for batch in test_ds_loader:
    if c == 50:
      break
    x, y = batch
    x = tokenizer(x, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    x = x.to(device)
    y = y.to(device)
    with torch.no_grad():
      with torch.autocast(device_type=device, dtype=torch.float16):
        logits = m(x)
    y_preds = torch.argmax(logits, dim=1)
    test_acc = (y_preds == y).float().mean()
    test_accs += test_acc
    c += 1

  avg_test_acc = test_accs / c
  print(f'Test Dataset Accuracy: {avg_test_acc}')
  m.train()

In [27]:
from tqdm import tqdm

for epoch in range(5):
  pb = tqdm(train_ds_loader, leave=False, desc=f'Epoch {epoch}')
  for batch in pb:
    x, y = batch
    x = tokenizer(x, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    x = x.to(device)
    y = y.to(device)

    opt.zero_grad()
    with torch.autocast(device_type=device, dtype=torch.float16):
      logits = m(x)
      loss = loss_fn(logits, y)
    loss.backward()
    opt.step()
    pb.set_postfix({'loss': loss.item()})
  print(f'Stats for Epoch {epoch + 1}')
  evaluate(m)
  print('---')



Stats for Epoch 1
Train Dataset Accuracy: 0.8262499570846558
Test Dataset Accuracy: 0.7699999809265137
---




Stats for Epoch 2
Train Dataset Accuracy: 0.9024999737739563
Test Dataset Accuracy: 0.8037499785423279
---




Stats for Epoch 3
Train Dataset Accuracy: 0.9662500023841858
Test Dataset Accuracy: 0.8312499523162842
---




Stats for Epoch 4
Train Dataset Accuracy: 0.9837499856948853
Test Dataset Accuracy: 0.8362500071525574
---




Stats for Epoch 5
Train Dataset Accuracy: 0.9912499785423279
Test Dataset Accuracy: 0.8524999618530273
---


# Model with Dropout

to avoid overfitting.

In [30]:
class CustomBERTwithDropout(nn.Module):
  def __init__(self, model_name, num_classes):
    super().__init__()
    self.bert = AutoModel.from_pretrained(model_name)
    self.dropout = nn.Dropout(0.1)
    self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, x):
    outputs = self.bert(**x)
    pooled_output = outputs.pooler_output
    dropout_output = self.dropout(pooled_output)
    logits = self.fc(dropout_output)
    return logits

In [31]:
m2 = CustomBERTwithDropout(model_name, 5)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
m2.to(device);

In [32]:
opt = torch.optim.Adam(m2.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [35]:
for epoch in range(5):
  pb = tqdm(train_ds_loader, leave=False, desc=f'Epoch {epoch}')
  for batch in pb:
    x, y = batch
    x = tokenizer(x, padding='max_length', truncation=True, return_tensors='pt', max_length=512)
    x = x.to(device)
    y = y.to(device)

    opt.zero_grad()
    with torch.autocast(device_type=device, dtype=torch.float16):
      logits = m2(x)
      loss = loss_fn(logits, y)
    loss.backward()
    opt.step()
    pb.set_postfix({'loss': loss.item()})
  print(f'Stats for Epoch {epoch + 1}')
  evaluate(m2)
  print('---')



Stats for Epoch 1
Train Dataset Accuracy: 0.8499999642372131
Test Dataset Accuracy: 0.8199999928474426
---




Stats for Epoch 2
Train Dataset Accuracy: 0.9274999499320984
Test Dataset Accuracy: 0.8174999952316284
---




Stats for Epoch 3
Train Dataset Accuracy: 0.9724999666213989
Test Dataset Accuracy: 0.8399999737739563
---




Stats for Epoch 4
Train Dataset Accuracy: 0.9837499856948853
Test Dataset Accuracy: 0.8487499952316284
---




Stats for Epoch 5
Train Dataset Accuracy: 0.9887499809265137
Test Dataset Accuracy: 0.8362500071525574
---
