In [1]:
# referenced https://www.curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/ as a tutorial

import os
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


In [2]:
class TextDataset(Dataset):
  def __init__(self, sa_texts, targets, tokenizer, max_token_len):
    self.sa_texts = sa_texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    
  def __len__(self):
    return len(self.sa_texts)

  def __getitem__(self, item):
    text = str(self.sa_texts[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

class SentimentClassifier(nn.Module):
  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.10)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 8, 'pin_memory': True} if use_cuda else {}

PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
batch_size = 32
n_classes = 5
epochs = 4
model = SentimentClassifier(n_classes).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss = nn.CrossEntropyLoss().to(device)

In [4]:
# Data from here: https://github.com/linanqiu/reddit-dataset
# edited headers for ease of use
reddit_df = pd.read_csv("news_news.csv")
reddit_df = reddit_df.dropna()

# # Data from here: https://www.kaggle.com/crowdflower/twitter-airline-sentiment/data
# twitter_df = pd.read_csv("Tweets.csv")
# twitter_df = twitter_df.dropna()

# Data from here: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data?select=train.tsv.zip
sst_df = pd.read_csv("train.tsv", sep='\t')
# https://stackoverflow.com/questions/42750551/converting-strings-to-a-lower-case-in-pandas
sst_df['text'] = sst_df['text'].str.lower()
sst_df = sst_df.dropna()

In [5]:
reddit_df.head()

Unnamed: 0.1,Unnamed: 0,text,id,subreddit,meta,time,author,ups,downs,authorlinkkarma,authorkarma,authorisgold
1,1,protesters lose jobs for not showing up to wo...,d02jvmq,news,news,1455667000.0,tiamdi,9.0,0.0,943,76932,0.0
2,2,i do believe they are nt understanding that th...,d02rgsy,news,news,1455678000.0,TechnologyIsAmazing,8.0,0.0,1,74,0.0
3,3,why did nt they care this much about their fre...,d02sy4c,news,news,1455681000.0,BlueSardines,1.0,0.0,90,17376,0.0
4,4,even if they wrote a program to stop the wait ...,d02svp7,news,news,1455681000.0,MagicalMick,1.0,0.0,19,1691,0.0
5,5,this is what the fbi was looking for the one ...,d02ty59,news,news,1455683000.0,disturbed_perturbed,1.0,0.0,3294,1161,0.0


In [6]:
sst_df.head()

Unnamed: 0,PhraseId,SentenceId,text,sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


In [7]:
max_token_len = 0
for text in reddit_df.text:
    max_token_len = max(len(text), max_token_len)
for text in sst_df.text:
    max_token_len = max(len(text), max_token_len)
if max_token_len > 500:
    max_token_len = round(max_token_len, -3) # round max token len to nearest 1000
else:
    max_token_len = 1000 # make max token len 1000
print("Standardized Length for BERT tokenizer:", max_token_len)

Standardized Length for BERT tokenizer: 3000


In [8]:
train_loader = DataLoader(
    TextDataset(
        sa_texts=sst_df.text.to_numpy(), 
        targets=sst_df.sentiment.to_numpy(), 
        tokenizer=tokenizer, 
        max_token_len=max_token_len
    ), 
    batch_size=batch_size, 
    **kwargs
)

In [9]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  n_examples
):
  model = model.train()
  losses = []
  correct_predictions = 0
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    optimizer.zero_grad()
  return correct_predictions.double() / n_examples, np.mean(losses)

In [10]:
for i in range(epochs):
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss,
        optimizer,
        device,
        len(sst_df)
      )

RuntimeError: CUDA out of memory. Tried to allocate 412.00 MiB (GPU 0; 10.75 GiB total capacity; 8.13 GiB already allocated; 426.75 MiB free; 8.21 GiB reserved in total by PyTorch)