<a href="https://colab.research.google.com/github/miawoolfson/SentimAIl/blob/BERT-Model/BERT%20Model/Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
import os
from time import time
from datetime import datetime

from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build

import json
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from transformers import BertTokenizer, BertForSequenceClassification, BertConfig

import torch
from torch import optim, nn
from torch.utils.data import Dataset, DataLoader

# Configure

In [2]:
# Run identifier; it is embedded in the model and checkpoint directory names.
NAME = 'Multiple Emotions 1.0'

# Project root on the mounted Google Drive (note the trailing slash).
PATH = '/content/drive/My Drive/DS Final Project/BERT Model/'
# Dataset file name, read from PATH + 'Data/'.
DATASET = 'emotions_6_types.csv'
# Source columns that get renamed to 'email' / 'sentiment' when those are absent.
TEXT_COLUMN = 'sentence'
SENTIMENT_COLUMN = 'emotion'
# Upper bound on the number of rows kept per sentiment class.
MAX_SENTIMENT_GROUP_SIZE = 10000
# Maximum tokenized sequence length passed to the tokenizer (longer inputs are truncated).
MAX_TOKEN_LENGTH = 512

# Training hyperparameters.
NUM_EPOCHS = 30
LEARNING_RATE = 2e-5
# Used for both the hidden-state and the attention dropout of the BERT config.
DROPOUT_PROB = 0.4

In [3]:
# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")
# Never elide columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# Wall-clock start; compared with end_time later to report the total runtime.
start_time = time()

# Mount Google Drive, authenticate the Colab user and build a Drive v3 API client.
drive.mount('/content/drive')
auth.authenticate_user()
drive_service = build('drive', 'v3')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Save and Load

In [4]:
# Every named model keeps its hyperparameters in a config.json inside its own directory.
model_dir = PATH + 'Models/Model - ' + NAME
os.makedirs(model_dir, exist_ok=True)
config_path = model_dir + '/config.json'

if not os.path.exists(config_path):
    # First run for this NAME: persist the settings chosen in the configuration cell.
    current_settings = {
        'DATASET': DATASET,
        'MAX_TOKEN_LENGTH': MAX_TOKEN_LENGTH,
        'LEARNING_RATE': LEARNING_RATE,
        'DROPOUT_PROB': DROPOUT_PROB
    }
    with open(config_path, 'w') as f:
        json.dump(current_settings, f, indent=4)
    print('Saved config to', config_path)
else:
    # A config already exists for this NAME: it wins over the values set above.
    with open(config_path, 'r') as f:
        config = json.load(f)

    DATASET = config['DATASET']
    MAX_TOKEN_LENGTH = config['MAX_TOKEN_LENGTH']
    LEARNING_RATE = config['LEARNING_RATE']
    DROPOUT_PROB = config['DROPOUT_PROB']

    print('Loaded config from', config_path)
    print('WARNING - Config overridden!')

Saved config to /content/drive/My Drive/DS Final Project/BERT Model/Models/Model - Multiple Emotions 1.0/config.json


## Functions

In [5]:
class EmailDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping with at least 'input_ids' and 'attention_mask'
            entries, each indexable by sample position (tensors or nested lists).
        labels: Sequence of labels indexable by the same positions.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label of sample ``idx`` as tensors."""
        # as_tensor reuses data that is already a tensor instead of
        # copy-constructing it; torch.tensor(tensor) copies every item and
        # emits a UserWarning.
        return {
            'input_ids': torch.as_tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.as_tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.as_tensor(self.labels[idx])
        }

In [6]:
def plot_graph(eval_train, eval_test, title, legend_labels=('train', 'validation')):
    """Plot two per-epoch metric curves on one figure.

    Args:
        eval_train: Sequence of metric values for the first (training) curve.
        eval_test: Sequence of metric values for the second curve.
        title: Metric name; used as the y-axis label and in the figure title.
        legend_labels: Legend entries for the two curves, in plotting order.
            Defaults to 'train' / 'validation' because the callers in this
            notebook pass validation histories as the second curve.
    """
    plt.figure(figsize=(13, 6))
    plt.plot(eval_train)
    plt.plot(eval_test)
    plt.title('model ' + title)
    plt.ylabel(title)
    plt.xlabel('epoch')
    # Epochs are whole numbers, so keep the x ticks on integers.
    plt.locator_params(axis="x", integer=True, tight=True)
    plt.legend(list(legend_labels), loc='upper left')

In [7]:
def plot_confusion_matrix(y_true, y_pred, figsize=(6, 5), cmap="Blues", class_names=None):
    """Draw a confusion-matrix heatmap whose tick labels match the data.

    Args:
        y_true: Iterable of true labels (encoded integers or label strings).
        y_pred: Iterable of predicted labels, same kind as ``y_true``.
        figsize: Figure size forwarded to ``plt.figure``.
        cmap: Colormap forwarded to ``sns.heatmap``.
        class_names: Optional tick labels, one per distinct label in sorted
            order. When omitted, string labels are shown as-is and any other
            labels are decoded with the global ``map_sentiment`` encoder.
    """
    # Pin the label order so matrix rows/columns and tick labels cannot drift
    # apart, e.g. when the inputs are strings rather than encoded ids.
    observed_labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=observed_labels)

    if class_names is None:
        if all(isinstance(label, str) for label in observed_labels):
            class_names = observed_labels
        else:
            class_names = map_sentiment.inverse_transform(observed_labels)

    plt.figure(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt="d", cmap=cmap, xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")

In [8]:
def compute_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels`` as a Python float."""
    matches = torch.eq(preds, labels)
    return matches.to(torch.float32).mean().item()

# Prepare Data

## Load Data

In [9]:
# Choose the reader from the file extension: CSV, otherwise Parquet.
data_path = PATH + 'Data/' + DATASET
read_dataset = pd.read_csv if DATASET.endswith(".csv") else pd.read_parquet
raw_df = read_dataset(data_path)

# Bring the dataset's own column names to the 'email' / 'sentiment' names used below.
for source_column, target_column in ((TEXT_COLUMN, 'email'), (SENTIMENT_COLUMN, 'sentiment')):
    if target_column not in raw_df.columns:
        raw_df = raw_df.rename(columns={source_column: target_column})

# Only the text and its label are needed from here on.
df = raw_df[['email', 'sentiment']]

In [10]:
# Balance the classes: keep as many rows per sentiment as the rarest one has,
# capped at MAX_SENTIMENT_GROUP_SIZE.
rows_per_sentiment = df.groupby('sentiment').size()
smallest_group = rows_per_sentiment.min()
group_size = min(smallest_group, MAX_SENTIMENT_GROUP_SIZE)

# head() keeps the first `group_size` rows of each sentiment.
df = df.groupby('sentiment').head(group_size)

print(f'Saved {group_size} samples from each sentiment.')

Saved 10000 samples from each sentiment.


In [11]:
# Encode sentiment names as integer class ids.
# The encoder is always fitted on the original labels in raw_df (selected by
# df's row index), never on df's own column. That keeps the cell idempotent:
# re-running it after df['sentiment'] already holds integers would otherwise
# re-fit on those integers and lose the class names in map_sentiment.classes_.
# NOTE(review): assumes raw_df's index is unique, as with a default read_csv index.
map_sentiment = LabelEncoder()
original_labels = raw_df.loc[df.index, 'sentiment']
# assign() builds a new frame instead of writing into a frame derived from raw_df.
df = df.assign(sentiment=map_sentiment.fit_transform(original_labels))

## Split Data

In [12]:
# Features are the raw texts; targets are the encoded sentiment ids.
X_df, y_df = df['email'], df['sentiment']

In [13]:
# Fixed seed: the split must be identical on every run. Training can resume
# from a checkpoint after a kernel restart, and an unseeded split would then
# put samples the model has already trained on into the validation/test sets.
SPLIT_SEED = 42

# 80% train, then the remaining 20% is halved into validation and test (10% each),
# stratified so that every split keeps the class balance.
x_train, x_test_val, y_train, y_test_val = train_test_split(
    X_df, y_df, test_size=0.2, stratify=y_df, random_state=SPLIT_SEED)
x_val, x_test, y_val, y_test = train_test_split(
    x_test_val, y_test_val, test_size=0.5, stratify=y_test_val, random_state=SPLIT_SEED)

In [14]:
# Give each label Series a fresh 0..n-1 index so that labels[idx] in
# EmailDataset addresses rows by position within the split.
y_train, y_val, y_test = (
    labels.reset_index(drop=True) for labels in (y_train, y_val, y_test)
)

## Tokenize

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Respect the configured (or config.json-loaded) MAX_TOKEN_LENGTH instead of
# resetting it to a hard-coded value; only cap it at the tokenizer's own limit.
MAX_TOKEN_LENGTH = min(MAX_TOKEN_LENGTH, tokenizer.model_max_length)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
def _encode_split(texts):
    """Tokenize one split's texts into padded, truncated PyTorch tensors."""
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
        return_tensors='pt',
    )


X_train_encoded = _encode_split(x_train)

X_val_encoded = _encode_split(x_val)

X_test_encoded = _encode_split(x_test)

# Model

In [17]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [18]:
# Wrap each split's encodings and labels in a dataset.
train_dataset, val_dataset, test_dataset = (
    EmailDataset(encodings, labels)
    for encodings, labels in (
        (X_train_encoded, y_train),
        (X_val_encoded, y_val),
        (X_test_encoded, y_test),
    )
)

# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=32) for dataset in (val_dataset, test_dataset)
)

## Train

In [19]:
# Start from the stock bert-base-uncased configuration and override the
# classifier size and both dropout probabilities.
config = BertConfig.from_pretrained('bert-base-uncased')
bert_overrides = {
    'num_labels': len(map_sentiment.classes_),
    'hidden_dropout_prob': DROPOUT_PROB,
    'attention_probs_dropout_prob': DROPOUT_PROB,
}
for attribute, value in bert_overrides.items():
    setattr(config, attribute, value)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Per-epoch metric histories; replaced by the stored ones when a checkpoint is resumed.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

# First epoch to run (0 unless a checkpoint is found) and where checkpoints live.
start_epoch = 0
checkpoint_dir = PATH + '/Checkpoints/Model - ' + NAME

In [21]:
os.makedirs(checkpoint_dir, exist_ok=True)

# Checkpoints are named model_epoch_<N>.pt; resume from the highest N present.
checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.startswith("model_epoch_")]
if checkpoint_files:
    last_epoch = max(int(f.split('_')[-1].split('.')[0]) for f in checkpoint_files)
    checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{last_epoch}.pt")
    # map_location lets a checkpoint written on a GPU runtime be resumed on
    # whatever device this session uses (e.g. CPU-only).
    checkpoint = torch.load(checkpoint_path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    train_losses = checkpoint['train_losses']
    val_losses = checkpoint['val_losses']
    train_accuracies = checkpoint['train_accuracies']
    val_accuracies = checkpoint['val_accuracies']
    # The stored epoch was completed, so continue with the next one.
    start_epoch = checkpoint['epoch'] + 1

    print(f"Resumed from epoch {start_epoch}")

In [None]:
def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return (mean loss, accuracy) per sample.

    Args:
        loader: DataLoader yielding dicts with 'input_ids', 'attention_mask'
            and 'labels'.
        training: When True the model is put in train mode and the optimizer
            is stepped on every batch; when False the model is evaluated
            without gradient tracking.

    Returns:
        Tuple of (average loss, accuracy), both weighted by the number of
        samples so that a smaller final batch does not skew the result.
    """
    model.train(training)
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(output.logits, labels)

            if training:
                loss.backward()
                optimizer.step()

            preds = torch.argmax(output.logits, dim=1)
            batch_size = labels.size(0)
            # criterion averages over the batch, so scale back to a per-sample sum.
            total_loss += loss.item() * batch_size
            total_correct += (preds == labels).sum().item()
            total_samples += batch_size

    return total_loss / total_samples, total_correct / total_samples


for epoch in tqdm(range(start_epoch, NUM_EPOCHS), desc="Training"):
    avg_train_loss, avg_train_accuracy = _run_epoch(train_loader, training=True)
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)

    # Validation
    avg_val_loss, avg_val_accuracy = _run_epoch(val_loader, training=False)
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    # Save full checkpoint for this epoch
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accuracies': train_accuracies,
        'val_accuracies': val_accuracies
    }

    checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch}.pt")
    torch.save(checkpoint, checkpoint_path)

    # The previous checkpoint is dropped only after the new one is written.
    # A missing file must not abort training.
    if epoch > 0:
        last_checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch - 1}.pt")
        if os.path.exists(last_checkpoint_path):
            os.remove(last_checkpoint_path)
            # NOTE(review): emptyTrash acts on the whole Drive trash of the
            # authenticated account, not just this checkpoint — presumably done
            # to free quota held by the removed file; confirm this is intended.
            drive_service.files().emptyTrash().execute()

Training:   3%|▎         | 1/30 [14:16<6:54:00, 856.58s/it]

In [None]:
# Loss per epoch: training history against validation history.
plot_graph(eval_train=train_losses, eval_test=val_losses, title='loss')

In [None]:
# Accuracy per epoch: training history against validation history.
plot_graph(eval_train=train_accuracies, eval_test=val_accuracies, title='Accuracy')

## Evaluate

In [None]:
# Evaluate on the held-out test set. Loss and accuracy are accumulated per
# sample (not averaged over per-batch means), so a smaller final batch does not
# skew them and the printed accuracy matches the one derived from pred_labels.
model.eval()
total_loss = 0.0
total_correct = 0
total_samples = 0
pred_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        predictions = torch.argmax(outputs.logits, dim=1)
        # Kept in loader order; the test loader is not shuffled, so this lines up with y_test.
        pred_labels.extend(predictions.tolist())

        loss = criterion(outputs.logits, labels)

        batch_size = labels.size(0)
        # criterion averages over the batch, so scale back to a per-sample sum.
        total_loss += loss.item() * batch_size
        total_correct += (predictions == labels).sum().item()
        total_samples += batch_size

test_loss = total_loss / total_samples
test_accuracy = total_correct / total_samples

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Confusion matrix of the test-set predictions collected during evaluation.
plot_confusion_matrix(y_true=y_test, y_pred=pred_labels)

In [None]:
# Report per-class metrics under the sentiment names rather than the encoded
# ids, consistent with the confusion matrix. Passing the full id list keeps
# names and rows aligned even if some class is absent from the test predictions.
class_ids = list(range(len(map_sentiment.classes_)))
class_names = [str(name) for name in map_sentiment.classes_]
print("Classification Report: \n", classification_report(y_test, pred_labels, labels=class_ids, target_names=class_names))

# Wrap-Up

In [None]:
# Persist everything needed for inference: tokenizer, fine-tuned model and label encoder.
model_dir = PATH + '/Models/Model - ' + NAME
os.makedirs(model_dir, exist_ok=True)

tokenizer_dir = model_dir +'/Tokenizer'
weights_dir = model_dir +'/Model'
encoder_path = model_dir + '/mapped_sentiments.pkl'

tokenizer.save_pretrained(tokenizer_dir)

model.save_pretrained(weights_dir)

# The fitted LabelEncoder is needed to turn predicted class ids back into names.
with open(encoder_path, 'wb') as encoder_file:
    pickle.dump(map_sentiment, encoder_file)

In [None]:
def get_sentiment(email, Tokenizer=tokenizer, Model=model, Sentiments=map_sentiment, Max_input_len=MAX_TOKEN_LENGTH):
    """Predict the sentiment label of a single text.

    Args:
        email: The text to classify.
        Tokenizer: Tokenizer used to encode the text.
        Model: Sequence-classification model used for the prediction.
        Sentiments: Fitted label encoder that maps class ids back to names.
        Max_input_len: Maximum token length; longer inputs are truncated.

    Returns:
        The predicted label, decoded with ``Sentiments.inverse_transform``.
    """
    encoded_email = Tokenizer.batch_encode_plus([email], padding=True, truncation=True, max_length=Max_input_len, return_tensors='pt')

    # Use the model that was passed in (not the notebook-global one) and place
    # the inputs on that model's own device, so a freshly loaded CPU model works.
    model_device = next(Model.parameters()).device

    Model.eval()
    with torch.no_grad():
        input_ids = encoded_email['input_ids'].to(model_device)
        attention_mask = encoded_email['attention_mask'].to(model_device)

        outputs = Model(input_ids=input_ids, attention_mask=attention_mask)

        predictions = torch.argmax(outputs.logits, dim=1)
        return Sentiments.inverse_transform([predictions.item()])[0]

In [None]:
# Record the wall-clock end time; the runtime is end_time - start_time.
end_time = time()

In [None]:
# Report the elapsed time in whole minutes.
elapsed_minutes = round((end_time - start_time) / 60)
print('Notebook ran for', elapsed_minutes, 'minutes')

# Inference

## Load

In [None]:
# Reload the saved tokenizer, model and label encoder for this NAME.
model_dir = PATH + '/Models/Model - ' + NAME
tokenizer_dir = model_dir +'/Tokenizer'
weights_dir = model_dir +'/Model'
encoder_path = model_dir + '/mapped_sentiments.pkl'

bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

bert_model = BertForSequenceClassification.from_pretrained(weights_dir)

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# encoder files written by this notebook.
with open(encoder_path, 'rb') as encoder_file:
    sentiments = pickle.load(encoder_file)

## Inference File

In [None]:
# CSV under PATH that is read for inference and then written back with the added prediction column.
FILE_NAME = 'Benchmark.csv'

In [None]:
# Load the file to annotate; the inference step reads its 'email' column.
inference_df = pd.read_csv(PATH + FILE_NAME)

In [None]:
def _predict_with_saved_model(email):
    """Classify one email with the tokenizer, model and encoder loaded from disk."""
    return get_sentiment(email, Tokenizer=bert_tokenizer, Model=bert_model, Sentiments=sentiments)


# One prediction column per model run, named after NAME.
inference_df['Sentiment by ' + NAME] = inference_df['email'].apply(_predict_with_saved_model)

In [None]:
# Write the frame, including the new prediction column, back over the input file (without the index).
inference_df.to_csv(PATH + FILE_NAME, index=False)

## Inference String

In [None]:
# Ad-hoc check: classify a single hand-written email with the reloaded artifacts.
email = '''
The U.I looks good, but i wanted it darker.
Please fix.
'''
# Last expression of the cell, so the predicted label is displayed as its output.
get_sentiment(email, Tokenizer=bert_tokenizer, Model=bert_model, Sentiments=sentiments)

# Benchmark

## Load Benchmark File

In [None]:
# Benchmark CSV under PATH; the following cells read its 'sentiment' column
# and every column whose name starts with 'Sentiment by '.
benchmark_file = 'Benchmark.csv'
benchmark_df = pd.read_csv(PATH + benchmark_file)

In [None]:
# Every 'Sentiment by <model>' column holds one model's predictions.
model_names = [
    column_name
    for column_name in benchmark_df.columns
    if column_name.startswith('Sentiment by ')
]

# Lower-case the reference labels and all predictions so they compare case-insensitively.
for label_column in ['sentiment'] + model_names:
    benchmark_df[label_column] = benchmark_df[label_column].str.lower()

## Compare Models

In [None]:
# Accuracy per model, keyed by the model name without the 'Sentiment by ' prefix.
accuracies = {}
expected = benchmark_df['sentiment']

for model_name in model_names:
    predicted = benchmark_df[model_name]

    report = classification_report(expected, predicted)
    print(model_name + '\n', report)

    plot_confusion_matrix(expected, predicted)
    plt.show()

    # 13 == len('Sentiment by '), i.e. the prefix shared by all prediction columns.
    short_name = model_name[13:]
    accuracies[short_name] = accuracy_score(expected, predicted)

In [None]:
# Bar chart comparing the benchmark accuracy of every model.
model_labels = list(accuracies.keys())
model_scores = list(accuracies.values())

plt.bar(model_labels, model_scores)
plt.title('Accuracy by Model')
plt.ylabel('Accuracy')
# Accuracy is a fraction, so pin the y axis to [0, 1] for comparable bars.
plt.ylim(0, 1)
plt.xticks(rotation=90, ha='right')

plt.show()