In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import class_weight
import numpy as np




In [None]:
# Read the pickle file
features = pd.read_pickle('./Datasets/feature_extraction_full.pkl')

# Filter out rows where the emotion is "neutral"
filtered_features = features[features['emotion'] != 'neutral']
filtered_features.head()

In [3]:
# Count occurrences of each emotion
emotion_counts = filtered_features['emotion'].value_counts()
emotion_counts

emotion
happiness    187701
sadness      177354
fear          45606
suprise       19010
anger         10681
disgust        1472
Name: count, dtype: int64

In [None]:
# Dictionary to store sampled sentences for each emotion
sampled_data = {}

# Define the desired sample size for each emotion
sample_size = {
    'anger': 5000,
    'fear': 5000,
    'happiness': 5000,
    'sadness': 5000,
    'disgust': 5000,
    'suprise': 5000
}

# Sample sentences for each emotion
for emotion, count in sample_size.items():
    emotion_data = filtered_features[filtered_features['emotion'] == emotion]
    if len(emotion_data) >= count:
        sampled_data[emotion] = emotion_data.sample(n=count, random_state=42)
    else:
        sampled_data[emotion] = emotion_data

# Concatenate sampled data for all emotions
sampled_df = pd.concat(sampled_data.values())

# Shuffle the concatenated DataFrame
sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
sampled_df.head()

In [5]:
# Count occurrences of each emotion
emotion_counts = sampled_df['emotion'].value_counts()
emotion_counts

emotion
fear         5000
suprise      5000
anger        5000
happiness    5000
sadness      5000
disgust      1472
Name: count, dtype: int64

In [6]:
sampled_df['emotion'] = sampled_df['emotion'].replace('suprise', 'surprise')
# Count occurrences of each emotion
emotion_counts = sampled_df['emotion'].value_counts()
emotion_counts

emotion
fear         5000
surprise     5000
anger        5000
happiness    5000
sadness      5000
disgust      1472
Name: count, dtype: int64

In [7]:
# Prepare data
X = sampled_df['sentence']
y = sampled_df['emotion']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Tokenize the input texts
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

In [10]:
# Convert labels to numeric values
label2id = {label: i for i, label in enumerate(sampled_df['emotion'].unique())}
y_train = [label2id[label] for label in y_train]
y_test = [label2id[label] for label in y_test]

In [11]:
# Convert data into PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

In [13]:
# Define the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Move the model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
num_training_steps = len(train_dataset) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [16]:
# Define Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [17]:
# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [18]:
# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_dataset.labels), y=train_dataset.labels)

# Convert class weights to PyTorch tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Define loss function with class weights
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

In [20]:
# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

In [21]:
# Evaluate the model
model.eval()
predicted_labels = []
true_labels = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels']
    true_labels.extend(labels.cpu().numpy())
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()
    predicted_labels.extend(batch_predictions)

In [22]:
# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", fscore)

Accuracy: 0.851558073654391
Precision: 0.8493036060684589
Recall: 0.851558073654391
F1 Score: 0.8496959503027467


# Predict on the kaggle test set

In [23]:
# Loading kaggle test file
file_path = "./Kaggle/Datasets multiclass/test (kaggle).csv"
test_kaggle = pd.read_csv(file_path, sep='\t')
test_kaggle.head()

Unnamed: 0,id,sentence
0,0,Girls are happy when they get flowers
1,1,His jaw dropped in disbelief when he saw the p...
2,2,Sometimes the ugly stench makes me wanna throw...
3,3,The foul odor from the garbage bin was disgust...
4,4,"I can’t believe it, they lost the game in the ..."


In [24]:
# Preprocess the text data
X_test_kaggle = test_kaggle['sentence']

In [25]:
# Tokenize the input texts for Kaggle test set
test_kaggle_encodings = tokenizer(X_test_kaggle.tolist(), truncation=True, padding=True)

# Convert data into PyTorch Dataset for Kaggle test set
class KaggleDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

test_kaggle_dataset = KaggleDataset(test_kaggle_encodings)

# Define DataLoader for Kaggle test set
test_kaggle_loader = DataLoader(test_kaggle_dataset, batch_size=64)

In [26]:
# Predict emotions on the Kaggle test set
model.eval()
predicted_kaggle_labels = []

for batch in test_kaggle_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()
    predicted_kaggle_labels.extend(batch_predictions)

In [27]:
# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", fscore)

Accuracy: 0.851558073654391
Precision: 0.8493036060684589
Recall: 0.851558073654391
F1 Score: 0.8496959503027467


In [28]:
# Convert predicted labels back to original emotion labels
id2label = {i: label for label, i in label2id.items()}
predicted_kaggle_emotions = [id2label[label] for label in predicted_kaggle_labels]

In [32]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': range(len(predicted_kaggle_emotions)),
    'emotion': predicted_kaggle_emotions
})

submission_df.head()

Unnamed: 0,id,emotion
0,0,happiness
1,1,surprise
2,2,disgust
3,3,disgust
4,4,anger


In [33]:
# Count occurrences of each emotion
emotion_counts = submission_df['emotion'].value_counts()
emotion_counts

emotion
anger        377
disgust      291
surprise     244
happiness    218
fear         179
sadness      127
Name: count, dtype: int64

In [34]:
# Save the model
torch.save(model.state_dict(), './Models/transformers_BERT_2.pth')
print("transformers_BERT_2 saved successfully.")

transformers_BERT_2 saved successfully.


In [35]:
# Save predictions to CSV
submission_df.to_csv('./Kaggle/Datasets multiclass/predictions_transformers_BERT_2.csv', index=False)
print("BERT predictions have been saved to predictions_transformers_BERT_2.csv")

BERT predictions have been saved to predictions_transformers_BERT_2.csv
