# 1. Import Needed Library

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tiktoken
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk, subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
print("Number of GPU: ", torch.cuda.device_count())
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU device name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA GeForce RTX 4060 Laptop GPU
Using device: cuda


# 2. Split the dataset into Train, Validation, Test

In [6]:
df = pd.read_csv('../../Dataset/Processed dataset/processed_data.csv')
train_df = pd.read_csv("../../Dataset/Processed dataset/train_data.csv")
test_df = pd.read_csv("../../Dataset/Processed dataset/test_data.csv")
validation_df = pd.read_csv("../../Dataset/Processed dataset/validation_data.csv")

In [7]:
# Display split sizes
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 161613
Validation set size: 53871
Test set size: 53871


In [8]:
train_df

Unnamed: 0,sentence,sentiment,label
0,feel submissive ever,sadness,4
1,feel playful enough try new combination,joy,2
2,find broken piece feeling nothing feeling noth...,anger,0
3,feel ecstatic worry make love automatic adica ...,joy,2
4,ive feeling really jealous friend rafia im ash...,anger,0
...,...,...,...
161608,feeling nervous,fear,1
161609,feel like punished believing austin,sadness,4
161610,look back little paragraph ive written feel bi...,anger,0
161611,feel inconvenienced trimmer blade dull,sadness,4


In [9]:
validation_df

Unnamed: 0,sentence,sentiment,label
0,feeling lake popular weekend summer huge parki...,joy,2
1,couldnt stop feeling threatened card grandmoth...,fear,1
2,feel way try ignored ignored got interested es...,sadness,4
3,feeling bitchy,anger,0
4,know little feel special bond,joy,2
...,...,...,...
53866,think bottom line b story pierce feel need acc...,love,3
53867,straight man ejacalute like week sexual intere...,love,3
53868,feel really rude stating fact would feel rude ...,anger,0
53869,im feeling especially triggered grumpy note ev...,anger,0


In [10]:
test_df['sentence']

0                                 im feeling cold im alone
1        feel like im th grade shy wouldnt say anything...
2                           feel like navy dress dangerous
3                               feel jaded chaser although
4                  feel petty vicious mean defensive angry
                               ...                        
53866    feeling especially tender tendency get weepy h...
53867    pryers feel like listening perhaps punished ba...
53868    promise never react even grievously provoked l...
53869                              feel submissive spoiled
53870    go period feeling like one could love unremark...
Name: sentence, Length: 53871, dtype: object

In [11]:
train_df.label.value_counts()

label
2    36000
0    34341
4    33000
1    28598
3    20699
5     8975
Name: count, dtype: int64

In [12]:
label_dict = dict(zip(train_df['label'], train_df['sentiment']))
label_dict

{4: 'sadness', 2: 'joy', 0: 'anger', 3: 'love', 5: 'surprise', 1: 'fear'}

# 3. Normalize Dataset With Tokenization And Vectorization

In [14]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, device):
        # Make sure all text values are strings
        self.text = dataframe['sentence'].astype(str).values
        self.labels = dataframe['label'].values
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        label = self.labels[idx]
        # Convert 'nan' to empty string if necessary
        if text == 'nan':
            text = ''
        encoding = self.tokenizer.encode(text)
        input_ids = torch.tensor(encoding, dtype=torch.long, device=self.device)
        label = torch.tensor(label, dtype=torch.long, device=self.device)
        return input_ids, label

In [35]:
text = tokenizer.encode(train_df['sentence'][0])

In [37]:
text

[36410, 850, 33532, 1683]

In [15]:
# Load and preprocess data

tokenizer = tiktoken.get_encoding('gpt2')

# Create datasets with device
train_dataset = TextDataset(train_df, tokenizer, device)
val_dataset = TextDataset(validation_df, tokenizer, device)
test_dataset = TextDataset(test_df, tokenizer, device)

In [27]:
for input_ids, label in train_dataset:
    print(input_ids)
    print(label)
    break




tensor([36410,   850, 33532,  1683], device='cuda:0')
tensor(4, device='cuda:0')


# 4. Set Up Dataloaders

In [24]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    max_length = max(len(ids) for ids in input_ids)
    
    # All tensors should already be on the correct device from the dataset
    device = input_ids[0].device
    
    input_ids = torch.stack([
        torch.cat([ids, torch.zeros(max_length - len(ids), dtype=torch.long, device=device)]) 
        for ids in input_ids
    ])
    labels = torch.stack(labels)
    return input_ids, labels

In [25]:
# Create dataloaders
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# 5. Initialize Transformer Model

In [27]:
from TransformerEncoderROPE import TransformerModelWithROPE

In [28]:
# Initialize the Transformer model
vocab_size = tokenizer.n_vocab
embed_size = 256
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['label'].unique())
num_layers = 3
dropout = 0.2

# Initialize model and move to device
#model = TransformerModel(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)
model = TransformerModelWithROPE(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)
model = model.to(device)

In [29]:
# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [30]:
'''for input_ids, labels in val_dataloader:
    outputs = model(input_ids)
    print(outputs.size())
    print(outputs)
    print()
    print(torch.max(outputs,1))
    print()
    _, predicted = torch.max(outputs,1)
    print(_)
    print(predicted)
    break
'''

'for input_ids, labels in val_dataloader:\n    outputs = model(input_ids)\n    print(outputs.size())\n    print(outputs)\n    print()\n    print(torch.max(outputs,1))\n    print()\n    _, predicted = torch.max(outputs,1)\n    print(_)\n    print(predicted)\n    break\n'

In [31]:
# Define the accuracy function (using the first version from before)
def calculate_accuracy(outputs, labels):
    _, predicted = torch.max(outputs, 1)
    total = labels.size(0)
    correct = (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy, correct, total

In [39]:
# Training loop with model saving
best_val_accuracy = 0.0  # Track the best validation accuracy
best_model_path = '../../Saved trained model/best_transformerROPE_model.pth'  # File path to save the best model

In [41]:
num_epochs = 1

for epoch in range(num_epochs):
    # Training step
    model.train()
    train_correct = 0
    train_total = 0
    for input_ids, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        
        # Calculate training accuracy
        batch_acc, batch_correct, batch_total = calculate_accuracy(outputs, labels)
        train_correct += batch_correct
        train_total += batch_total
        
        loss.backward()
        optimizer.step()
    
    train_accuracy = 100 * train_correct / train_total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")
    print(f"Correct/Total: {train_correct}/{train_total}, Training Accuracy after Epoch {epoch+1}: {train_accuracy:.2f}%")

    # Validation step
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            outputs = model(input_ids)
            batch_acc, batch_correct, batch_total = calculate_accuracy(outputs, labels)
            val_correct += batch_correct
            val_total += batch_total

    val_accuracy = 100 * val_correct / val_total
    print(f"Correct/Total: {val_correct}/{val_total}, Validation Accuracy after Epoch {epoch+1}: {val_accuracy:.2f}%")

    # Save model if validation accuracy improves
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)
    
    print()  # Add a blank line for readability

Epoch 1/1, Loss: 0.3139
Correct/Total: 138160/161613, Training Accuracy after Epoch 1: 85.49%
Correct/Total: 46973/53871, Validation Accuracy after Epoch 1: 87.20%



# 6. Evaluation On Model Performance

In [44]:
# Assuming the same model architecture
model.load_state_dict(torch.load(best_model_path))
model.eval()  # Set to evaluation mode if using for inference

TransformerModelWithROPE(
  (embedding): Embedding(50257, 256)
  (encoder_layers): ModuleList(
    (0-2): 3 x TransformerEncoderLayerWithROPE(
      (rotary_emb): RotaryEmbedding()
      (wq): Linear(in_features=256, out_features=256, bias=True)
      (wk): Linear(in_features=256, out_features=256, bias=True)
      (wv): Linear(in_features=256, out_features=256, bias=True)
      (dense): Linear(in_features=256, out_features=256, bias=True)
      (feed_forward): Sequential(
        (0): Linear(in_features=256, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=256, bias=True)
      )
      (layernorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
  )
  (fc): Linear(in_features=256, out_features=6, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [46]:
test_correct, test_total = 0, 0

for input_ids, labels in test_dataloader:
    outputs = model(input_ids)
    batch_acc, batch_correct, batch_total = calculate_accuracy(outputs, labels)
    test_correct += batch_correct
    test_total += batch_total

test_accuracy = 100 * test_correct / test_total
print(f"Correct/Total: {test_correct}/{test_total}, Accuracy: {test_accuracy:.2f}%")

Correct/Total: 47266/53871, Accuracy: 87.74%


# 7. Example Testing

In [47]:
def predict_emotion(sentence, model, tokenizer, device):
    # Preprocess the input sentence
    sentence = sentence.lower()
    encoding = tokenizer.encode(sentence)
    input_ids = torch.tensor(encoding, dtype=torch.long, device=device).unsqueeze(0)
    
    # Make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        accuracy, predicted = torch.max(outputs, 1)
          
    return label_dict[predicted.item()]

In [48]:
# Example usage
test_sentence = input()
predicted_emotion = predict_emotion(test_sentence, model, tokenizer, device)
print(f"Sentence: {test_sentence}")
print(f"Predicted emotion: {predicted_emotion}")

KeyboardInterrupt: Interrupted by user