# 1. Import Required Libraries

In [106]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tiktoken
import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk, subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 2. Load the Pre-split Train, Validation, and Test Sets

In [140]:
# Load the full processed dataset and the three pre-computed splits from CSV.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook — confirm it is still needed.
df = pd.read_csv('Dataset/Processed dataset/processed_data.csv')
train_df = pd.read_csv("Dataset/Processed dataset/train_data.csv")
test_df = pd.read_csv("Dataset/Processed dataset/test_data.csv")
validation_df = pd.read_csv("Dataset/Processed dataset/validation_data.csv")

In [142]:
# Report the number of rows in each split as a quick sanity check on the loaded files.
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 161613
Validation set size: 53871
Test set size: 53871


In [150]:
# Preview the training split (sentence text, sentiment name, integer label).
train_df

Unnamed: 0,sentence,sentiment,label
0,feel submissive ever,sadness,4
1,feel playful enough try new combination,joy,2
2,find broken piece feeling nothing feeling noth...,anger,0
3,feel ecstatic worry make love automatic adica ...,joy,2
4,ive feeling really jealous friend rafia im ash...,anger,0
...,...,...,...
161608,feeling nervous,fear,1
161609,feel like punished believing austin,sadness,4
161610,look back little paragraph ive written feel bi...,anger,0
161611,feel inconvenienced trimmer blade dull,sadness,4


In [152]:
# Preview the validation split (sentence text, sentiment name, integer label).
validation_df

Unnamed: 0,sentence,sentiment,label
0,feeling lake popular weekend summer huge parki...,joy,2
1,couldnt stop feeling threatened card grandmoth...,fear,1
2,feel way try ignored ignored got interested es...,sadness,4
3,feeling bitchy,anger,0
4,know little feel special bond,joy,2
...,...,...,...
53866,think bottom line b story pierce feel need acc...,love,3
53867,straight man ejacalute like week sexual intere...,love,3
53868,feel really rude stating fact would feel rude ...,anger,0
53869,im feeling especially triggered grumpy note ev...,anger,0


In [222]:
# Inspect only the text column of the test split.
test_df['sentence']

0                                 im feeling cold im alone
1        feel like im th grade shy wouldnt say anything...
2                           feel like navy dress dangerous
3                               feel jaded chaser although
4                  feel petty vicious mean defensive angry
                               ...                        
53866    feeling especially tender tendency get weepy h...
53867    pryers feel like listening perhaps punished ba...
53868    promise never react even grievously provoked l...
53869                              feel submissive spoiled
53870    go period feeling like one could love unremark...
Name: sentence, Length: 53871, dtype: object

In [156]:
# Count training rows per integer label to see how balanced the classes are.
train_df.label.value_counts()

label
2    36000
0    34341
4    33000
1    28598
3    20699
5     8975
Name: count, dtype: int64

In [158]:
# Build the label-id -> sentiment-name lookup from the distinct (label, sentiment)
# pairs instead of zipping every row of the training frame.
label_pairs = train_df[['label', 'sentiment']].drop_duplicates().sort_values('label')

# Zipping all rows would silently let the last row win if one id were paired with
# two different names; fail loudly instead so a corrupted split is noticed.
assert label_pairs['label'].is_unique, "each label id must map to exactly one sentiment"

# .tolist() yields plain Python ints/strs, matching the `predicted.item()` lookups used later.
label_dict = dict(zip(label_pairs['label'].tolist(), label_pairs['sentiment'].tolist()))
label_dict

{4: 'sadness', 2: 'joy', 0: 'anger', 3: 'love', 5: 'surprise', 1: 'fear'}

In [277]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        dataframe: Frame with a 'sentence' column (text) and a 'label' column (integer class id).
        tokenizer: Object exposing ``encode(text)`` that returns a list of token ids.
        device: Device on which the returned tensors are created.

    Each item is ``(input_ids, label)``: a 1-D ``torch.long`` tensor of token ids
    (possibly empty) and a 0-D ``torch.long`` tensor holding the label.

    NOTE(review): tensors are created directly on ``device`` inside ``__getitem__``;
    with an accelerator device this generally rules out DataLoader worker processes
    and ``pin_memory`` — consider building CPU tensors here and moving whole batches
    in the training loop.
    """

    def __init__(self, dataframe, tokenizer, device):
        # Replace missing sentences with '' *before* casting to str. Casting first
        # turns NaN into the literal text 'nan', which then cannot be told apart
        # from a genuine sentence consisting of the word "nan".
        self.text = dataframe['sentence'].fillna('').astype(str).values
        self.labels = dataframe['label'].values
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and return ``(input_ids, label)`` tensors on ``self.device``."""
        token_ids = self.tokenizer.encode(self.text[idx])
        input_ids = torch.tensor(token_ids, dtype=torch.long, device=self.device)
        label = torch.tensor(self.labels[idx], dtype=torch.long, device=self.device)
        return input_ids, label

In [279]:
def collate_fn(batch):
    """Collate ``(input_ids, label)`` pairs into one right-padded batch.

    Args:
        batch: Sequence of ``(input_ids, label)`` tuples, where ``input_ids`` is a
            1-D tensor and ``label`` is a 0-D tensor.

    Returns:
        ``(input_ids, labels)``: ``input_ids`` has shape (batch, longest_sequence)
        with shorter sequences right-padded with 0; ``labels`` has shape (batch,).

    Note:
        The pad value 0 is not distinguished from a real token id 0 and no padding
        mask is returned, so callers cannot tell padding from content.
    """
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    # pad_sequence allocates the output from the first sequence, so dtype and
    # device follow the tensors produced by the dataset.
    padded_ids = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_ids, torch.stack(targets)

In [281]:
class TransformerEncoderLayer(nn.Module):
    """Post-norm Transformer encoder layer: multi-head self-attention, then a feed-forward block.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Size of the last dimension of the layer's input and output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        d_ff: Hidden width of the position-wise feed-forward network.
        dropout: Dropout probability applied to each sub-layer's output.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        # Fail at construction time; otherwise the mismatch only surfaces as an
        # opaque view() error during the first forward pass.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # per-head feature size

        # Linear projections for Q, K, V: (batch, seq, d_model) -> (batch, seq, d_model)
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # Output projection that mixes the concatenated heads
        self.dense = nn.Linear(d_model, d_model)

        # Position-wise feed-forward network: d_model -> d_ff -> d_model
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        # Layer normalization and dropout for the two residual branches
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Compute softmax(q k^T / sqrt(depth)) v.

        Args:
            q, k, v: Tensors of shape (batch, num_heads, seq, depth).
            mask: Optional tensor broadcastable to (batch, num_heads, seq_q, seq_k);
                positions where ``mask == 0`` are excluded from attention.

        Returns:
            ``(output, attention_weights)`` with shapes
            (batch, num_heads, seq_q, depth) and (batch, num_heads, seq_q, seq_k).
        """
        # Plain Python scalar: no per-call tensor allocation and no CPU/GPU mixing.
        scale = k.size(-1) ** 0.5
        scaled_attention_logits = torch.matmul(q, k.transpose(-2, -1)) / scale

        if mask is not None:
            # A large negative logit drives the softmax weight of masked keys to ~0.
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights

    def forward(self, x, mask=None):
        """Apply self-attention and the feed-forward block to ``x`` of shape (batch, seq, d_model).

        ``mask`` is forwarded unchanged to ``scaled_dot_product_attention``.
        Returns a tensor with the same shape as ``x``.
        """
        batch_size = x.size(0)

        # Project and split into heads: (batch, num_heads, seq, depth)
        q = self.split_heads(self.wq(x), batch_size)
        k = self.split_heads(self.wk(x), batch_size)
        v = self.split_heads(self.wv(x), batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v, mask)

        # Merge heads back: (batch, num_heads, seq, depth) -> (batch, seq, d_model)
        scaled_attention = scaled_attention.transpose(1, 2).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.d_model)

        attn_output = self.dense(concat_attention)

        # Add & Norm (attention branch)
        x = self.layernorm1(x + self.dropout(attn_output))

        # Add & Norm (feed-forward branch)
        ff_output = self.feed_forward(x)
        x = self.layernorm2(x + self.dropout(ff_output))

        return x


![](https://storage.googleapis.com/mle-courses-prod/users/61b6fa1ba83a7e37c8309756/private-files/2dc6f3e0-5fb4-11ef-9b72-9db6eacc12d1-Screen_Shot_2024_08_21_at_18.54.35.png)

In [283]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    Args:
        d_model: Embedding size (last dimension of the input); odd sizes are supported.
        max_len: Number of positions pre-computed; inputs must not be longer than this.

    The table is stored as the non-trainable buffer ``pe`` of shape (1, max_len, d_model).
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Position indices 0.0, 1.0, ..., max_len - 1 as a column vector.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)

        # One inverse frequency per (sin, cos) pair: 10000^(-2i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # (ceil(d_model/2),)

        # `position * div_term` broadcasts to one angle per (position, pair).
        angles = position * div_term  # (max_len, ceil(d_model/2))

        # Table that will hold the encodings, initialised to zero.
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)

        # Even embedding indices (0, 2, 4, ...) receive the sine of the angle.
        pe[:, 0::2] = torch.sin(angles)

        # Odd embedding indices (1, 3, 5, ...) receive the cosine of the same angle.
        # For an odd d_model there is one fewer odd slot than even slots, so the
        # surplus last angle column is dropped instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Add a batch dimension and register as a buffer: it follows .to(device)
        # and is saved in the state_dict, but is not a trainable parameter.
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x + pe[:, :seq_len]`` for ``x`` of shape (batch, seq_len, d_model)."""
        x = x + self.pe[:, :x.size(1), :]
        return x


In [347]:
# Rotary position embedding (RoPE): angle table plus the rotation applied to q and k
class RotaryEmbedding(nn.Module):
    """Produce the per-position rotation angles used by rotary position embeddings.

    Args:
        dim: Per-head feature size that will be rotated; must be even because
            features are rotated in pairs.
        base: Base of the geometric frequency progression.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim, base=10000):
        super(RotaryEmbedding, self).__init__()
        if dim % 2 != 0:
            raise ValueError(f"RotaryEmbedding requires an even dim, got {dim}")
        # One inverse frequency per feature pair: base^(-2i / dim).
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, seq_len):
        """Return rotation angles of shape [1, seq_len, dim].

        The first ``dim // 2`` columns hold ``position * inv_freq``; the second
        half is an exact copy of the first, kept so the output width matches ``dim``.
        """
        t = torch.arange(seq_len, device=self.inv_freq.device).type_as(self.inv_freq).unsqueeze(1)
        freqs = t * self.inv_freq.unsqueeze(0)  # [seq_len, dim // 2]
        emb = torch.cat((freqs, freqs), dim=-1)  # duplicate to match the input dimension
        return emb[None, :, :]  # [1, seq_len, dim]


def apply_rotary_pos_emb(q, k, sinusoidal_pos):
    """Rotate each (even, odd) feature pair of ``q`` and ``k`` by its position-dependent angle.

    Args:
        q, k: Tensors of shape (..., seq_len, dim) with an even ``dim``.
        sinusoidal_pos: Angle table of shape [1, seq_len, dim] as returned by
            ``RotaryEmbedding.forward``. It contains raw angles, not sines/cosines.

    Returns:
        ``(q_rot, k_rot)`` with the same shapes as the inputs. The last dimension is
        laid out as [rotated even features, rotated odd features]; q and k share
        this layout, so their dot products are unaffected by the reordering.
    """
    half = q.size(-1) // 2

    # Only the first half of the table is needed (the second half is a copy).
    # Taking cos/sin here is what makes this a rotation: multiplying by the raw,
    # unbounded angles would scale the features with position instead of
    # rotating them, and the attention scores would lose the relative-position
    # property.
    angles = sinusoidal_pos[..., :half]
    cos = angles.cos().to(q.dtype)
    sin = angles.sin().to(q.dtype)

    q_even, q_odd = q[..., 0::2], q[..., 1::2]
    k_even, k_odd = k[..., 0::2], k[..., 1::2]

    # 2-D rotation of every (even, odd) pair by its angle.
    q_rot = torch.cat([q_even * cos - q_odd * sin, q_even * sin + q_odd * cos], dim=-1)
    k_rot = torch.cat([k_even * cos - k_odd * sin, k_even * sin + k_odd * cos], dim=-1)

    return q_rot, k_rot

In [369]:
# Transformer Encoder Layer with Rotary Position Embedding
class TransformerEncoderLayerWithROPE(nn.Module):
    """Post-norm Transformer encoder layer whose queries and keys are rotated by RoPE.

    Position information is injected inside attention (on q and k only), so no
    additive positional encoding is needed on the layer input.

    Args:
        d_model: Size of the last dimension of the layer's input and output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly, and
            the resulting per-head size must be even.
        d_ff: Hidden width of the position-wise feed-forward network.
        dropout: Dropout probability applied to each sub-layer's output.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads`` or the
            per-head size is odd.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayerWithROPE, self).__init__()
        # Validate up front; otherwise the mismatch only surfaces as an opaque
        # view()/broadcast problem during the first forward pass.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        if (d_model // num_heads) % 2 != 0:
            raise ValueError(
                f"per-head size ({d_model // num_heads}) must be even for rotary embeddings"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads  # per-head feature size

        # Rotation angles are generated per head, hence dim = depth.
        self.rotary_emb = RotaryEmbedding(self.depth)

        # Linear layers for Q, K, V matrices
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        # Output linear transformation
        self.dense = nn.Linear(d_model, d_model)

        # Feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

        # Layer normalization and dropout
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Compute softmax(q k^T / sqrt(depth)) v.

        ``mask``, when given, must broadcast to (batch, num_heads, seq_q, seq_k);
        positions where ``mask == 0`` are excluded from attention.
        Returns ``(output, attention_weights)``.
        """
        # Plain Python scalar: avoids allocating a tensor on every call.
        scale = k.size(-1) ** 0.5
        scaled_attention_logits = torch.matmul(q, k.transpose(-2, -1)) / scale

        if mask is not None:
            # A large negative logit drives the softmax weight of masked keys to ~0.
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        return output, attention_weights

    def forward(self, x, mask=None):
        """Apply RoPE self-attention and the feed-forward block to ``x`` of shape (batch, seq, d_model)."""
        batch_size = x.size(0)
        seq_len = x.size(1)

        # Apply linear layers and split into heads
        q = self.split_heads(self.wq(x), batch_size)
        k = self.split_heads(self.wk(x), batch_size)
        v = self.split_heads(self.wv(x), batch_size)

        # Rotary position embedding: only q and k are rotated, v is left untouched.
        sinusoidal_pos = self.rotary_emb(seq_len)
        q_rot, k_rot = apply_rotary_pos_emb(q, k, sinusoidal_pos)

        scaled_attention, _ = self.scaled_dot_product_attention(q_rot, k_rot, v, mask)

        # Merge heads back: (batch, num_heads, seq, depth) -> (batch, seq, d_model)
        scaled_attention = scaled_attention.transpose(1, 2).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.d_model)

        # Apply the final linear layer to combine the heads
        attn_output = self.dense(concat_attention)

        # Add & Norm
        x = self.layernorm1(x + self.dropout(attn_output))

        # Feed-forward
        ff_output = self.feed_forward(x)

        # Add & Norm
        x = self.layernorm2(x + self.dropout(ff_output))

        return x

In [371]:
class TransformerModelWithROPE(nn.Module):
    """Sequence classifier: token embedding -> RoPE encoder stack -> mean pooling -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of the token embeddings.
        d_model: Width used inside the encoder layers.
        num_heads: Attention heads per encoder layer.
        d_ff: Hidden width of each layer's feed-forward network.
        output_size: Number of output classes (logits).
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in the layers and before the head.
    """

    def __init__(self, vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout=0.1):
        super(TransformerModelWithROPE, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The encoder layers consume d_model-wide inputs. When the embedding width
        # differs, project it; when it matches, Identity adds no parameters, so
        # existing checkpoints keep exactly the same state_dict keys.
        self.input_proj = nn.Identity() if embed_size == d_model else nn.Linear(embed_size, d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayerWithROPE(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return class logits of shape (batch, output_size) for token ids ``x`` of shape (batch, seq_len).

        ``mask`` is passed unchanged to every encoder layer. The pooling step
        averages over all sequence positions, so any padded positions contribute
        to the pooled vector.
        """
        x = self.embedding(x)  # (batch_size, seq_len, embed_size)
        x = self.input_proj(x)  # (batch_size, seq_len, d_model)
        for layer in self.encoder_layers:
            x = layer(x, mask)  # (batch_size, seq_len, d_model)
        x = x.mean(dim=1)  # (batch_size, d_model)
        x = self.fc(self.dropout(x))  # (batch_size, output_size)
        return x

In [285]:
class TransformerModel(nn.Module):
    """Sequence classifier: token embedding + sinusoidal positions -> encoder stack -> mean pooling -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of the token embeddings.
        d_model: Width used by the positional encoding and the encoder layers.
        num_heads: Attention heads per encoder layer.
        d_ff: Hidden width of each layer's feed-forward network.
        output_size: Number of output classes (logits).
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in the layers and before the head.
    """

    def __init__(self, vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The positional encoding and encoder layers work on d_model-wide inputs.
        # When the embedding width differs, project it; when it matches, Identity
        # adds no parameters, so existing checkpoints keep the same state_dict keys.
        self.input_proj = nn.Identity() if embed_size == d_model else nn.Linear(embed_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return class logits of shape (batch, output_size) for token ids ``x`` of shape (batch, seq_len).

        ``mask`` is passed unchanged to every encoder layer. The pooling step
        averages over all sequence positions, so any padded positions contribute
        to the pooled vector.
        """
        x = self.embedding(x)  # (batch_size, seq_len, embed_size)
        x = self.input_proj(x)  # (batch_size, seq_len, d_model)
        x = self.positional_encoding(x)  # (batch_size, seq_len, d_model)
        for layer in self.encoder_layers:
            x = layer(x, mask)  # (batch_size, seq_len, d_model)
        x = x.mean(dim=1)  # (batch_size, d_model)
        x = self.fc(self.dropout(x))  # (batch_size, output_size)
        return x


In [287]:
# Re-display the training split before the datasets are built from it.
train_df

Unnamed: 0,sentence,sentiment,label
0,feel submissive ever,sadness,4
1,feel playful enough try new combination,joy,2
2,find broken piece feeling nothing feeling noth...,anger,0
3,feel ecstatic worry make love automatic adica ...,joy,2
4,ive feeling really jealous friend rafia im ash...,anger,0
...,...,...,...
161608,feeling nervous,fear,1
161609,feel like punished believing austin,sadness,4
161610,look back little paragraph ive written feel bi...,anger,0
161611,feel inconvenienced trimmer blade dull,sadness,4


In [289]:
# Re-display the test split before the datasets are built from it.
test_df

Unnamed: 0,sentence,sentiment,label
0,im feeling cold im alone,anger,0
1,feel like im th grade shy wouldnt say anything...,fear,1
2,feel like navy dress dangerous,anger,0
3,feel jaded chaser although,sadness,4
4,feel petty vicious mean defensive angry,anger,0
...,...,...,...
53866,feeling especially tender tendency get weepy h...,love,3
53867,pryers feel like listening perhaps punished ba...,sadness,4
53868,promise never react even grievously provoked l...,sadness,4
53869,feel submissive spoiled,sadness,4


In [291]:
# Re-display the validation split before the datasets are built from it.
validation_df

Unnamed: 0,sentence,sentiment,label
0,feeling lake popular weekend summer huge parki...,joy,2
1,couldnt stop feeling threatened card grandmoth...,fear,1
2,feel way try ignored ignored got interested es...,sadness,4
3,feeling bitchy,anger,0
4,know little feel special bond,joy,2
...,...,...,...
53866,think bottom line b story pierce feel need acc...,love,3
53867,straight man ejacalute like week sexual intere...,love,3
53868,feel really rude stating fact would feel rude ...,anger,0
53869,im feeling especially triggered grumpy note ev...,anger,0


In [293]:
# Report the CUDA environment PyTorch can see (device count, availability, CUDA build, GPU name).
print("Number of GPU: ", torch.cuda.device_count())
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU device name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")

# Use the GPU when available, otherwise fall back to the CPU; `device` is passed to the datasets and the model below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA GeForce RTX 4060 Laptop GPU
Using device: cuda


In [295]:
# Tokenizer: tiktoken's 'gpt2' encoding; its n_vocab later sizes the embedding table.

tokenizer = tiktoken.get_encoding('gpt2')

# Wrap each split in a TextDataset; every item is tokenized on access and its tensors are created on `device`.
train_dataset = TextDataset(train_df, tokenizer, device)
val_dataset = TextDataset(validation_df, tokenizer, device)
test_dataset = TextDataset(test_df, tokenizer, device)

In [297]:
# Sanity check: print the token-id tensor and label tensor of the first training example only.
for input_ids, label in train_dataset:
    print(input_ids)
    print(label)
    break




tensor([36410,   850, 33532,  1683], device='cuda:0')
tensor(4, device='cuda:0')


In [299]:
# Create dataloaders: only the training loader shuffles; collate_fn pads each batch to its longest sequence.
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [373]:
# Model hyperparameters
vocab_size = tokenizer.n_vocab  # embedding rows = tokenizer vocabulary size
embed_size = 256
d_model = 256
num_heads = 8
d_ff = 512
output_size = len(train_df['label'].unique())  # one logit per distinct label
num_layers = 3
dropout = 0.2

# Build the model and move it to `device`.
# Exactly one of the two constructors below should be active. A checkpoint saved from one
# architecture cannot be loaded into the other because their state_dict keys differ.
#model = TransformerModel(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)
model = TransformerModelWithROPE(vocab_size, embed_size, d_model, num_heads, d_ff, output_size, num_layers, dropout)
model = model.to(device)

In [375]:
# Cross-entropy over the class logits, optimised with Adam at a fixed learning rate of 1e-4.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Disabled debugging scratch: the whole cell is a bare string literal, so none of it executes.
# When enabled it ran one validation batch through the model and printed the logits and torch.max results.
'''for input_ids, labels in val_dataloader:
    outputs = model(input_ids)
    print(outputs.size())
    print(outputs)
    print()
    print(torch.max(outputs,1))
    print()
    _, predicted = torch.max(outputs,1)
    print(_)
    print(predicted)
    break
'''

In [377]:
# Batch-level accuracy helper shared by the training, validation and test loops
def calculate_accuracy(outputs, labels):
    """Score a batch of logits against its integer labels.

    Args:
        outputs: Tensor of shape (batch, num_classes) holding per-class scores.
        labels: Tensor of shape (batch,) holding the true class indices.

    Returns:
        ``(accuracy, correct, total)``: accuracy as a percentage (float), the
        number of correct predictions (int) and the batch size (int).
    """
    # The predicted class is the index of the largest score in each row.
    predicted = outputs.max(dim=1).indices
    matches = predicted == labels

    total = labels.size(0)
    correct = matches.sum().item()
    return 100 * correct / total, correct, total

In [307]:
# State for checkpointing during training: the best validation accuracy seen so far and where to save it.
# NOTE: re-run this cell before restarting training, otherwise the previous run's best score is kept.
best_val_accuracy = 0.0  # Track the best validation accuracy
best_model_path = 'best_model.pth'  # File path to save the best model

In [None]:
num_epochs = 15

for epoch in range(num_epochs):
    # ---- Training step ----
    model.train()
    train_correct = 0
    train_total = 0
    running_loss = 0.0  # sum of per-sample losses over the epoch
    for input_ids, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate accuracy and loss statistics for the whole epoch.
        _, batch_correct, batch_total = calculate_accuracy(outputs, labels)
        train_correct += batch_correct
        train_total += batch_total
        # criterion returns the batch mean; weight it by the batch size so the
        # smaller final batch does not skew the epoch average.
        running_loss += loss.item() * batch_total

    train_accuracy = 100 * train_correct / train_total
    # Report the mean loss over the epoch rather than whatever the last mini-batch produced.
    epoch_loss = running_loss / train_total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    print(f"Correct/Total: {train_correct}/{train_total}, Training Accuracy after Epoch {epoch+1}: {train_accuracy:.2f}%")

    # ---- Validation step (dropout disabled, no gradients tracked) ----
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for input_ids, labels in val_dataloader:
            outputs = model(input_ids)
            _, batch_correct, batch_total = calculate_accuracy(outputs, labels)
            val_correct += batch_correct
            val_total += batch_total

    val_accuracy = 100 * val_correct / val_total
    print(f"Correct/Total: {val_correct}/{val_total}, Validation Accuracy after Epoch {epoch+1}: {val_accuracy:.2f}%")

    # Keep only the weights of the best-performing epoch on the validation set.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)

    print()  # Add a blank line for readability

In [383]:
# Restore the best checkpoint written by the training loop.
# The checkpoint must come from the same architecture as the current `model`:
# loading weights saved from a different model class fails with missing/unexpected keys.
# map_location makes a checkpoint saved on a GPU loadable on whatever `device` is in use now.
# NOTE: torch.load unpickles the file — only load checkpoints you created yourself.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()  # Set to evaluation mode if using for inference

RuntimeError: Error(s) in loading state_dict for TransformerModelWithROPE:
	Missing key(s) in state_dict: "encoder_layers.0.rotary_emb.inv_freq", "encoder_layers.1.rotary_emb.inv_freq", "encoder_layers.2.rotary_emb.inv_freq". 
	Unexpected key(s) in state_dict: "positional_encoding.pe". 

In [313]:
test_correct, test_total = 0, 0

# Evaluate in inference mode: eval() disables dropout so the score is deterministic,
# and no_grad() avoids building autograd graphs for every test batch.
model.eval()
with torch.no_grad():
    for input_ids, labels in test_dataloader:
        outputs = model(input_ids)
        _, batch_correct, batch_total = calculate_accuracy(outputs, labels)
        test_correct += batch_correct
        test_total += batch_total

test_accuracy = 100 * test_correct / test_total
print(f"Correct/Total: {test_correct}/{test_total}, Accuracy: {test_accuracy:.2f}%")

Correct/Total: 49965/53871, Accuracy: 92.75%


In [315]:
def predict_emotion(sentence, model, tokenizer, device):
    """Predict the emotion name for a single sentence.

    Args:
        sentence: Raw input text; it is lower-cased before tokenization.
        model: Classifier returning logits of shape (1, num_classes) for a
            (1, seq_len) tensor of token ids.
        tokenizer: Object exposing ``encode(text)`` that returns a list of token ids.
        device: Device on which the input tensor is created.

    Returns:
        The sentiment name that the module-level ``label_dict`` maps the predicted
        class index to.

    Raises:
        ValueError: If the sentence produces no tokens. An empty sequence would be
            mean-pooled into NaN logits and yield an arbitrary label.

    NOTE(review): the training sentences look pre-processed (e.g. stop words
    removed), while only lower-casing is applied here — confirm that inference
    text goes through the same pipeline.
    """
    token_ids = tokenizer.encode(sentence.lower())
    if not token_ids:
        raise ValueError("Cannot predict an emotion for an empty sentence.")
    input_ids = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(input_ids)
        # torch.max returns (largest logit, its index); only the index is needed.
        _, predicted = torch.max(logits, 1)

    return label_dict[predicted.item()]

In [338]:
# Example usage: read one sentence interactively and print the emotion the model predicts for it.
# NOTE(review): input() waits for keyboard input, so this cell stalls an unattended "Run All".
test_sentence = input()
predicted_emotion = predict_emotion(test_sentence, model, tokenizer, device)
print(f"Sentence: {test_sentence}")
print(f"Predicted emotion: {predicted_emotion}")

 feel like stunned


Sentence: feel like stunned
Predicted emotion: surprise
