In [32]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tiktoken
import torch
import torch.nn as nn
import torch.optim as optim

In [48]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [34]:
# Fix the NumPy and PyTorch RNG seeds so repeated runs draw the same numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Read the pre-split train / test / validation CSV files (in that order)
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/Processed dataset/train_data.csv',
        'Dataset/Processed dataset/test_data.csv',
        'Dataset/Processed dataset/validation_data.csv',
    )
)

In [36]:
# Drop the 'sentiment' column from all datasets.
# errors='ignore' keeps this cell idempotent: because each frame is rebound to
# its own name, re-running the cell would otherwise raise KeyError once the
# column has already been removed.
train_data = train_data.drop(columns='sentiment', errors='ignore')
test_data = test_data.drop(columns='sentiment', errors='ignore')
val_data = val_data.drop(columns='sentiment', errors='ignore')

In [38]:
# Report the (rows, columns) shape of each split
for heading, frame in (
    ("Training Data Shape:", train_data),
    ("\nTest Data Shape:", test_data),
    ("\nValidation Data Shape:", val_data),
):
    print(heading, frame.shape)

Training Data Shape: (161613, 2)

Test Data Shape: (53871, 2)

Validation Data Shape: (53871, 2)


In [40]:
# Preview the first rows of the training split
train_preview = train_data.head()
print("\nFirst few rows of training data:", train_preview, sep="\n")



First few rows of training data:
                                            sentence  label
0                               feel submissive ever      4
1            feel playful enough try new combination      2
2  find broken piece feeling nothing feeling noth...      0
3  feel ecstatic worry make love automatic adica ...      2
4  ive feeling really jealous friend rafia im ash...      0


In [42]:
# Fixed sequence length used when encoding texts
max_tokens = 512
# GPT-2 encoding provided by tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [50]:
def tokenize_text(text):
    """Encode ``text`` into exactly ``max_tokens`` GPT-2 token ids.

    Args:
        text: The sentence to encode. Non-string values are accepted: missing
            scalars (``None``, ``NaN``, ``pd.NA``) are treated as empty text,
            anything else is converted with ``str()``.

    Returns:
        list[int]: ``max_tokens`` token ids. Longer encodings are truncated;
        shorter ones are right-padded with ``tokenizer.eot_token``.
    """
    if not isinstance(text, str):
        # Test for missing values BEFORE stringifying: str(None) is 'None' and
        # str(pd.NA) is '<NA>', which would otherwise be tokenized as real words.
        # pd.isna returns an array for list-likes, so only a scalar boolean
        # result counts as "missing".
        missing = pd.isna(text)
        if isinstance(missing, (bool, np.bool_)) and missing:
            text = ""
        else:
            text = str(text)

    # Kept for backward compatibility: the literal string 'nan' (e.g. an
    # already-stringified NaN) is treated as empty text.
    if text == 'nan':
        text = ""

    # Truncate to max_tokens, then right-pad up to max_tokens with the EOT id
    tokens = tokenizer.encode(text)[:max_tokens]
    padding = [tokenizer.eot_token] * (max_tokens - len(tokens))
    return tokens + padding

def _encode_sentences(frame):
    """Tokenize the 'sentence' column of ``frame`` into a float32 tensor on ``device``."""
    rows = [tokenize_text(sentence) for sentence in frame['sentence']]
    return torch.tensor(rows, dtype=torch.float32).to(device)


# Encode every split with the same tokenizer settings
print("\nTokenizing texts...")
X_train = _encode_sentences(train_data)
X_test = _encode_sentences(test_data)
X_val = _encode_sentences(val_data)


Tokenizing texts...


In [52]:
# Show the encoded training tensor as the cell output (a bare last expression is displayed by the notebook)
X_train

tensor([[36410.,   850., 33532.,  ..., 50256., 50256., 50256.],
        [36410., 34264.,  1576.,  ..., 50256., 50256., 50256.],
        [19796.,  5445.,  3704.,  ..., 50256., 50256., 50256.],
        ...,
        [ 5460.,   736.,  1310.,  ..., 50256., 50256., 50256.],
        [36410., 20488., 26343.,  ..., 50256., 50256., 50256.],
        [36410.,  2138.,  6655.,  ..., 50256., 50256., 50256.]],
       device='cuda:0')

In [54]:
# Build int64 label tensors on the target device from each split's 'label' column
y_train, y_test, y_val = (
    torch.tensor(frame['label'].values, dtype=torch.long).to(device)
    for frame in (train_data, test_data, val_data)
)

In [56]:
# Distinct label values present in the training split (computed once, reused below)
unique_labels = np.unique(y_train.cpu().numpy())
num_classes = len(unique_labels)
print("\nNumber of classes:", num_classes)
print("Classes:", unique_labels)
print("\nTokenized sequence shape:", X_train.shape)


Number of classes: 6
Classes: [0 1 2 3 4 5]

Tokenized sequence shape: torch.Size([161613, 512])


In [58]:
class SoftmaxRegression(nn.Module):
    """Softmax (multinomial logistic) regression: a single linear layer producing class scores."""

    def __init__(self, input_size, num_classes):
        """Create the ``input_size`` -> ``num_classes`` linear layer.

        Args:
            input_size: Number of input features per sample.
            num_classes: Number of output classes.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x``; no softmax is applied here."""
        logits = self.linear(x)
        return logits

In [83]:
# Build the classifier on the selected device, together with its loss and optimizer
n_features = X_train.shape[1]
model = SoftmaxRegression(input_size=n_features, num_classes=num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): the recorded training output in this notebook reports a loss on
# the order of 1e10. The features are raw token ids (values up to 50256) and
# the step size is 0.1, which looks like divergence -- consider rescaling the
# inputs or lowering lr; confirm before relying on the reported accuracy.
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [85]:
# Training hyperparameters
num_epochs, batch_size = 1000, 1024
n_samples = X_train.shape[0]
# Mini-batches per epoch, rounded up so a final partial batch is included
n_batches = -(-n_samples // batch_size)

In [None]:
# Mini-batch SGD training with periodic validation reporting.
print("\nTraining model...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # Draw this epoch's sample order on the same device as the data, so the
    # indexing below needs no host-to-device transfer of the indices.
    indices = torch.randperm(n_samples, device=X_train.device)

    for i in range(n_batches):
        # Gather only the current batch instead of materializing a shuffled
        # copy of the whole training set every epoch. Slicing past the end is
        # safe and simply yields a shorter final batch.
        batch_idx = indices[i * batch_size:(i + 1) * batch_size]
        batch_X = X_train[batch_idx]
        batch_y = y_train[batch_idx]

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Validation accuracy is only reported every 100 epochs (and on the final
    # epoch), so the full validation forward pass is only run when needed.
    if (epoch + 1) % 100 == 0 or epoch + 1 == num_epochs:
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_pred = torch.argmax(val_outputs, dim=1)
            val_accuracy = (val_pred == y_val).float().mean()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/n_batches:.4f}, Validation Accuracy: {val_accuracy*100:.2f}%")


Training model...
Epoch [100/1000], Loss: 46509853009.0127, Validation Accuracy: 17.48%
Epoch [200/1000], Loss: 46259554375.2911, Validation Accuracy: 21.49%


In [None]:
# Final evaluation on the validation and test splits (no gradients needed)
model.eval()
with torch.no_grad():
    # Validation split: predicted class = index of the largest score per row
    val_outputs = model(X_val)
    val_pred = val_outputs.argmax(dim=1)
    val_accuracy = val_pred.eq(y_val).float().mean()
    print(f"\nValidation Accuracy: {val_accuracy*100:.2f}%")

    # Test split
    test_outputs = model(X_test)
    test_pred = test_outputs.argmax(dim=1)
    test_accuracy = test_pred.eq(y_test).float().mean()
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")

In [None]:
# Print classification report
# The class ids seen in training define the report rows, so the rows stay fixed
# even if the test labels or predictions happen to miss a class.
class_ids = np.unique(y_train.cpu().numpy())
print("\nClassification Report:")
print(classification_report(
    y_test.cpu().numpy(),
    test_pred.cpu().numpy(),
    labels=class_ids,
    # target_names must be strings: the report measures each name with len(),
    # which raises TypeError on NumPy integers.
    target_names=[str(class_id) for class_id in class_ids],
))

In [None]:
# Plot confusion matrix
class_ids = np.unique(y_train.cpu().numpy())
# Pin the matrix rows/columns to the training class ids. Without labels=, the
# classes are inferred from y_test and test_pred only, so the matrix could be
# smaller than (or ordered differently from) the tick labels drawn below.
cm = confusion_matrix(
    y_test.cpu().numpy(),
    test_pred.cpu().numpy(),
    labels=class_ids,
)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_ids,
            yticklabels=class_ids)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Classify a single piece of new text with the trained model
def predict_sentiment(text):
    """Predict the class label for ``text``.

    Args:
        text: Raw text; it is encoded with ``tokenize_text`` before inference.

    Returns:
        tuple: ``(label, probabilities)`` where ``label`` is the entry of the
        sorted unique training labels at the arg-max position, and
        ``probabilities`` is a NumPy array of the softmax scores per class.
    """
    model.eval()
    encoded = tokenize_text(text)
    with torch.no_grad():
        # Batch of one sample, matching the float32 features used for training
        features = torch.tensor([encoded], dtype=torch.float32).to(device)
        logits = model(features)
        probabilities = torch.softmax(logits, dim=1)[0]
        prediction = torch.argmax(probabilities).item()

        class_values = np.unique(y_train.cpu().numpy())
        return class_values[prediction], probabilities.cpu().numpy()

In [None]:
# Example predictions
# input() returns ONE string. Iterating over that string directly would run the
# classifier on each character separately, so wrap the text in a list instead.
# Blank input yields no predictions.
user_text = input().strip()
example_texts = [user_text] if user_text else []

# Class ids are the same for every prediction; compute them once
class_ids = np.unique(y_train.cpu().numpy())

for text in example_texts:
    sentiment, probs = predict_sentiment(text)
    print(f"\nText: {text}")
    print(f"Predicted sentiment: {sentiment}")
    print("Probability distribution:")
    for label, prob in zip(class_ids, probs):
        print(f"{label}: {prob:.4f}")