In [1]:
# Install a pip package in the current Jupyter kernel
import sys
#!{sys.executable} -m pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Load the review dataset from a CSV file in the current working directory
reviews_data = pd.read_csv(
    'IMDB Dataset.csv',
    sep=",",
    skipinitialspace=True,
    engine="python",
)
reviews_data.shape[0]  # number of reviews (rows) in the complete dataset

50000

In [3]:
# Peek at ten randomly chosen rows to sanity-check the loaded columns
reviews_data.sample(n=10)

Unnamed: 0,review,sentiment
37694,A very promising directorial debut for Bill Pa...,positive
28151,This second film is just as interesting as the...,positive
46219,Why is Guy working for Buddy? Probably because...,negative
41758,I don't know what would be so great about this...,negative
16925,"Great music, but ain't these people PATHETIC?!...",negative
43165,I've tried to remember the name of this movie ...,positive
43134,"When a film is independent and not rated, such...",negative
45577,"I discovered ""The Patriot"" in a DVD-store and ...",negative
22993,Terrible movie. If there is one Turkish film y...,negative
19212,"With boundless, raw energy and an uncompromisi...",positive


In [5]:
# Break every review into whitespace-delimited tokens
tokenized_reviews = list(map(str.split, reviews_data['review']))

# Train Word2Vec on the tokenized reviews to obtain 100-dimensional word vectors
word_embedding_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to get the average word embedding for a review
def get_average_embedding(review, model):
    """Return the element-wise mean of the word vectors of a tokenized review.

    Args:
        review: Iterable of tokens. Tokens that are not in ``model.wv`` are
            skipped.
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``. When none of the tokens
        are known to the model (including an empty review) a float32 zero
        vector is returned.
    """
    word_vectors = model.wv  # look the attribute up once instead of per token
    embeddings = [word_vectors[word] for word in review if word in word_vectors]
    if not embeddings:
        # float32 (not NumPy's float64 default) so the fallback has the same
        # dtype as the float32 tensor these vectors are later converted to.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(embeddings, axis=0)

# Compute one averaged embedding vector per review
average_embeddings = [get_average_embedding(review, word_embedding_model) for review in tokenized_reviews]

# Stack the per-review vectors into a single ndarray before building the
# tensor: handing torch.tensor a Python list of ndarrays takes a very slow
# element-wise path and makes PyTorch emit a UserWarning about it.
X = torch.tensor(np.asarray(average_embeddings), dtype=torch.float32)

# Turn the sentiment strings into integer class ids and wrap them in a tensor
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(reviews_data['sentiment'])
y = torch.tensor(encoded_labels, dtype=torch.long)

# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define Neural Network
class FFNN(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of values produced for each input vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Apply both linear layers with a ReLU in between and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's global RNG so that weight initialisation and the DataLoader's
# batch shuffling start from a fixed state on every run (the train/test split
# is already pinned with random_state=42).
torch.manual_seed(42)

# Layer sizes: the input width is the embedding dimension and the output width
# is the number of distinct labels seen by the encoder
input_size = word_embedding_model.vector_size
hidden_size = 128
output_size = len(label_encoder.classes_)

# Initialize the model, loss function, and optimizer
model = FFNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Place the network and every data split on the selected device
model.to(device)
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)

# Pair the training features with their labels and serve them in shuffled mini-batches of 64
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Optimise the network for a fixed number of passes over the training data
epochs = 10
for epoch in range(epochs):
    running_loss = 0.0  # sum of the per-batch losses seen in this epoch
    for batch in train_loader:
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        # One optimisation step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Report the mean of the per-batch losses for the epoch
    print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(train_loader)}")

# Score the trained network on the held-out test split
model.eval()
with torch.no_grad():  # gradients are not needed for scoring
    test_outputs = model(X_test)
    # The index of the largest output in each row is the predicted class id
    _, predicted = torch.max(test_outputs, dim=1)

correct = (predicted == y_test).sum().item()
accuracy = correct / len(y_test)
print(f"Test Accuracy: {accuracy}")


Epoch 1/10 - Loss: 0.4731556166648865
Epoch 2/10 - Loss: 0.4198583607435226
Epoch 3/10 - Loss: 0.40935526888370516
Epoch 4/10 - Loss: 0.40368233346939086
Epoch 5/10 - Loss: 0.40023200221061705
Epoch 6/10 - Loss: 0.3971774401426315
Epoch 7/10 - Loss: 0.3985624214887619
Epoch 8/10 - Loss: 0.39699130005836486
Epoch 9/10 - Loss: 0.3927047257423401
Epoch 10/10 - Loss: 0.39049288387298586
Test Accuracy: 0.8252


In [6]:
from sklearn.metrics import precision_recall_fscore_support

# Predict on the test split again, then compute precision / recall / F1
model.eval()
with torch.no_grad():
    val_outputs = model(X_test)
    _, predicted = torch.max(val_outputs, dim=1)

# scikit-learn works on NumPy arrays, so copy predictions and labels to the CPU first
predicted = predicted.cpu().numpy()
y_test_cpu = y_test.cpu().numpy()

# average='weighted' is passed so a single averaged value is returned per metric
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test_cpu,
    predicted,
    average='weighted',
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.8257
Recall: 0.8252
F1 Score: 0.8252
