In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,word_count,char_count,cleaned_review,sentiment_numeric,flesch_kincaid_grade,gunning_fog_index,lexical_diversity,nouns,verbs,adjectives,adverbs,tokens,dominant_topic,vader_sentiment,textblob_sentiment,vader_polarity,textblob_polarity
0,0,One of the other reviewers has mentioned that ...,positive,166,1116,one reviewer mentioned watching oz episode you...,1,68.0,70.98,0.825301,78,33,40,10,"['one', 'reviewer', 'mentioned', 'watching', '...",2,negative,positive,-0.9941,0.023881
1,1,A wonderful little production. <br /><br />The...,positive,84,640,wonderful little production filming technique ...,1,40.8,43.12,0.904762,33,18,20,11,"['wonderful', 'little', 'production', 'filming...",6,positive,positive,0.9571,0.127604
2,2,I thought this was a wonderful way to spend ti...,positive,85,572,thought wonderful way spend time hot summer we...,1,37.6,41.53,0.952941,39,19,18,6,"['thought', 'wonderful', 'way', 'spend', 'time...",3,positive,positive,0.9688,0.278571
3,3,Basically there's a family where a little boy ...,negative,67,443,basically there family little boy jake think t...,0,30.6,32.17,0.791045,32,13,12,5,"['basically', 'there', 'family', 'little', 'bo...",3,negative,positive,-0.9061,0.018056
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,125,843,petter matteis love time money visually stunni...,1,53.2,55.44,0.8,61,23,29,5,"['petter', 'matteis', 'love', 'time', 'money',...",7,positive,positive,0.9887,0.239534


In [3]:
# Check data size
print("Dataset Size:")
print(len(df))

Dataset Size:
50000


9- Feature Extraction for Sentiment Classification: Convert the text reviews into numerical representations suitable for
machine learning models. First, apply the Bag of Words (BoW) method, which represents the text based on word frequency
without considering word order. Next, implement TF-IDF to assign higher importance to less frequent but more meaningful words in the reviews. Finally, explore word embeddings such as Word2Vec, GloVe, or BERT to capture more advanced and
contextual word representations, providing richer semantic information for the sentiment classification models.

In [4]:
# Create Bag of Words (BoW) model
vectorizer_bow = CountVectorizer(max_features=5000)  # Limit to 5000 most frequent words
X_bow = vectorizer_bow.fit_transform(df['cleaned_review']).toarray()

# Check BoW features
print("BoW Feature Shape:", X_bow.shape)

BoW Feature Shape: (50000, 5000)


In [5]:
scaler = StandardScaler()
model = scaler.fit(X_bow)
X_bow = model.transform(X_bow)

In [6]:
# Create TF-IDF model
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_review']).toarray()

# Check TF-IDF features
print("TF-IDF Feature Shape:", X_tfidf.shape)

TF-IDF Feature Shape: (50000, 5000)


In [7]:
scaler = StandardScaler()
model = scaler.fit(X_tfidf)
X_tfidf = model.transform(X_tfidf)

In [8]:
# Train Word2Vec model
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing to all reviews
df['tokens'] = df['cleaned_review'].apply(tokenize_text)

word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4)
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])

# Check Word2Vec features
print("Word2Vec Feature Shape:", X_word2vec.shape)

Word2Vec Feature Shape: (50000, 100)


In [9]:
scaler = StandardScaler()
model = scaler.fit(X_word2vec)
X_word2vec = model.transform(X_word2vec)

10- Sentiment Prediction Using Extracted Features: Build a sentiment classification model using the features extracted in
Task 9. Train the model on the training dataset using features extracted via Bag of Words (BoW), TF-IDF, and word
embeddings such as Word2Vec, GloVe, or BERT. After training, evaluate the performance of the model on the test dataset.
The goal is to predict whether a review is positive or negative based on these numerical representations. You are required to
compare the performance of various classifiers, including Logistic Regression, Support Vector Machines (SVM), Random
Forest, and Deep Learning models (LSTM or CNN). Each classifier will be applied to BoW, TF-IDF and word embeddings,
and the results should be evaluated using metrics such as accuracy, precision, recall, and F1-score.

In [10]:
# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [11]:
# 1. Define Custom Dataset Class
class NumpyDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.targets[idx]
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        return x, y

# 2. Define LSTM Model Class
class LSTMNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))
        out = self.fc(out[:, -1, :])
        out = torch.sigmoid(out)
        return out

# 3. Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.8f}')

# 4. Create Test Function
def test_model(model, test_loader):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            outputs = model(inputs)
            all_preds.append(outputs.cpu().numpy())
            all_targets.append(targets.cpu().numpy())
    all_preds = np.concatenate(all_preds).flatten().round()
    all_targets = np.concatenate(all_targets).flatten()
    return all_targets, all_preds

# 5. Create Evaluation Function
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

In [12]:
# Define Training Parameters
hidden_size = 128  # Example hidden layer size
num_layers = 2 # Number of LSTM layers
output_size = 1 # Output size (single scalar value)
batch_size = 256 # Define Batch Size
num_epochs = 10 # Number of epochs

# BoW

In [13]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [14]:
# Train a Logistic Regression model
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train, y_train)
print("Logistic Regression with BoW:")
evaluate_model(y_test, clf_bow.predict(X_test))

Logistic Regression with BoW:
Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1-Score: 0.82


In [15]:
# Train a Support Vector Machine (SVM)
svm_bow = SVC(kernel='linear')
svm_bow.fit(X_train, y_train)
print("SVM with BoW:")
evaluate_model(y_test, svm_bow.predict(X_test))

SVM with BoW:
Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1-Score: 0.82


In [16]:
# Train a RF Classifier
rf_bow = RandomForestClassifier(n_estimators=100)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW:")
evaluate_model(y_test, rf_bow.predict(X_test))

Random Forest with BoW:
Accuracy: 0.84
Precision: 0.85
Recall: 0.84
F1-Score: 0.84


In [17]:
# LSTM model
# Prepare Dataset
input_size = 5000  # Number of features
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW:")
evaluate_model(y_test, y_pre)

Epoch [1/10], Loss: 0.09935626
Epoch [2/10], Loss: 0.01869978
Epoch [3/10], Loss: 0.00050604
Epoch [4/10], Loss: 0.00016974
Epoch [5/10], Loss: 0.00011066
Epoch [6/10], Loss: 0.00010869
Epoch [7/10], Loss: 0.00004452
Epoch [8/10], Loss: 0.00003513
Epoch [9/10], Loss: 0.00002754
Epoch [10/10], Loss: 0.00002235
LSTM with BoW:
Accuracy: 0.87
Precision: 0.86
Recall: 0.87
F1-Score: 0.87


# TF-IDF

In [18]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [19]:
# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with tf-idf:")
evaluate_model(y_test, clf_tfidf.predict(X_test))

Logistic Regression model with tf-idf:
Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1-Score: 0.82


In [20]:
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with tf-idf:")
evaluate_model(y_test, svm_tfidf.predict(X_test))

SVM with tf-idf:
Accuracy: 0.81
Precision: 0.81
Recall: 0.81
F1-Score: 0.81


In [21]:
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with tf-idf:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

Random Forest with tf-idf:
Accuracy: 0.84
Precision: 0.84
Recall: 0.84
F1-Score: 0.84


In [22]:
# LSTM model
# Prepare Dataset
input_size = 5000  # Number of features
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with tf-idf:")
evaluate_model(y_test, y_pre)

Epoch [1/10], Loss: 0.09836783
Epoch [2/10], Loss: 0.00815309
Epoch [3/10], Loss: 0.00364960
Epoch [4/10], Loss: 0.00011586
Epoch [5/10], Loss: 0.00008695
Epoch [6/10], Loss: 0.00005566
Epoch [7/10], Loss: 0.00004445
Epoch [8/10], Loss: 0.00003449
Epoch [9/10], Loss: 0.00002536
Epoch [10/10], Loss: 0.00002077
LSTM with tf-idf:
Accuracy: 0.86
Precision: 0.85
Recall: 0.88
F1-Score: 0.86


# Word2Vec

In [23]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

In [24]:
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec")
evaluate_model(y_test, clf_word2vec.predict(X_test))

Logistic Regression model with word2vec
Accuracy: 0.86
Precision: 0.85
Recall: 0.87
F1-Score: 0.86


In [25]:
# Train a Support Vector Machine (SVM)
svm_word2vec = SVC(kernel='linear')
svm_word2vec.fit(X_train, y_train)
print("SVM with word2vec:")
evaluate_model(y_test, svm_word2vec.predict(X_test))

SVM with word2vec:
Accuracy: 0.86
Precision: 0.85
Recall: 0.87
F1-Score: 0.86


In [26]:
# Train a RF Classifier
rf_word2vec = RandomForestClassifier(n_estimators=100)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

Random Forest with word2vec:
Accuracy: 0.83
Precision: 0.82
Recall: 0.85
F1-Score: 0.84


In [27]:
# LSTM model
# Prepare Dataset
input_size = 100  # Number of features
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with word2vec:")
evaluate_model(y_test, y_pre)

Epoch [1/10], Loss: 0.10251184
Epoch [2/10], Loss: 0.10751328
Epoch [3/10], Loss: 0.10290660
Epoch [4/10], Loss: 0.08522266
Epoch [5/10], Loss: 0.07654339
Epoch [6/10], Loss: 0.06789526
Epoch [7/10], Loss: 0.08451746
Epoch [8/10], Loss: 0.07823770
Epoch [9/10], Loss: 0.06051233
Epoch [10/10], Loss: 0.06286611
LSTM with word2vec:
Accuracy: 0.86
Precision: 0.86
Recall: 0.87
F1-Score: 0.86
