# Preprocessing (can be skipped if Tasks 1-8 have been run before)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
import textstat

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from wordcloud import WordCloud
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Load data
df = pd.read_csv('IMDB_Dataset.csv') 
df.head()

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove special characters and digits
    text = re.sub(r'\W|\d', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatizing
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)
df['cleaned_review']

In [None]:
# Convert sentiment to numeric values: 1 for positive, 0 for negative
df['sentiment_numeric'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [None]:
# Save pre processed dataset to a file for further tasks
df.to_csv('IMDB_Dataset_Preprocessed.csv')

# Feature Extraction for Sentiment Classification
9- Feature Extraction for Sentiment Classification: Convert the text reviews into numerical representations suitable for machine learning models. First, apply the Bag of Words (BoW) method, which represents the text based on word frequency without considering word order. Next, implement TF-IDF to assign higher importance to less frequent but more meaningful words in the reviews. Finally, explore word embeddings such as Word2Vec, GloVe, or BERT to capture more advanced and contextual word representations, providing richer semantic information for the sentiment classification models.

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

In [None]:
# Load data
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv') 
df.head()


In [None]:
# Check data size
print("Dataset Size:")
print(len(df))

In [None]:
# Create Bag of Words (BoW) model
vectorizer_bow = CountVectorizer(max_features=5000)  # Limit to 5000 most frequent words
X_bow = vectorizer_bow.fit_transform(df['cleaned_review']).toarray()

# Check BoW features
print("BoW Feature Shape:", X_bow.shape)

In [None]:
# Create TF-IDF model
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_review']).toarray()

# Check TF-IDF features
print("TF-IDF Feature Shape:", X_tfidf.shape)

In [None]:
# Tokenize cleaned reviews for Word2Vec model
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing to all reviews
df['tokens'] = df['cleaned_review'].apply(tokenize_text)

In [10]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])

# Check Word2Vec features
print("Word2Vec Feature Shape:", X_word2vec.shape)

Word2Vec Feature Shape: (50000, 100)


# Sentiment Prediction Using Extracted Features
10- Sentiment Prediction Using Extracted Features: Build a sentiment classification model using the features extracted in Task 9. Train the model on the training dataset using features extracted via Bag of Words (BoW), TF-IDF, and word embeddings such as Word2Vec, GloVe, or BERT. After training, evaluate the performance of the model on the test dataset. The goal is to predict whether a review is positive or negative based on these numerical representations. You are required to compare the performance of various classifiers, including Logistic Regression, Support Vector Machines (SVM), Random Forest, and Deep Learning models (LSTM or CNN). Each classifier will be applied to BoW, TF-IDF and word embeddings, and the results should be evaluated using metrics such as accuracy, precision, recall, and F1-score.

In [11]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

## LSTM related classes and methods

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# 1. Define Custom Dataset Class
class NumpyDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = self.features[idx]
        y = self.targets[idx]
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        return x, y

# 2. Define LSTM Model Class
class LSTMNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x.unsqueeze(1), (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# 3. Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 4. Create Test Function
def test_model(model, test_loader):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device).view(-1, 1)
            outputs = model(inputs)
            all_preds.append(outputs.cpu().numpy())
            all_targets.append(targets.cpu().numpy())
    all_preds = np.concatenate(all_preds).flatten().round()
    all_targets = np.concatenate(all_targets).flatten()
    return all_targets, all_preds


# Define LSTM Training Parameters
input_size = 5000  # Input size (BoW features)
hidden_size = 128  # Example hidden layer size
num_layers = 2 # Number of LSTM layers
output_size = 1 # Output size (single scalar value)
batch_size = 32 # Define Batch Size
num_epochs = 10 # Number of epochs

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


## Part 1 - BoW with LR, SVM and RF

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train, y_train)
print("Logistic Regression with BoW:")
evaluate_model(y_test, clf_bow.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_bow = SGDClassifier(loss='hinge')  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with BoW:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a RF Classifier
rf_bow = RandomForestClassifier(n_estimators=100)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW:")
evaluate_model(y_test, rf_bow.predict(X_test))

## Part 2 - BoW with LSTM

In [None]:
print("X_bow Feature Shape:", X_bow.shape)

# Convert to numpy array
X_bow = np.asarray(X_bow)

scaler = StandardScaler()
model = scaler.fit(X_bow)
X_bow = model.transform(X_bow)

In [None]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with BoW:")
evaluate_model(y_test, y_pre)

## Part 3 - TF-IDF with LR, SVM and RF

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression with TF-IDF:")
evaluate_model(y_test, clf_tfidf.predict(X_test))


In [None]:
# Train a Support Vector Machine (SVM)
svm_bow = SGDClassifier(loss='hinge')  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with TF-IDF:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

## Part 4 - TF-IDF with LSTM

In [None]:
print("X_tfidf Feature Shape:", X_tfidf.shape)

# Convert to numpy array
X_tfidf = np.asarray(X_tfidf)

scaler = StandardScaler()
model = scaler.fit(X_tfidf)
X_tfidf = model.transform(X_tfidf)

In [None]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

In [None]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

In [None]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with TF-IDF:")
evaluate_model(y_test, y_pre)

## Part 5 - Word2Vec with LR, SVM and RF

In [12]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [13]:
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression with word2vec")
evaluate_model(y_test, clf_word2vec.predict(X_test))


Logistic Regression with word2vec
Accuracy: 0.87
Precision: 0.87
Recall: 0.88
F1-Score: 0.87


In [14]:
# Train a Support Vector Machine (SVM)
svm_bow = SGDClassifier(loss='hinge')  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with word2vec:")
evaluate_model(y_test, svm_bow.predict(X_test))

SVM with word2vec:
Accuracy: 0.86
Precision: 0.90
Recall: 0.82
F1-Score: 0.86


In [15]:
# Train a RF Classifier
rf_word2vec = RandomForestClassifier(n_estimators=100)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

Random Forest with word2vec:
Accuracy: 0.84
Precision: 0.83
Recall: 0.85
F1-Score: 0.84


## Part 6 - Word2Vec with LSTM

In [17]:
input_size = 100  # Input size (Word2Vec features)

print("X_word2vec Feature Shape:", X_word2vec.shape)

# Convert to numpy array
X_word2vec = np.asarray(X_word2vec)

scaler = StandardScaler()
model = scaler.fit(X_word2vec)
X_word2vec = model.transform(X_word2vec)

X_word2vec Feature Shape: (50000, 100)


In [18]:
# Prepare Dataset
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'].values, test_size=0.5, random_state=42)
print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)
train_dataset = NumpyDataset(X_train, y_train)
test_dataset = NumpyDataset(X_test, y_test)

Train Feature Shape: (25000, 100)
Train Labels Shape: (25000,)
Test Feature Shape: (25000, 100)
Test Labels Shape: (25000,)


In [19]:
# Prepare DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Instantiate Model, Define Loss and Optimizer
model = LSTMNetwork(input_size, hidden_size, num_layers, output_size)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Run Training Loops
train_model(model, train_loader, criterion, optimizer, num_epochs)

Epoch [1/10], Loss: 0.0786
Epoch [2/10], Loss: 0.1387
Epoch [3/10], Loss: 0.0807
Epoch [4/10], Loss: 0.0318
Epoch [5/10], Loss: 0.0109
Epoch [6/10], Loss: 0.0939
Epoch [7/10], Loss: 0.0232
Epoch [8/10], Loss: 0.0217
Epoch [9/10], Loss: 0.0036
Epoch [10/10], Loss: 0.0258


In [21]:
# Run Test
y_test, y_pre = test_model(model, test_loader)
print("LSTM with word2vec:")
evaluate_model(y_test, y_pre)

LSTM with word2vec:
Accuracy: 0.87
Precision: 0.85
Recall: 0.89
F1-Score: 0.87
