In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding


In [5]:
# Load data
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv') 
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,word_count,char_count,cleaned_review,sentiment_numeric,flesch_kincaid_grade,gunning_fog_index,lexical_diversity,nouns,verbs,adjectives,adverbs,tokens,dominant_topic,vader_sentiment,textblob_sentiment,vader_polarity,textblob_polarity
0,0,One of the other reviewers has mentioned that ...,positive,166,1116,one reviewer mentioned watching oz episode you...,1,68.0,70.98,0.825301,78,33,40,10,"['one', 'reviewer', 'mentioned', 'watching', '...",0,negative,positive,-0.9941,0.023881
1,1,A wonderful little production. <br /><br />The...,positive,84,640,wonderful little production filming technique ...,1,40.8,43.12,0.904762,33,18,20,11,"['wonderful', 'little', 'production', 'filming...",1,positive,positive,0.9571,0.127604
2,2,I thought this was a wonderful way to spend ti...,positive,85,572,thought wonderful way spend time hot summer we...,1,37.6,41.53,0.952941,39,19,18,6,"['thought', 'wonderful', 'way', 'spend', 'time...",2,positive,positive,0.9688,0.278571
3,3,Basically there's a family where a little boy ...,negative,67,443,basically there family little boy jake think t...,0,30.6,32.17,0.791045,32,13,12,5,"['basically', 'there', 'family', 'little', 'bo...",2,negative,positive,-0.9061,0.018056
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,125,843,petter matteis love time money visually stunni...,1,53.2,55.44,0.8,61,23,29,5,"['petter', 'matteis', 'love', 'time', 'money',...",1,positive,positive,0.9887,0.239534


In [6]:
# Check data size
print("Dataset Size:")
print(len(df))

Dataset Size:
50000


9- Feature Extraction for Sentiment Classification: Convert the text reviews into numerical representations suitable for
machine learning models. First, apply the Bag of Words (BoW) method, which represents the text based on word frequency
without considering word order. Next, implement TF-IDF to assign higher importance to less frequent but more meaningful words in the reviews. Finally, explore word embeddings such as Word2Vec, GloVe, or BERT to capture more advanced and
contextual word representations, providing richer semantic information for the sentiment classification models.

In [7]:
# Create Bag of Words (BoW) model
vectorizer_bow = CountVectorizer(max_features=5000)  # Limit to 5000 most frequent words
X_bow = vectorizer_bow.fit_transform(df['cleaned_review']).toarray()

# Check BoW features
print("BoW Feature Shape:", X_bow.shape)

BoW Feature Shape: (50000, 5000)


In [12]:
# Create TF-IDF model
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_review']).toarray()

# Check TF-IDF features
print("TF-IDF Feature Shape:", X_tfidf.shape)

TF-IDF Feature Shape: (50000, 5000)


In [None]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4)
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])

# Check Word2Vec features
print("Word2Vec Feature Shape:", X_word2vec.shape)

In [None]:
import gensim.downloader as api

# Load pre-trained GloVe embeddings
glove_model = api.load("glove-wiki-gigaword-100")  # 100-dimensional embeddings

# Convert reviews to GloVe vectors
def get_glove_embeddings(review):
    words = review.split()
    return np.mean([glove_model[word] for word in words if word in glove_model] or [np.zeros(100)], axis=0)

X_glove = np.array([get_glove_embeddings(review) for review in df['cleaned_review']])

# Check GloVe features
print("GloVe Feature Shape:", X_glove.shape)

GloVe Feature Shape: (50000, 100)


10- Sentiment Prediction Using Extracted Features: Build a sentiment classification model using the features extracted in
Task 9. Train the model on the training dataset using features extracted via Bag of Words (BoW), TF-IDF, and word
embeddings such as Word2Vec, GloVe, or BERT. After training, evaluate the performance of the model on the test dataset.
The goal is to predict whether a review is positive or negative based on these numerical representations. You are required to
compare the performance of various classifiers, including Logistic Regression, Support Vector Machines (SVM), Random
Forest, and Deep Learning models (LSTM or CNN). Each classifier will be applied to BoW, TF-IDF and word embeddings,
and the results should be evaluated using metrics such as accuracy, precision, recall, and F1-score.

In [8]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")


# BoW

In [9]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [10]:
# Train a Logistic Regression model
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train, y_train)
print("Logistic Regression with BoW:")
evaluate_model(y_test, clf_bow.predict(X_test))

Logistic Regression with BoW:
Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1-Score: 0.86


In [None]:
# Train a Support Vector Machine (SVM)
svm_bow = SVC(kernel='linear')
svm_bow.fit(X_train, y_train)
print("SVM with BoW:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a RF Classifier
rf_bow = RandomForestClassifier(n_estimators=100)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW:")
evaluate_model(y_test, rf_bow.predict(X_test))

Random Forest with BoW:
Accuracy: 0.84
Precision: 0.85
Recall: 0.84
F1-Score: 0.84


In [11]:
# LSTM model:
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1])) 
lstm_model_bow.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_bow.add(Dense(1, activation='sigmoid'))

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with BoW:")
evaluate_model(y_test, lstm_model_bow.predict(X_test))



Epoch 1/5


KeyboardInterrupt: 

# TF-IDF

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with tf-idf:")
evaluate_model(y_test, clf_tfidf.predict(X_test))

Logistic Regression model with tf-idf:
Accuracy: 0.88
Precision: 0.87
Recall: 0.90
F1-Score: 0.88


In [None]:
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with tf-idf:")
evaluate_model(y_test, svm_tfidf.predict(X_test))

In [None]:
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with tf-idf:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

Accuracy: 0.88
Precision: 0.87
Recall: 0.90
F1-Score: 0.89


In [None]:
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=5000, output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))

lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=1, batch_size=32)

print("LSTM with tf-idf:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))

# Word2Vec

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_word2vec = SVC(kernel='linear')
svm_word2vec.fit(X_train, y_train)
print("SVM with word2vec:")
evaluate_model(y_test, svm_word2vec.predict(X_test))

In [None]:
# Train a RF Classifier
rf_word2vec = RandomForestClassifier(n_estimators=100)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

Accuracy: 0.85
Precision: 0.85
Recall: 0.86
F1-Score: 0.86


In [None]:
# LSTM model:
lstm_model_word2vec = Sequential()
lstm_model_word2vec.add(Embedding(input_dim=100, output_dim=100, input_length=100)) 
lstm_model_word2vec.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_word2vec.add(Dense(1, activation='sigmoid'))

lstm_model_word2vec.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_word2vec.fit(X_train, y_train, epochs=1, batch_size=32)

print("LSTM with word2vec:")
evaluate_model(y_test, lstm_model_word2vec.predict(X_test))

# GloVe

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_glove, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_glove = LogisticRegression(max_iter=1000)
clf_glove.fit(X_train, y_train)
print("Logistic Regression model with glove")
evaluate_model(y_test, clf_glove.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_glove = SVC(kernel='linear')
svm_glove.fit(X_train, y_train)
print("SVM with glove:")
evaluate_model(y_test, svm_glove.predict(X_test))

In [None]:
# Train a RF Classifier
rf_glove = RandomForestClassifier(n_estimators=100)
rf_glove.fit(X_train, y_train)
print("Random Forest with glove:")
evaluate_model(y_test, rf_glove.predict(X_test))

Accuracy: 0.76
Precision: 0.76
Recall: 0.77
F1-Score: 0.76


In [None]:
# LSTM model:
lstm_model_glove = Sequential()
lstm_model_glove.add(Embedding(input_dim=100, output_dim=100, input_length=100)) 
lstm_model_glove.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_glove.add(Dense(1, activation='sigmoid'))

lstm_model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_glove.fit(X_train, y_train, epochs=1, batch_size=32)

print("LSTM with glove:")
evaluate_model(y_test, lstm_model_glove.predict(X_test))