## Part 1 - Word2Vec with LR, SVM and RF

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK data packages requested by this notebook.
nltk.download('punkt_tab')  # presumably the tokenizer data word_tokenize needs -- confirm
nltk.download('stopwords')  # NOTE(review): not referenced in the visible code -- confirm still needed
nltk.download('wordnet')  # NOTE(review): not referenced in the visible code -- confirm still needed
nltk.download('averaged_perceptron_tagger')  # NOTE(review): not referenced in the visible code -- confirm still needed


In [None]:
# Load the preprocessed IMDB review CSV into a DataFrame and preview its first rows
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv') 
df.head()

In [None]:
# Report how many rows were loaded (label and count on separate lines)
print("Dataset Size:", len(df), sep="\n")

In [None]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1-score for a set of predictions.

    Every metric is computed with the scorer's default settings; the four
    values are then printed one per line, formatted to two decimal places.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model, in the same order.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    # Compute every score before printing so a failing scorer leaves no partial output.
    results = [(label, scorer(true_labels, predicted_labels)) for label, scorer in scorers]
    for label, value in results:
        print(f"{label}: {value:.2f}")

In [None]:
# Tokenize cleaned reviews for Word2Vec model
def tokenize_text(text):
    """Split a cleaned review into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: The review text. Non-string values (for example ``None`` or the
            ``NaN`` that pandas produces by default for an empty CSV cell) are
            treated as containing no tokens.

    Returns:
        list: The tokens of ``text``, or an empty list for non-string input.
    """
    # word_tokenize only accepts strings; a missing review would otherwise
    # raise here, so report "no tokens" instead.
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Tokenize every cleaned review and store the token lists in the 'tokens' column
df['tokens'] = df['cleaned_review'].map(tokenize_text)

## Tuning Window Size

### Window Size 3

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (window size 3)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=3, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using window size 3:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using window size 3")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using window size 3:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using window size 3:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Window Size 5

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (window size 5)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using window size 5:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using window size 5")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using window size 5:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using window size 5:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Window Size 7

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (window size 7)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=7, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using window size 7:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using window size 7")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using window size 7:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using window size 7:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

## Tuning Embedding Dimension

### Embedding Dimension 50

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (embedding dimension 50)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=50, window=5, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 50,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using embedding dimension 50:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using embedding dimension 50")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using embedding dimension 50:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using embedding dimension 50:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Embedding Dimension 100

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (embedding dimension 100)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using embedding dimension 100:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using embedding dimension 100")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using embedding dimension 100:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using embedding dimension 100:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Embedding Dimension 200

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (embedding dimension 200)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=200, window=5, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 200,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using embedding dimension 200:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using embedding dimension 200")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using embedding dimension 200:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using embedding dimension 200:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

## Tuning Epoch Number

### Epoch Number 5

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (5 training epochs)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=5)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using epoch number 5:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using epoch number 5")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using epoch number 5:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using epoch number 5:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Epoch Number 10

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (10 training epochs)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=10)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using epoch number 10:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using epoch number 10")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using epoch number 10:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using epoch number 10:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

### Epoch Number 20

In [None]:
# Train Word2Vec embeddings on the tokenized reviews (20 training epochs)
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4, epochs=20)

# Represent each review as the mean of its in-vocabulary word vectors.
# A review with no in-vocabulary words falls back to a single zero vector whose
# length and dtype come from the trained model rather than a hard-coded 100,
# so the fallback can never disagree with vector_size or change the matrix dtype.
X_word2vec = np.array([
    np.mean(
        [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
        or [np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)],
        axis=0,
    )
    for words in df['tokens']
])
print("Word2Vec Shape using epoch number 20:", X_word2vec.shape)

# Split data into training and testing (half held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Fit a Logistic Regression classifier (up to 1000 solver iterations)
# on the training split, then report metrics on the test split.
clf_word2vec = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with word2vec using epoch number 20")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM) with stochastic gradient descent;
# 'hinge' loss corresponds to a linear SVM.
# random_state is fixed (same seed as the train/test split) so the SGD shuffling
# is reproducible and scores can be compared across Word2Vec settings.
# NOTE: the variable keeps its historical name `svm_bow`; the features here are Word2Vec averages.
svm_bow = SGDClassifier(loss='hinge', random_state=42)
svm_bow.fit(X_train, y_train)
print("SVM with word2vec using epoch number 20:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest is
# reproducible and scores can be compared across Word2Vec settings.
rf_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec using epoch number 20:")
evaluate_model(y_test, rf_word2vec.predict(X_test))