## Part 1 - TF-IDF with LR, SVM and RF

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [2]:
# Read the preprocessed IMDB reviews from the CSV in the working directory
df = pd.read_csv('IMDB_Dataset_Preprocessed.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaned_review,sentiment_numeric,tokens
0,0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1,"['one', 'reviewer', 'mentioned', 'watching', '..."
1,1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1,"['wonderful', 'little', 'production', 'filming..."
2,2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1,"['thought', 'wonderful', 'way', 'spend', 'time..."
3,3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0,"['basically', 'there', 'family', 'little', 'bo..."
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1,"['petter', 'matteis', 'love', 'time', 'money',..."


In [3]:
# Report the number of rows (reviews) in the loaded frame
print("Dataset Size:", len(df), sep="\n")

Dataset Size:
50000


In [4]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each metric is computed by the matching scikit-learn scorer with its
    default settings and printed on its own line with two decimals.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class labels.
    predicted_labels : array-like
        Labels predicted by a classifier, aligned with ``true_labels``.

    Returns
    -------
    None
        The metrics are only printed, never returned.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    # Compute every metric before printing anything, so a scorer error cannot
    # leave a half-printed report behind.
    report = [(label, scorer(true_labels, predicted_labels)) for label, scorer in scorers]
    for label, value in report:
        print(f"{label}: {value:.2f}")

## 3000 Features

### Unigram

In [5]:
# TF-IDF features over single words, restricted to the 3000 terms with the
# highest corpus frequency (max_features). ngram_range=(1, 1) is the default,
# spelled out here for clarity.
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_unigram = TfidfVectorizer(ngram_range=(1, 1), max_features=3000)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review']).toarray()

print("TF-IDF Unigram Feature Shape with 3000 features:", X_tfidf_unigram.shape)

# Hold out half of the reviews for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_unigram,
    df['sentiment_numeric'],
    test_size=0.5,
    random_state=42,
)

TF-IDF Unigram Feature Shape with 3000 features: (50000, 3000)


In [6]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using unigrams with 3000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))

Logistic Regression with TF-IDF using unigrams with 3000 features:
Accuracy: 0.88
Precision: 0.87
Recall: 0.90
F1-Score: 0.88


In [7]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

SVM with TF-IDF using unigrams with 3000 features:
Accuracy: 0.88
Precision: 0.85
Recall: 0.92
F1-Score: 0.88


In [8]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

Random Forest with TF-IDF using unigrams with 3000 features:
Accuracy: 0.84
Precision: 0.85
Recall: 0.84
F1-Score: 0.84


### Bigram (unigrams + bigrams)

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), restricted to
# the 3000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_bigram = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=3000,
)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review']).toarray()

print("TF-IDF Bigram Feature Shape with 3000 features:", X_tfidf_bigram.shape)

# Hold out half of the reviews for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_bigram,
    df['sentiment_numeric'],
    test_size=0.5,
    random_state=42,
)

In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using bigrams with 3000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Trigram (unigrams + bigrams + trigrams)

In [None]:
# Build TF-IDF features over 1- to 3-grams (ngram_range=(1, 3)), keeping only
# the 3000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=3000, ngram_range=(1, 3))
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Trigram Feature Shape with 3000 features:", X_tfidf_trigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:

# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using trigrams with 3000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 3000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

## 5000 Features

### Unigram

In [None]:
# Unigram (the default ngram_range): build TF-IDF features over single words,
# keeping only the 5000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=5000)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Unigram Feature Shape with 5000 features:", X_tfidf_unigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using unigrams with 5000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Bigram (unigrams + bigrams)

In [None]:
# Build TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), keeping
# only the 5000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Bigram Feature Shape with 5000 features:", X_tfidf_bigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using bigrams with 5000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Trigram (unigrams + bigrams + trigrams)

In [None]:
# Build TF-IDF features over 1- to 3-grams (ngram_range=(1, 3)), keeping only
# the 5000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Trigram Feature Shape with 5000 features:", X_tfidf_trigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using trigrams with 5000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 5000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))


## 7000 Features

### Unigram

In [None]:
# Unigram (the default ngram_range): build TF-IDF features over single words,
# keeping only the 7000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_unigram = TfidfVectorizer(max_features=7000)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_unigram = vectorizer_tfidf_unigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Unigram Feature Shape with 7000 features:", X_tfidf_unigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigram, df['sentiment_numeric'], test_size=0.5, random_state=42)

In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using unigrams with 7000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using unigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))

### Bigram (unigrams + bigrams)

In [None]:
# Build TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), keeping
# only the 7000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_bigram = TfidfVectorizer(max_features=7000, ngram_range=(1, 2))
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_bigram = vectorizer_tfidf_bigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Bigram Feature Shape with 7000 features:", X_tfidf_bigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigram, df['sentiment_numeric'], test_size=0.5, random_state=42)


In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using bigrams with 7000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))


In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))


In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using bigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))


### Trigram (unigrams + bigrams + trigrams)

In [None]:
# Build TF-IDF features over 1- to 3-grams (ngram_range=(1, 3)), keeping only
# the 7000 terms with the highest corpus frequency (max_features).
# NOTE(review): the vectorizer is fitted on every review before the split below,
# so vocabulary and IDF weights also see the test documents; consider fitting
# on the training texts only.
vectorizer_tfidf_trigram = TfidfVectorizer(max_features=7000, ngram_range=(1, 3))
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
X_tfidf_trigram = vectorizer_tfidf_trigram.fit_transform(df['cleaned_review']).toarray()

# These features are TF-IDF weights (not Bag-of-Words counts), so label them as such.
print("TF-IDF Trigram Feature Shape with 7000 features:", X_tfidf_trigram.shape)

# Split data into training and testing (50/50, seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_trigram, df['sentiment_numeric'], test_size=0.5, random_state=42)


In [None]:
# Logistic Regression on the current TF-IDF train/test split.
# max_iter=1000 caps the solver's iterations; fit() returns the estimator itself.
clf_bow = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Logistic Regression with TF-IDF using trigrams with 7000 features:")
evaluate_model(true_labels=y_test, predicted_labels=clf_bow.predict(X_test))

In [None]:
# Train a linear Support Vector Machine (SVM) with stochastic gradient descent.
# random_state pins the shuffling of the training data so the reported metrics
# are reproducible across runs (same seed value as the train/test split).
svm_bow = SGDClassifier(loss='hinge', random_state=42)  # 'hinge' loss corresponds to a linear SVM
svm_bow.fit(X_train, y_train)
print("SVM with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [None]:
# Train a Random Forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and per-split feature selection so
# the reported metrics are reproducible across runs.
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train, y_train)
print("Random Forest with TF-IDF using trigrams with 7000 features:")
evaluate_model(y_test, rf_bow.predict(X_test))