# Assignment 2

Mathia Quimpo, Matthew Chin, Wanqiu Zhang

## Exploration and Preprocessing

In [7]:
import pandas as pd
import numpy as np
import re
import os
import requests
import string
import nltk
import time
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.model_selection import GridSearchCV

In [8]:
# Load dataset
file_path = os.path.expanduser("~/Desktop/School/6200/Project/IMDB_Dataset.csv")
df = pd.read_csv(file_path)

print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [9]:
# Display column names
print("Columns in the dataset:\n", df.columns)

Columns in the dataset:
 Index(['review', 'sentiment'], dtype='object')


In [10]:
# Get general info
print(df.info())

# Descriptive statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000
review       0
sentiment    0
dtype: int64


In [11]:
# Display the entire review text
pd.set_option('display.max_colwidth', None)

In [12]:
# Add a new column for text length
df["Text_Length"] = df["review"].astype(str).apply(len)

# Display descriptive statistics for text length
print(df["Text_Length"].describe())

count    50000.000000
mean      1309.431020
std        989.728014
min         32.000000
25%        699.000000
50%        970.000000
75%       1590.250000
max      13704.000000
Name: Text_Length, dtype: float64


In [13]:
# Read a few random review samples
print(df["review"].sample(5, random_state=45))

42462                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   This movie was an absolute waste of time. It's nothing but a wanna-be gangster movie. It contains a very predictable plot. My feelings are unsympathetic to the characters, and the dialogue is mediocre at best. Half the time you are looking for something else to do, because the movie is that boring, since you already know what's going to happen. The other half of the time you're desperately hoping the protagonist grows a pair of balls or just ends his life by jumping off a bridge or something. Also, the secondary characters

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [15]:
# Text cleaning function
def clean_text(review):
    if not isinstance(review, str):
        return ""  # Handle missing values

    # 1. Remove HTML tags using a regex pattern
    review = re.sub(r"<.*?>", "", review)  # Remove HTML tags

    # 2. Replace URLs with a placeholder "URL"
    review = re.sub(r'http[s]?://\S+', 'URL', review)

    # 3. Convert to lowercase
    review = review.lower()

    # 4. Normalize whitespace (remove any leftover <br /> or extra spaces)
    review = re.sub(r"\s+", " ", review).strip()

    # 5. Remove special characters, numbers, and punctuation
    review = re.sub(r"[^a-z\s]", "", review)

    # 6. Tokenize the text into words
    words = word_tokenize(review)

    # 7. Remove stopwords
    words = [word for word in words if word not in stop_words]

    # 8. Lemmatize the tokens
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # 9. Convert back to string
    cleaned_text = " ".join(lemmatized_words)

    return cleaned_text, lemmatized_words

In [16]:
random_sample = df

In [17]:
# Apply text cleaning
random_sample[["cleaned_text", "tokens"]] = random_sample["review"].apply(clean_text).apply(pd.Series)

In [18]:
# Spot-checking
spotcheck_sample = random_sample[["review", "cleaned_text"]].sample(5, random_state=42)
print(spotcheck_sample)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [19]:
# Function to generate bigrams and trigrams
def generate_ngrams(text, n=2):
    tokens = word_tokenize(text.lower())
    n_grams = list(ngrams(tokens, n))
    return ["_".join(gram) for gram in n_grams]

# Generate bigrams and trigrams
random_sample["Bigrams"] = random_sample["cleaned_text"].apply(lambda x: generate_ngrams(x, 2) if isinstance(x, str) else [])
random_sample["Trigrams"] = random_sample["cleaned_text"].apply(lambda x: generate_ngrams(x, 3) if isinstance(x, str) else [])

# Check the results
print(random_sample[["review", "cleaned_text", "Bigrams", "Trigrams"]].sample(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

## TF-IDF Vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer for unigrams, bigrams, and trigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Fit and transform the cleaned text from random_sample to create features
X_tfidf = tfidf_vectorizer.fit_transform(random_sample["cleaned_text"])

# Display the shape of the feature matrix
print(f"Feature matrix shape: {X_tfidf.shape}")

Feature matrix shape: (50000, 8642892)


In [22]:
y = random_sample['sentiment']
# Split data into training and testing sets (80% train, 20% test)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train_tfidf.shape}")
print(f"Test set size: {X_test_tfidf.shape}")

Training set size: (40000, 8642892)
Test set size: (10000, 8642892)


## Logistic Regression

In [23]:
# Initialize the Logistic Regression model
log_reg_tfidf = LogisticRegression(class_weight='balanced', max_iter=1000)

# Train the model
log_reg_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_log_reg_tfidf = log_reg_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Logistic Regression (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_log_reg_tfidf))

Logistic Regression (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.86      0.88      4961
    positive       0.87      0.90      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



### Hyperparameter Tuning

In [24]:
# Initialize the Logistic Regression model
log_reg_tfidf = LogisticRegression(class_weight='balanced', C=10, max_iter=100)

# Train the model
log_reg_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_log_reg_tfidf = log_reg_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Logistic Regression (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_log_reg_tfidf))

Logistic Regression (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



## Naive-Bayes

In [25]:
# Initialize the Naive Bayes model
nb_tfidf = MultinomialNB()

# Train the model
nb_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_nb_tfidf))


Naive Bayes (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.90      0.89      4961
    positive       0.90      0.88      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



### Hyperparameter Tuning

In [26]:
# Initialize the Naive Bayes model
nb_tfidf = MultinomialNB(alpha=0.1)

# Train the model
nb_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_nb_tfidf))

Naive Bayes (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.90      0.89      4961
    positive       0.90      0.89      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



## Decision Tree

In [27]:
# Initialize the Decision Tree model
dt_tfidf = DecisionTreeClassifier(max_depth=30, min_samples_split=50, min_samples_leaf=20, max_features="sqrt", random_state=42)

# Train the model
dt_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_dt_tfidf = dt_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Decision Tree (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_dt_tfidf))


Decision Tree (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.59      0.58      0.58      4961
    positive       0.59      0.59      0.59      5039

    accuracy                           0.59     10000
   macro avg       0.59      0.59      0.59     10000
weighted avg       0.59      0.59      0.59     10000



### Hyperparameter Tuning

In [28]:
# Initialize the Decision Tree model
dt_tfidf = DecisionTreeClassifier(max_depth=45, min_samples_split=10, min_samples_leaf=20, max_features="sqrt", random_state=42)

# Train the model
dt_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_dt_tfidf = dt_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Decision Tree (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_dt_tfidf))


Decision Tree (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.60      0.57      0.58      4961
    positive       0.60      0.63      0.61      5039

    accuracy                           0.60     10000
   macro avg       0.60      0.60      0.60     10000
weighted avg       0.60      0.60      0.60     10000



## Random Forest

In [29]:
# Encode y_train
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [30]:
# Initialize the Random Forest model
rf_tfidf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    bootstrap=True,
    random_state=42)

# Train the model
rf_tfidf.fit(X_train_tfidf, y_train_encoded)

# Make predictions
y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

# Convert predictions back to original labels (optional)
y_pred_rf_tfidf_labels = label_encoder.inverse_transform(y_pred_rf_tfidf)

# Evaluate the model
print("Random Forest (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_rf_tfidf_labels))

Random Forest (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.75      0.74      4961
    positive       0.75      0.72      0.73      5039

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



### Hyperparameter Tuning

In [31]:
# Initialize the Random Forest model
rf_tfidf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    bootstrap=True,
    random_state=42)

# Train the model
rf_tfidf.fit(X_train_tfidf, y_train_encoded)

# Make predictions
y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

# Convert predictions back to original labels (optional)
y_pred_rf_tfidf_labels = label_encoder.inverse_transform(y_pred_rf_tfidf)

# Evaluate the model
print("Random Forest (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_rf_tfidf_labels))

Random Forest (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.79      0.81      4961
    positive       0.80      0.84      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



## Evaluation(Naive Bayesian)

In [34]:
sample_df = df.sample(n=100, random_state=42)
print("Selected 100 random rows from the dataset:")
print(sample_df.head())

Selected 100 random rows from the dataset:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [35]:
# Transform the text data in sample_df using the same TF-IDF vectorizer
X_sample_tfidf = tfidf_vectorizer.transform(sample_df["cleaned_text"])

In [36]:
# Predict the labels
predictions = nb_tfidf.predict(X_sample_tfidf)
probabilities = nb_tfidf.predict_proba(X_sample_tfidf)

In [37]:
sample_df["predicted_label"] = predictions
sample_df[["predicted_label", "sentiment","cleaned_text"]]

Unnamed: 0,predicted_label,sentiment,cleaned_text
33553,positive,positive,really liked summerslam due look arena curtain look overall interesting reason anyways could one best summerslams ever wwf didnt lex luger main event yokozuna time ok huge fat man v strong man im glad time changed terrible main event like every match luger terrible match card razor ramon v ted dibiase steiner brother v heavenly body shawn michael v curt hening event shawn named big monster body guard diesel irs v kid bret hart first take doink take jerry lawler stuff hart lawler always interesting ludvig borga destroyed marty jannetty undertaker took giant gonzalez another terrible match smoking gunns tatanka took bam bam bigelow headshrinkers yokozuna defended world title lex luger match boring terrible ending however deserves
9427,positive,positive,many television show appeal quite many different kind fan like farscape doesi know youngster year oldfans male female many different country think adore tv miniseries element found almost every show tv character driven drama could australian soap opera yet episode science fact fiction would give even hardiest trekkie run money brainbender stake wormhole theory time travel true equational formmagnificent embrace culture map possibility endless multiple star therefore thousand planet choose fromwith broad scope would expected nothing would able keep illusion long farscape really come elementit succeeds others failed especially like star trek universe practically zero kaos element ran idea pretty quickly kept rehashing course season manage keep audience attention using good continuity constant character evolution multiple thread every episode unique personal touch camera specific certain character group within whole structure allows extremely large area subject matter loyalty forged broken many way many many issue happened see pilot premiere passing keep tuning see crichton would ever get girl seeing television delighted see available dvd admit thing kept sane whilst hour night shift developed chronic insomniafarscape thing get extremely long nightsdo favour watch pilot see meanfarscape comet
199,negative,negative,film quickly get major chase scene ever increasing destruction first really bad thing guy hijacking steven seagal would beaten pulp seagals driving probably would ended whole premise movieit seems like decided make kind change movie plot plan enjoy action expect coherent plot turn sense logic may reduce chance getting headachei give hope steven seagal trying move back towards type character portrayed popular movie
12447,positive,positive,jane austen would definitely approve onegwyneth paltrow awesome job capturing attitude emma funny without excessively silly yet elegant put convincing british accent british maybe im best judge fooled meshe also excellent sliding doorsi sometimes forget shes american also brilliant jeremy northam sophie thompson phyllida law emma thompson sister mother bates woman nearly steal showand m law doesnt even lineshighly recommended
39489,negative,negative,expectation somewhat high went see movie thought steve carell could wrong coming great movie like anchorman yearold virgin little miss sunshine boy wrongill start right movie certain point steve carell allowed steve carell handful moment film made laugh due almost entirely given wiggleroom thing he undoubtedly talented individual shame signed turned opinion total trainwreckwith way ill discus went horrifyingly wrongthe film begin dan burn widower three girl considered nationally syndicated advice column prepares girl family reunion extended relative gather time otherthe family high atop list thing make awful movie family behaves like almost theyve transported pleasantville leave beaver caricature think family reach point become obnoxious simply frustrating touch football crossword puzzle competition family bowling talent show actual people behave almost sickeninganother big flaw woman carell supposed falling observing first scene steve carell like watching stroke victim trying rehabilitated imagine supposed unique original woman come mildly retardedit make think movie taking place another planet left theater wondering saw thinking dont think much
...,...,...,...
3928,positive,positive,walked movie theater expectation film witness everything illuminated walked joy barely come feel american film directorial debut actor leiv schreiber film follows man journey past accompanied eccentric group including brakedancing barely englishspeaking punk eukraine grandfather belief blind crazy dog first half film funny smart extremely european flavor usage small wonderful character second half film descends somber story discovery holocaust little movie brings many emotion many color wonderful conclusion story illumination also relationship connection acting incredibly powerful story mysterious interesting artistic appeal cinematography die brilliant absolutely touching scene everything illuminated managed capture heart
37286,positive,positive,buster keaton arguably enjoyable short minute film dont come ribticklingly funny gem dead pan comic get involved photographic mixup wanted felon lead elaborate evasion several street cop fellow passenger recognise face wanted sign goat chocabloc brilliant site gag opening scene bread queue right wonderful elevator chase end keaton film never feel though silence lacking sound never something needed movie explain wonderful yet incredibly dangerous thing isnt hard see influential really man every bit thoroughly amazing today
25045,negative,positive,beautiful woman backwoods inbred monster man super sweet monster truck road kill zombie brother friendsone anal retentive overly sensitive nerd foul mouthed adolescent slob throw together dash jeepers creeper texas chainsaw massacre road trip youve got monster man hilarious horrorcomedy outing never set say something simple straight forward laugh fest unpretentious well made horrorcomedy heart buddy flick film offered hilarious sickening set piece highly recommend fright fan looking entertained
15727,positive,positive,ok even cant stand liza movie truly hilarious scene john gielgud make liza one true romantic comedy classic th century dudley moore make drunk irresponsible look cute amusing damn fun watch oneliners best


## Evaluation(logistic Regression)

In [38]:
# Predict the labels
predictions = log_reg_tfidf.predict(X_sample_tfidf)
probabilities = log_reg_tfidf.predict_proba(X_sample_tfidf)

In [39]:
sample_df["predicted_loglabel"] = predictions
sample_df[["predicted_loglabel", "sentiment","cleaned_text"]]

Unnamed: 0,predicted_loglabel,sentiment,cleaned_text
33553,negative,positive,really liked summerslam due look arena curtain look overall interesting reason anyways could one best summerslams ever wwf didnt lex luger main event yokozuna time ok huge fat man v strong man im glad time changed terrible main event like every match luger terrible match card razor ramon v ted dibiase steiner brother v heavenly body shawn michael v curt hening event shawn named big monster body guard diesel irs v kid bret hart first take doink take jerry lawler stuff hart lawler always interesting ludvig borga destroyed marty jannetty undertaker took giant gonzalez another terrible match smoking gunns tatanka took bam bam bigelow headshrinkers yokozuna defended world title lex luger match boring terrible ending however deserves
9427,positive,positive,many television show appeal quite many different kind fan like farscape doesi know youngster year oldfans male female many different country think adore tv miniseries element found almost every show tv character driven drama could australian soap opera yet episode science fact fiction would give even hardiest trekkie run money brainbender stake wormhole theory time travel true equational formmagnificent embrace culture map possibility endless multiple star therefore thousand planet choose fromwith broad scope would expected nothing would able keep illusion long farscape really come elementit succeeds others failed especially like star trek universe practically zero kaos element ran idea pretty quickly kept rehashing course season manage keep audience attention using good continuity constant character evolution multiple thread every episode unique personal touch camera specific certain character group within whole structure allows extremely large area subject matter loyalty forged broken many way many many issue happened see pilot premiere passing keep tuning see crichton would ever get girl seeing television delighted see available dvd admit thing kept sane whilst hour night shift developed chronic insomniafarscape thing get extremely long nightsdo favour watch pilot see meanfarscape comet
199,negative,negative,film quickly get major chase scene ever increasing destruction first really bad thing guy hijacking steven seagal would beaten pulp seagals driving probably would ended whole premise movieit seems like decided make kind change movie plot plan enjoy action expect coherent plot turn sense logic may reduce chance getting headachei give hope steven seagal trying move back towards type character portrayed popular movie
12447,positive,positive,jane austen would definitely approve onegwyneth paltrow awesome job capturing attitude emma funny without excessively silly yet elegant put convincing british accent british maybe im best judge fooled meshe also excellent sliding doorsi sometimes forget shes american also brilliant jeremy northam sophie thompson phyllida law emma thompson sister mother bates woman nearly steal showand m law doesnt even lineshighly recommended
39489,negative,negative,expectation somewhat high went see movie thought steve carell could wrong coming great movie like anchorman yearold virgin little miss sunshine boy wrongill start right movie certain point steve carell allowed steve carell handful moment film made laugh due almost entirely given wiggleroom thing he undoubtedly talented individual shame signed turned opinion total trainwreckwith way ill discus went horrifyingly wrongthe film begin dan burn widower three girl considered nationally syndicated advice column prepares girl family reunion extended relative gather time otherthe family high atop list thing make awful movie family behaves like almost theyve transported pleasantville leave beaver caricature think family reach point become obnoxious simply frustrating touch football crossword puzzle competition family bowling talent show actual people behave almost sickeninganother big flaw woman carell supposed falling observing first scene steve carell like watching stroke victim trying rehabilitated imagine supposed unique original woman come mildly retardedit make think movie taking place another planet left theater wondering saw thinking dont think much
...,...,...,...
3928,positive,positive,walked movie theater expectation film witness everything illuminated walked joy barely come feel american film directorial debut actor leiv schreiber film follows man journey past accompanied eccentric group including brakedancing barely englishspeaking punk eukraine grandfather belief blind crazy dog first half film funny smart extremely european flavor usage small wonderful character second half film descends somber story discovery holocaust little movie brings many emotion many color wonderful conclusion story illumination also relationship connection acting incredibly powerful story mysterious interesting artistic appeal cinematography die brilliant absolutely touching scene everything illuminated managed capture heart
37286,positive,positive,buster keaton arguably enjoyable short minute film dont come ribticklingly funny gem dead pan comic get involved photographic mixup wanted felon lead elaborate evasion several street cop fellow passenger recognise face wanted sign goat chocabloc brilliant site gag opening scene bread queue right wonderful elevator chase end keaton film never feel though silence lacking sound never something needed movie explain wonderful yet incredibly dangerous thing isnt hard see influential really man every bit thoroughly amazing today
25045,positive,positive,beautiful woman backwoods inbred monster man super sweet monster truck road kill zombie brother friendsone anal retentive overly sensitive nerd foul mouthed adolescent slob throw together dash jeepers creeper texas chainsaw massacre road trip youve got monster man hilarious horrorcomedy outing never set say something simple straight forward laugh fest unpretentious well made horrorcomedy heart buddy flick film offered hilarious sickening set piece highly recommend fright fan looking entertained
15727,positive,positive,ok even cant stand liza movie truly hilarious scene john gielgud make liza one true romantic comedy classic th century dudley moore make drunk irresponsible look cute amusing damn fun watch oneliners best


In [40]:
# Import the time module
import time

# Time and evaluate Naive Bayes
start_time = time.time()
y_pred_nb = nb_tfidf.predict(X_test_tfidf)
nb_time = time.time() - start_time
nb_accuracy = accuracy_score(y_test, y_pred_nb)

# Time and evaluate Logistic Regression
start_time = time.time()
y_pred_lr = log_reg_tfidf.predict(X_test_tfidf)
lr_time = time.time() - start_time
lr_accuracy = accuracy_score(y_test, y_pred_lr)

# Define the comparison metrics
comparison_dict = {
    "Criteria": ["Speed", "Accuracy", "Interpretability", "Ease of Implementation"],
    "Naïve Bayes": [f"{nb_time:.4f} sec", f"{nb_accuracy:.2%}", "High", "Easy"],
    "Logistic Regression": [f"{lr_time:.4f} sec", f"{lr_accuracy:.2%}", "Medium", "Moderate"]
}

# Convert to a DataFrame
comparison_df = pd.DataFrame(comparison_dict)

# Display the table
print(comparison_df)

                 Criteria Naïve Bayes Logistic Regression
0                   Speed  0.2780 sec          0.0227 sec
1                Accuracy      89.53%              89.71%
2        Interpretability        High              Medium
3  Ease of Implementation        Easy            Moderate
