# Assignment 2

Mathia Quimpo, Matthew Chin, Wanqiu Zhang

## Exploration and Preprocessing

In [51]:
import pandas as pd
import numpy as np
import re
import os
import requests
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import matplotlib.pyplot as plt

In [5]:
# Load dataset
df = pd.read_csv(r"C:\Users\matth\Desktop\MSBA\BSAN 6200\Assignments\Assignment 2\IMDB Dataset.csv")

print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
# Display column names
print("Columns in the dataset:\n", df.columns)

Columns in the dataset:
 Index(['review', 'sentiment'], dtype='object')


In [7]:
# Get general info
print(df.info())

# Descriptive statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000
review       0
sentiment    0
dtype: int64


In [8]:
# Display the entire review text 
pd.set_option('display.max_colwidth', None)

In [9]:
# Add a new column for text length
df["Text_Length"] = df["review"].astype(str).apply(len)

# Display descriptive statistics for text length
print(df["Text_Length"].describe())

count    50000.000000
mean      1309.431020
std        989.728014
min         32.000000
25%        699.000000
50%        970.000000
75%       1590.250000
max      13704.000000
Name: Text_Length, dtype: float64


In [10]:
# Read a few random review samples
print(df["review"].sample(5, random_state=45))

42462                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   This movie was an absolute waste of time. It's nothing but a wanna-be gangster movie. It contains a very predictable plot. My feelings are unsympathetic to the characters, and the dialogue is mediocre at best. Half the time you are looking for something else to do, because the movie is that boring, since you already know what's going to happen. The other half of the time you're desperately hoping the protagonist grows a pair of balls or just ends his life by jumping off a bridge or something. Also, the secondary characters

In [11]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [12]:
# Text cleaning function
def clean_text(review):
    if not isinstance(review, str):
        return ""  # Handle missing values
    
    # 1. Remove HTML tags using a regex pattern
    review = re.sub(r"<.*?>", "", review)  # Remove HTML tags

    # 2. Replace URLs with a placeholder "URL"
    review = re.sub(r'http[s]?://\S+', 'URL', review)

    # 3. Convert to lowercase
    review = review.lower()

    # 4. Normalize whitespace (remove any leftover <br /> or extra spaces)
    review = re.sub(r"\s+", " ", review).strip()

    # 5. Remove special characters, numbers, and punctuation
    review = re.sub(r"[^a-z\s]", "", review)

    # 6. Tokenize the text into words
    words = word_tokenize(review)

    # 7. Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # 8. Lemmatize the tokens
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # 9. Convert back to string
    cleaned_text = " ".join(lemmatized_words)
    
    return cleaned_text, lemmatized_words

In [13]:
# Select 100 random rows
random_sample = df.sample(100)

In [14]:
# Apply text cleaning
random_sample[["cleaned_text", "tokens"]] = random_sample["review"].apply(clean_text).apply(pd.Series)

In [15]:
# Spot-checking
spotcheck_sample = random_sample[["review", "cleaned_text"]].sample(5, random_state=45)
print(spotcheck_sample)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [16]:
# Function to generate bigrams and trigrams
def generate_ngrams(text, n=2):
    tokens = word_tokenize(text.lower())  
    n_grams = list(ngrams(tokens, n))  
    return ["_".join(gram) for gram in n_grams]  

# Generate bigrams and trigrams
random_sample["Bigrams"] = random_sample["cleaned_text"].apply(lambda x: generate_ngrams(x, 2) if isinstance(x, str) else [])
random_sample["Trigrams"] = random_sample["cleaned_text"].apply(lambda x: generate_ngrams(x, 3) if isinstance(x, str) else [])

# Check the results
print(random_sample[["review", "cleaned_text", "Bigrams", "Trigrams"]].sample(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

## TF-IDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer for unigrams, bigrams, and trigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Fit and transform the cleaned text from random_sample to create features
X_tfidf = tfidf_vectorizer.fit_transform(random_sample["cleaned_text"])

# Display the shape of the feature matrix
print(f"Feature matrix shape: {X_tfidf.shape}")

Feature matrix shape: (100, 29340)


In [19]:
y = random_sample['sentiment']
# Split data into training and testing sets (80% train, 20% test)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train_tfidf.shape}")
print(f"Test set size: {X_test_tfidf.shape}")

Training set size: (80, 29340)
Test set size: (20, 29340)


## Logistic Regression

In [21]:
# Initialize the Logistic Regression model
log_reg_tfidf = LogisticRegression(max_iter=1000)

# Train the model
log_reg_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_log_reg_tfidf = log_reg_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Logistic Regression (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_log_reg_tfidf))

Logistic Regression (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.45      1.00      0.62         9
    positive       0.00      0.00      0.00        11

    accuracy                           0.45        20
   macro avg       0.23      0.50      0.31        20
weighted avg       0.20      0.45      0.28        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Naive-Bayes

In [23]:
# Initialize the Naive Bayes model
nb_tfidf = MultinomialNB()

# Train the model
nb_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Naive Bayes (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_nb_tfidf))


Naive Bayes (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.45      1.00      0.62         9
    positive       0.00      0.00      0.00        11

    accuracy                           0.45        20
   macro avg       0.23      0.50      0.31        20
weighted avg       0.20      0.45      0.28        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Decision Tree

In [25]:
# Initialize the Decision Tree model
dt_tfidf = DecisionTreeClassifier()

# Train the model
dt_tfidf.fit(X_train_tfidf, y_train)

# Make predictions
y_pred_dt_tfidf = dt_tfidf.predict(X_test_tfidf)

# Evaluate the model
print("Decision Tree (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_dt_tfidf))


Decision Tree (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.55      0.67      0.60         9
    positive       0.67      0.55      0.60        11

    accuracy                           0.60        20
   macro avg       0.61      0.61      0.60        20
weighted avg       0.61      0.60      0.60        20



## Gradient Boosting (XGBoost)

In [26]:
# Convert string labels ('negative', 'positive') to numeric (0,1)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Convert y_train
y_test_encoded = label_encoder.transform(y_test)        # Convert y_test

# Initialize XGBoost model
xgb_tfidf = XGBClassifier(eval_metric='logloss', random_state=42)

# Train the model
xgb_tfidf.fit(X_train_tfidf, y_train_encoded)

# Make predictions
y_pred_xgb_tfidf = xgb_tfidf.predict(X_test_tfidf)

# Convert predictions back to original labels (optional)
y_pred_xgb_tfidf_labels = label_encoder.inverse_transform(y_pred_xgb_tfidf)

# Evaluate the model
print("XGBoost (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_xgb_tfidf_labels))

XGBoost (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.44      0.53         9
    positive       0.64      0.82      0.72        11

    accuracy                           0.65        20
   macro avg       0.65      0.63      0.63        20
weighted avg       0.65      0.65      0.64        20



## Gradient Boosting (LightGBM)

In [59]:
# Initialize the LightGBM model
lgb_tfidf = lgb.LGBMClassifier(verbose=-1)

# Train the model
lgb_tfidf.fit(X_train_tfidf, y_train_encoded)

# Make predictions
y_pred_lgb_tfidf = lgb_tfidf.predict(X_test_tfidf)

# Convert predictions back to original labels (optional)
y_pred_lgb_tfidf_labels = label_encoder.inverse_transform(y_pred_lgb_tfidf)

# Evaluate the model
print("LightGBM (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_lgb_tfidf_labels))


LightGBM (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.67      0.67         9
    positive       0.73      0.73      0.73        11

    accuracy                           0.70        20
   macro avg       0.70      0.70      0.70        20
weighted avg       0.70      0.70      0.70        20



## Random Forest

In [68]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_tfidf.fit(X_train_tfidf, y_train_encoded)

# Make predictions
y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

# Convert predictions back to original labels (optional)
y_pred_rf_tfidf_labels = label_encoder.inverse_transform(y_pred_rf_tfidf)

# Evaluate the model
print("Random Forest (TF-IDF) Classification Report:")
print(classification_report(y_test, y_pred_rf_tfidf_labels))


Random Forest (TF-IDF) Classification Report:
              precision    recall  f1-score   support

    negative       0.45      1.00      0.62         9
    positive       0.00      0.00      0.00        11

    accuracy                           0.45        20
   macro avg       0.23      0.50      0.31        20
weighted avg       0.20      0.45      0.28        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
