In [1]:
# Standard Library
import os
from collections import Counter

# Data Manipulation
import pandas as pd
import numpy as np
import pyarrow.parquet

# Text Processing and NLP
import emoji
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Machine Learning
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, accuracy_score, silhouette_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download the VADER lexicon used for the VADER sentiment score computed later in
# this notebook. The call is safe to repeat: nltk reports the package as
# up-to-date when it is already installed.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Feature Extraction - Without Stopwords

In [2]:
# Read the cleaned review dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="00_dataset/without_stopwords/cleaned_reviews.csv")
df.head(n=5)

Unnamed: 0,user_id,prod_id,review_text,rating,label,cleaned_text
0,5044,0,"Drinks were bad, the hot chocolate was watered...",1.0,1,"Drinks bad , hot chocolate water latte burnt t..."
1,5045,0,This was the worst experience I've ever had a ...,1.0,1,This bad experience I ever casual coffee/light...
2,5046,0,This is located on the site of the old Spruce ...,3.0,1,This locate site old Spruce St. Video . The mi...
3,5047,0,I enjoyed coffee and breakfast twice at Toast ...,5.0,1,I enjoy coffee breakfast twice Toast recent vi...
4,5048,0,I love Toast! The food choices are fantastic -...,5.0,1,I love Toast ! The food choice fantastic - I l...


In [3]:
# Count missing values per column (isna is the canonical spelling of isnull).
print(df.isna().sum())


user_id         0
prod_id         0
review_text     0
rating          0
label           0
cleaned_text    6
dtype: int64


In [3]:
# Reviews whose cleaned text is missing become empty strings, so the text
# pipeline below never receives NaN. Only the cleaned_text column is filled.
df = df.fillna({"cleaned_text": ""})

In [4]:
X, y = df["cleaned_text"], df["label"]

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: carve the validation set out of the remaining 80%.
# 0.125 * 0.80 = 0.10 of the full data, which leaves 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42, stratify=y_train
)

# Report the size of each split (same order as before: train, test, validation).
for _split_label, _split in (
    ("Training set: ", X_train),
    ("Test set: ", X_test),
    ("Validation set: ", X_val),
):
    print(_split_label, len(_split))

Training set:  425920
Test set:  121692
Validation set:  60846


In [6]:
# TF-IDF over unigrams and bigrams. max_features keeps only the 5000 terms
# (single words AND word pairs) with the highest term frequency in the
# training corpus; sublinear_tf replaces raw tf with 1 + log(tf).
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), sublinear_tf=True)

# Fit on the training split only; validation and test are transformed with the
# vocabulary and IDF weights learned from training data.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# joblib.dump does not create missing directories, so make sure the target
# folder exists before saving (no-op when it is already there).
os.makedirs("00_dataset/without_stopwords/tfidfWithNGram", exist_ok=True)

# Save the vectorizer
joblib.dump(vectorizer, "00_dataset/without_stopwords/tfidfWithNGram/vectorizer.joblib")

['00_dataset/without_stopwords/tfidfWithNGram/vectorizer.joblib']

In [5]:
# Restore the fitted vectorizer from disk.
# NOTE: joblib files are pickles - only load files produced by a trusted source.
vectorizer = joblib.load(filename="00_dataset/without_stopwords/tfidfWithNGram/vectorizer.joblib")

# Encode all three splits with the restored vectorizer (train, validation, test).
X_train_tfidf, X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_train, X_val, X_test)
)

In [6]:
# Shared VADER analyzer: created once here and reused by extract_features
# for every review instead of being rebuilt per call.
sia = SentimentIntensityAnalyzer()

def extract_features(text):
    """Compute hand-crafted sentiment and stylistic features for one review.

    Parameters
    ----------
    text : str
        Review text. ``None`` and float NaN (a missing value) are treated as
        the empty string, matching the ``fillna("")`` applied to the dataset.

    Returns
    -------
    list
        Nine values, in the same order as ``features_cols``:
        TextBlob polarity, VADER compound score, word count, number of ``!``,
        number of ``?``, share of uppercase characters, number of word
        occurrences whose word appears more than once, number of characters
        that are keys of ``emoji.EMOJI_DATA``, and mean word length.
        The two ratios are ``0`` for empty input.
    """
    # Missing reviews are scored exactly like empty ones instead of crashing
    # on ``text.split()`` further down.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # Split once and reuse; the original re-split the text for every word,
    # which made the duplicate count quadratic in the review length.
    words = text.split()
    review_length = len(words)  # Word count
    n_chars = len(text)

    sentiment_score = TextBlob(text).sentiment.polarity  # Sentiment (-1 to +1)
    vader_score = sia.polarity_scores(text)["compound"]  # VADER sentiment
    exclamation_count = text.count("!")  # Number of !
    question_count = text.count("?")  # Number of ?
    uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars if n_chars > 0 else 0  # % uppercase

    # Every occurrence of a word seen more than once is counted, so
    # "a a b" contributes 2 (same definition as before, now in linear time).
    word_counts = Counter(words)
    duplicate_word_count = sum(n for n in word_counts.values() if n > 1)

    # Per-character membership test: an emoji built from several code points
    # is counted once per matching code point.
    emoji_count = sum(1 for char in text if char in emoji.EMOJI_DATA)
    avg_word_length = sum(len(word) for word in words) / review_length if review_length > 0 else 0  # Avg word length

    return [sentiment_score, vader_score, review_length, exclamation_count, question_count, uppercase_ratio,
            duplicate_word_count, emoji_count, avg_word_length]

# Column names, in the same order as the list returned by extract_features.
features_cols = [
    "sentiment_score",
    "vader_score",
    "review_length",
    "exclamation_count",
    "question_count",
    "uppercase_ratio",
    "duplicate_word_count",
    "emoji_count",
    "avg_word_length",
]

# One feature row per review for each split (train, then test, then validation).
# Building the frame from a plain list of rows gives it a fresh 0..n-1 index
# rather than the shuffled index of the split it came from.
df_train_features = pd.DataFrame(
    [extract_features(review) for review in X_train], columns=features_cols
)
df_test_features = pd.DataFrame(
    [extract_features(review) for review in X_test], columns=features_cols
)
df_val_features = pd.DataFrame(
    [extract_features(review) for review in X_val], columns=features_cols
)
print("Engineered features extracted.")

Engineered features extracted.


In [7]:
# Densify the sparse TF-IDF matrices.
# NOTE: this is memory hungry - the training matrix alone is
# 425920 rows x 5000 float64 columns, roughly 17 GB once dense.
X_train_tfidf_dense, X_test_tfidf_dense, X_val_tfidf_dense = (
    sparse_matrix.toarray()
    for sparse_matrix in (X_train_tfidf, X_test_tfidf, X_val_tfidf)
)

# Wrap each dense array in a DataFrame (columns are the integer positions 0..4999).
df_train_tfidf, df_test_tfidf, df_val_tfidf = map(
    pd.DataFrame, (X_train_tfidf_dense, X_test_tfidf_dense, X_val_tfidf_dense)
)

# Row counts of the TF-IDF frames and the engineered-feature frames must agree
# before the two are placed side by side.
assert len(df_train_tfidf) == len(df_train_features), "Mismatch in train set sizes!"
assert len(df_test_tfidf) == len(df_test_features), "Mismatch in test set sizes!"
assert len(df_val_tfidf) == len(df_val_features), "Mismatch in validation set sizes!"

In [8]:
# Give every frame and label series a plain 0..n-1 index so that the later
# column-wise concat lines rows up by position. The label series still carry
# the shuffled index produced by train_test_split at this point.
# Done in place on purpose: the dense TF-IDF frames are very large and a
# non-inplace reset may copy them.
for _obj in (
    # training set
    df_train_tfidf, df_train_features, y_train,
    # testing set
    df_test_tfidf, df_test_features, y_test,
    # validation set
    df_val_tfidf, df_val_features, y_val,
):
    _obj.reset_index(drop=True, inplace=True)

In [9]:
# Turn each label series into a one-column DataFrame named "label".
df_y_train = pd.DataFrame(data=y_train, columns=["label"])
df_y_test = pd.DataFrame(data=y_test, columns=["label"])
df_y_val = pd.DataFrame(data=y_val, columns=["label"])

# Label frames must have as many rows as the TF-IDF frames they will join.
assert len(df_train_tfidf) == len(df_y_train), "Mismatch in train labels!"
assert len(df_test_tfidf) == len(df_y_test), "Mismatch in test labels!"
assert len(df_val_tfidf) == len(df_y_val), "Mismatch in validation labels!"

# Index equality checks. Note the pairs are not uniform: train and validation
# compare TF-IDF against the labels, test compares TF-IDF against the
# engineered features.
for _left, _right in (
    (df_train_tfidf, df_y_train),
    (df_test_tfidf, df_test_features),
    (df_val_tfidf, df_y_val),
):
    print(_left.index.equals(_right.index))

True
True
True


In [10]:
# Remaining index equality checks: train and validation compare TF-IDF against
# the engineered features, test compares TF-IDF against the labels.
for _left, _right in (
    (df_train_tfidf, df_train_features),
    (df_test_tfidf, df_y_test),
    (df_val_tfidf, df_val_features),
):
    print(_left.index.equals(_right.index))

True
True
True


In [11]:
# Preview the first rows of the engineered training features.
df_train_features.head()

Unnamed: 0,sentiment_score,vader_score,review_length,exclamation_count,question_count,uppercase_ratio,duplicate_word_count,emoji_count,avg_word_length
0,0.194444,0.9611,73,0,0,0.027708,23,0,4.452055
1,0.187037,0.9422,46,1,0,0.030534,12,0,4.717391
2,-0.147253,0.7906,38,0,0,0.031579,9,0,4.026316
3,0.253842,0.9874,108,8,0,0.076923,41,0,4.185185
4,0.257143,0.8903,53,0,0,0.040134,21,0,4.660377


In [19]:
# Inspect the dtype of every engineered feature column.
df_train_features.dtypes

sentiment_score         float64
vader_score             float64
review_length             int64
exclamation_count         int64
question_count            int64
uppercase_ratio         float64
duplicate_word_count      int64
emoji_count               int64
avg_word_length         float64
dtype: object

In [20]:
# Inspect the dtype of every TF-IDF column in the training frame.
df_train_tfidf.dtypes

0       float64
1       float64
2       float64
3       float64
4       float64
         ...   
4995    float64
4996    float64
4997    float64
4998    float64
4999    float64
Length: 5000, dtype: object

In [21]:
# Inspect the dtype of the training label column.
df_y_train.dtypes

label    int64
dtype: object

In [12]:
# Downcast the TF-IDF frames to float32 to halve their memory footprint.
# Converted one frame at a time so each old float64 frame can be released
# before the next conversion starts.
df_train_tfidf = df_train_tfidf.astype("float32")
df_val_tfidf = df_val_tfidf.astype("float32")
df_test_tfidf = df_test_tfidf.astype("float32")

# Labels are stored as 16-bit integers.
df_y_train = df_y_train.astype("int16")
df_y_val = df_y_val.astype("int16")
df_y_test = df_y_test.astype("int16")

# Engineered features: float64 -> float32 and int64 -> int16.
float_columns = ["sentiment_score", "vader_score", "uppercase_ratio", "avg_word_length"]
int_columns = ["review_length", "exclamation_count", "question_count", "duplicate_word_count", "emoji_count"]

# One column -> dtype mapping applied to each split in a single astype call;
# column order is unchanged.
_feature_dtypes = {
    **dict.fromkeys(float_columns, np.float32),
    **dict.fromkeys(int_columns, np.int16),
}
df_train_features = df_train_features.astype(_feature_dtypes)
df_val_features = df_val_features.astype(_feature_dtypes)
df_test_features = df_test_features.astype(_feature_dtypes)

# Check updated data types
print(df_train_features.dtypes)


sentiment_score         float32
vader_score             float32
review_length             int16
exclamation_count         int16
question_count            int16
uppercase_ratio         float32
duplicate_word_count      int16
emoji_count               int16
avg_word_length         float32
dtype: object


In [13]:
# Inspect the TF-IDF column dtypes again, after the float32 conversion above.
df_train_tfidf.dtypes

0       float32
1       float32
2       float32
3       float32
4       float32
         ...   
4995    float32
4996    float32
4997    float32
4998    float32
4999    float32
Length: 5000, dtype: object

In [14]:
# Place TF-IDF columns, engineered features and the label side by side for
# each split. concat aligns on the row index, which is why all pieces were
# given a matching 0..n-1 index beforehand.
df_train_combined = pd.concat(
    objs=[df_train_tfidf, df_train_features, df_y_train], axis="columns"
)
df_test_combined = pd.concat(
    objs=[df_test_tfidf, df_test_features, df_y_test], axis="columns"
)
df_val_combined = pd.concat(
    objs=[df_val_tfidf, df_val_features, df_y_val], axis="columns"
)

In [17]:
# Persist each combined split (TF-IDF columns + engineered features + label)
# as Parquet. The engine is named explicitly so a missing pyarrow install is
# reported as such rather than falling back to another engine.
# index=False: the frames only carry a plain 0..n-1 index, so it is not stored.
df_train_combined.to_parquet("00_dataset/without_stopwords/tfidfWithNGram/train_features.parquet", engine="pyarrow", index=False)
df_test_combined.to_parquet("00_dataset/without_stopwords/tfidfWithNGram/test_features.parquet", engine="pyarrow", index=False)
df_val_combined.to_parquet("00_dataset/without_stopwords/tfidfWithNGram/val_features.parquet", engine="pyarrow", index=False)

print("Features saved successfully")

Features saved successfully
