In [17]:
#Muhammad Hafiz Bin Kamaruzaman SW01081229
#Kamal Adeem Bin Kamaruddin IS01081937

# Data Preprocessing
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load data, keep the review text and star rating, and derive sentiment labels.
# Built as one non-mutating chain: the original called dropna(inplace=True) and
# assigned a new column on frames obtained by slicing/filtering, which triggers
# pandas' chained-assignment (SettingWithCopy) warning on pre-Copy-on-Write versions.
df = (
    pd.read_csv('Reviews.csv')[['Text', 'Score']]  # keep only relevant columns
    .dropna()                                      # drop rows missing text or score
    .loc[lambda frame: frame['Score'] != 3]        # remove neutral reviews (Score == 3)
    .assign(
        # Scores above 3 are labelled 'positive'; the remaining scores 'negative'.
        Sentiment=lambda frame: np.where(frame['Score'] > 3, 'positive', 'negative')
    )
)

# Clean text
def clean_text(text):
    """Normalize a raw review string for bag-of-words style vectorization.

    Steps, applied in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as ``[...]``;
      3. remove every token that contains a digit;
      4. remove every character that is neither a word character nor whitespace;
      5. replace line breaks with a single space.

    Args:
        text: The review text as a ``str``.

    Returns:
        The cleaned string. Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Substitute a space rather than the empty string: deleting the line break
    # would glue the last word of one line to the first word of the next
    # ("good\nbad" -> "goodbad") and create bogus vocabulary entries.
    text = re.sub(r'[\r\n]+', ' ', text)
    return text

# Run every review through clean_text, element by element, and store the result alongside the raw text.
df['Cleaned_Text'] = df['Text'].map(clean_text)

In [14]:
# Feature Extraction
from sklearn.model_selection import train_test_split

# Labels
y = df['Sentiment']

# Split first so the vectorizers below are fitted on training text only
# (the test reviews never influence the vocabulary or the IDF weights).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['Cleaned_Text'], y, test_size=0.2, random_state=42)

# Create the vectorizers in this cell. They were previously referenced without
# ever being defined in the notebook, so a fresh-kernel run failed with NameError.
# NOTE(review): default settings are used because the original configuration is
# not recorded anywhere in the notebook -- confirm against the reported results.
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Apply BoW and TF-IDF on split text: fit on train, reuse the fitted state on test
X_train_bow = bow_vectorizer.fit_transform(X_train_text)
X_test_bow = bow_vectorizer.transform(X_test_text)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

In [15]:
# Model Selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naive Bayes (BoW): fit on bag-of-words counts, then predict the held-out reviews.
# fit() returns the estimator itself, so construction and training can be chained.
nb_bow = MultinomialNB().fit(X_train_bow, y_train)
y_pred_nb_bow = nb_bow.predict(X_test_bow)

# SVM (TF-IDF): fit a linear SVM on TF-IDF features, then predict the held-out reviews.
# The solver may use a random number generator, so seed it with the same value
# as the train/test split to keep results repeatable across runs.
svm_tfidf = LinearSVC(random_state=42).fit(X_train_tfidf, y_train)
y_pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)

In [16]:
# Model Evaluation
# Print a per-class precision/recall/F1 report for each model against the same test labels.
for report_title, predictions in (
    ("Naive Bayes (BoW) Performance:", y_pred_nb_bow),
    ("SVM (TF-IDF) Performance:", y_pred_svm_tfidf),
):
    print(report_title)
    print(classification_report(y_test, predictions))
    if predictions is y_pred_nb_bow:
        # Blank line between the two reports, matching the original layout.
        print()

Naive Bayes (BoW) Performance:
              precision    recall  f1-score   support

    negative       0.76      0.72      0.74     16379
    positive       0.95      0.96      0.95     88784

    accuracy                           0.92    105163
   macro avg       0.85      0.84      0.85    105163
weighted avg       0.92      0.92      0.92    105163

SVM (TF-IDF) Performance:
              precision    recall  f1-score   support

    negative       0.87      0.79      0.83     16379
    positive       0.96      0.98      0.97     88784

    accuracy                           0.95    105163
   macro avg       0.92      0.88      0.90    105163
weighted avg       0.95      0.95      0.95    105163



In [None]:
# Discussion

# Naive Bayes (BoW)
# - Pros: Simple and fast.
# - Cons: Assumes features (word counts) are independent of one another, which rarely holds for natural-language text.

# SVM (TF-IDF)
# - Pros: Handles high-dimensional data well.
# - Cons: Slower training; sensitive to parameter tuning.

# Recommendation
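# SVM with TF-IDF is the better choice on this dataset: it reached 0.95 accuracy versus 0.92 for Naive Bayes with BoW,
# and the gap is largest on the minority negative class (F1 0.83 vs 0.74). Linear SVMs are effective with sparse, high-dimensional text features.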