# Dataset Download #

In [None]:
# Optional: download the dataset from KaggleHub (uncomment the lines below to run)
# import os
# import kagglehub

# data = kagglehub.dataset_download('kritanjalijain/amazon-reviews')
# dataset_path = os.path.dirname(data)
# print(f"The dataset is saved at: {dataset_path}")

# Import Libraries # 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import time
import ssl
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Notebook display tuning: show long review text and more rows before pandas
# starts truncating the rendered output.
for option_name, option_value in (
    ('display.max_colwidth', 300),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Load Dataset #

In [None]:
# Read the train/test splits from the local data directory.
# NOTE(review): read_csv treats the first row as a header by default, and the
# column names are overwritten wholesale in the next cell. If the CSV files
# have no header row, the first review of each file is silently consumed as
# column names -- confirm against the raw files.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Preview the first rows of each split
for banner, frame in (
    ("----- Train Dataset -----", train_df),
    ("----- Test Dataset -----", test_df),
):
    print(banner)
    display(frame.head())

# Row/column counts of each split
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

# Column names as loaded, followed by dtype/non-null details for the train split
for label, frame in (("Train columns:", train_df), ("Test columns:", test_df)):
    print(label, frame.columns.tolist())
train_df.info()

# Clean Column Names & Check Missing Values #

In [None]:
# Give both splits the same explicit column names
COLUMN_NAMES = ['score', 'summary', 'text']
for frame in (train_df, test_df):
    frame.columns = list(COLUMN_NAMES)

display(train_df.head())

# Count missing cells per column in the training split
missing_per_column = train_df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Last expression of the cell: the notebook renders five random test rows
test_df.sample(5)

# Map Polarity to Sentiment #

In [None]:
# Translate the numeric polarity score into a readable label.
# Scores other than 1 or 2 map to NaN.
POLARITY_TO_SENTIMENT = {1: 'negative', 2: 'positive'}
for frame in (train_df, test_df):
    frame['sentiment'] = frame['score'].map(POLARITY_TO_SENTIMENT)

# Class balance of each split
train_balance = train_df['sentiment'].value_counts()
test_balance = test_df['sentiment'].value_counts()
print("Train sentiment distribution:\n", train_balance)
print("\nTest sentiment distribution:\n", test_balance)

# Text Cleaning Function #

In [None]:
def clean_text(text):
    """Normalize a raw review into lowercase, space-separated ASCII words.

    Steps, in order: lowercase the text, strip URLs that start with ``http``,
    drop every character that is not an ASCII letter or whitespace, collapse
    each whitespace run (spaces, newlines, carriage returns, tabs) into a
    single space, and trim the ends.

    Args:
        text: The raw review. Non-string values are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as an empty review rather than becoming the literal words
            "none" / "nan".

    Returns:
        The cleaned string; empty when nothing but non-letters was present.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    # Punctuation and digits are deleted (not replaced by a space), so
    # "don't" becomes "dont"; non-ASCII letters are dropped as well.
    text = re.sub(r'[^a-z\s]', '', text)
    # Every whitespace character, including a lone "\r", separates words.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run the cleaner over the review body of both splits
for frame in (train_df, test_df):
    frame['clean_text'] = frame['text'].map(clean_text)

# Before/after comparison on the first training review
first_raw = train_df['text'].iat[0]
first_clean = train_df['clean_text'].iat[0]
print("Original review:\n", first_raw)
print("\nCleaned review:\n", first_clean)

# Remove Stopwords #

In [None]:
# Optional one-time setup: download the NLTK stopword corpus.
# The commented SSL block swaps in an unverified HTTPS context for the
# download; only enable it if the download fails on certificate errors.
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# import nltk  # required for the download below; not imported at the top of this file
# nltk.download('stopwords')

# English stopword list held as a set for fast membership checks while filtering.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of *text* that is in ``stop_words``.

    The surviving tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

# Filter stopwords out of the already-cleaned text, in place, for both splits
for frame in (train_df, test_df):
    frame['clean_text'] = frame['clean_text'].map(remove_stopwords)

first_filtered = train_df['clean_text'].iat[0]
print("After removing stopwords:\n", first_filtered)

# Sentiment Distribution Visualization #

In [None]:
# Per-class review counts as a two-column frame: sentiment, count
sentiment_counts = (
    train_df['sentiment']
    .value_counts()
    .reset_index()
    .set_axis(['sentiment', 'count'], axis=1)
)

color_map = {'positive': 'green', 'negative': 'red'}

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=sentiment_counts, x='sentiment', y='count', ax=ax)

# Recolor each bar according to the class it represents
for patch, label in zip(ax.patches, sentiment_counts['sentiment']):
    patch.set_color(color_map[label])

# Write the raw count just above each bar
for position, count in enumerate(sentiment_counts['count']):
    ax.text(position, count + 100, count, ha='center', fontsize=10)

ax.set_title("Train Sentiment Distribution", fontsize=14)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()

# Review Length Distribution #

In [None]:
# Word count of each cleaned review
train_df['review_len'] = train_df['clean_text'].map(lambda review: len(review.split()))

# Cap very long reviews at 200 words so the tail does not stretch the x-axis
review_len_clipped = train_df['review_len'].clip(upper=200)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(review_len_clipped, bins=50, color='skyblue', edgecolor='black', linewidth=1)
ax.set_title("Review Length Distribution (words, clipped at 200)", fontsize=14)
ax.set_xlabel("Review Length (words)", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Top Words by Sentiment #

In [None]:
# Cleaned reviews of each class
neg_text = train_df.loc[train_df['sentiment'] == 'negative', 'clean_text']
pos_text = train_df.loc[train_df['sentiment'] == 'positive', 'clean_text']

# Word frequencies per class, accumulated one review at a time
neg_counter = Counter()
for review in neg_text:
    neg_counter.update(review.split())

pos_counter = Counter()
for review in pos_text:
    pos_counter.update(review.split())

neg_top10 = pd.DataFrame(neg_counter.most_common(10), columns=['word', 'count'])
pos_top10 = pd.DataFrame(pos_counter.most_common(10), columns=['word', 'count'])

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One horizontal bar chart per class: (axis, data, palette, title, y-axis label)
panels = (
    (axes[0], neg_top10, "Set1", "Top 10 Negative Words", "Word"),
    (axes[1], pos_top10, "Set2", "Top 10 Positive Words", ""),
)
for axis, top_words, palette_name, panel_title, y_label in panels:
    sns.barplot(
        data=top_words,
        x='count',
        y='word',
        hue='word',
        palette=sns.color_palette(palette_name, 10),
        legend=False,
        ax=axis,
    )
    axis.set_title(panel_title)
    axis.set_xlabel("Count")
    axis.set_ylabel(y_label)

fig.suptitle("Top 10 Words by Sentiment", fontsize=15)
# Leave the top 5% of the figure free for the suptitle
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Custom Stopwords & Final Cleaned Text #

In [None]:
# Extra words filtered out on top of the NLTK English stopword list
custom_stopwords = [
    'one', 'would', 'get', 'like', 'also', 'us',
    'book', 'time', 'even', 'movie', 'good', 'read',
]
# New set combining both lists; stop_words itself is left unchanged
stop_words_extended = stop_words | set(custom_stopwords)

def remove_custom_stopwords(text):
    """Return *text* without any token listed in ``stop_words_extended``.

    Tokens are split on whitespace, kept in their original order, and
    re-joined with single spaces.
    """
    kept_words = filter(lambda word: word not in stop_words_extended, text.split())
    return " ".join(kept_words)

# Final text column: cleaned text with the extended stopword list removed
for frame in (train_df, test_df):
    frame['clean_text_final'] = frame['clean_text'].map(remove_custom_stopwords)

# Concatenate every review of a class into one string, then count its words
negative_rows = train_df['sentiment'] == 'negative'
positive_rows = train_df['sentiment'] == 'positive'
neg_text_final = " ".join(train_df.loc[negative_rows, 'clean_text_final'])
pos_text_final = " ".join(train_df.loc[positive_rows, 'clean_text_final'])

neg_counter_final = Counter(neg_text_final.split())
pos_counter_final = Counter(pos_text_final.split())

top_negative = neg_counter_final.most_common(10)
top_positive = pos_counter_final.most_common(10)
print("Top 10 negative words (custom stopwords removed):", top_negative)
print("Top 10 positive words (custom stopwords removed):", top_positive)

# Review Length Distribution (After Stopwords Removal) #

In [None]:
# Word count of each review after the extended stopword removal
train_df['review_len_final'] = train_df['clean_text_final'].map(lambda review: len(review.split()))

# Cap at 100 words so the long tail does not dominate the histogram
review_len_clipped = train_df['review_len_final'].clip(upper=100)

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(review_len_clipped, bins=50, color='blue', edgecolor='black', linewidth=1)
ax.set_title("Review Length Distribution (words, after stopwords removal)", fontsize=14)
ax.set_xlabel("Number of words", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Prepare TF-IDF and Subsets #

In [None]:
# Work on at most 100k rows per split (fewer if a split is smaller)
TRAIN_SIZE = min(100000, train_df.shape[0])
TEST_SIZE = min(100000, test_df.shape[0])

# Fixed seed so the same rows are drawn on every run
train_subset = train_df.sample(n=TRAIN_SIZE, random_state=42)
test_subset = test_df.sample(n=TEST_SIZE, random_state=42)

# Unigram + bigram TF-IDF; the vocabulary is learned from the training subset only
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)
X_train_sub = tfidf.fit_transform(train_subset['clean_text_final'])
X_test_sub = tfidf.transform(test_subset['clean_text_final'])

# Binary targets: negative -> 0, positive -> 1
SENTIMENT_TO_LABEL = {'negative': 0, 'positive': 1}
y_train_sub = train_subset['sentiment'].map(SENTIMENT_TO_LABEL).to_numpy()
y_test_sub = test_subset['sentiment'].map(SENTIMENT_TO_LABEL).to_numpy()

for label, matrix in (("X_train_sub shape:", X_train_sub), ("X_test_sub shape:", X_test_sub)):
    print(label, matrix.shape)

# Define Evaluation Function #

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model*, predict on the test features and score the predictions.

    Training and prediction are timed separately, so "Prediction Time (s)"
    covers only the ``predict`` call and no longer includes fitting time;
    the fitting time is reported under "Training Time (s)".

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_test: Features passed to ``model.predict``.
        y_test: Ground-truth targets the predictions are scored against.

    Returns:
        A ``(y_pred, metrics)`` tuple: the predictions for ``X_test`` and a
        dict with "Accuracy", "Precision", "Recall", "F1-Score" (computed
        with scikit-learn's default arguments), "Training Time (s)" and
        "Prediction Time (s)".
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time, unlike the wall clock returned by time.time().
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_started

    predict_started = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - predict_started

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "Training Time (s)": train_time,
        "Prediction Time (s)": pred_time,
    }
    return y_pred, metrics

# Train & Evaluate Models #

In [None]:
# Candidate classifiers; each one is fitted on the same TF-IDF features
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=42),
}

# Fit and score every candidate, keyed by display name
results = {}
for model_name, estimator in models.items():
    print(f"Training {model_name} ...")
    y_pred, metrics = evaluate_model(estimator, X_train_sub, y_train_sub, X_test_sub, y_test_sub)
    results[model_name] = metrics

# Winner = highest F1 on the test subset; on a tie the earliest entry in
# `models` wins because max() returns the first maximal item.
# NOTE(review): the model is selected on the same subset it is evaluated on,
# so the reported score is optimistic -- consider a separate validation split.
best_model_name = max(results, key=lambda candidate: results[candidate]["F1-Score"])
best_score = results[best_model_name]["F1-Score"]
best_model = models[best_model_name]

# One row per model, best F1 first
results_df = pd.DataFrame(results).T.sort_values(by="F1-Score", ascending=False)
print("\nAll Model Metrics & Time Comparison on Subset:")
print(results_df)
print(f"\nBest Model: {best_model_name} (F1-Score = {best_score:.4f})")

# Visualize Model Performance #

In [None]:
# Side-by-side bar charts comparing the models on accuracy and F1.
sns.set_style("whitegrid")
metrics_to_plot = ["Accuracy", "F1-Score"]
# Work on a copy so the extra "Model" column does not leak into results_df.
plot_df = results_df.copy()
plot_df["Model"] = plot_df.index
# One viridis color per model row.
colors = sns.color_palette("viridis", len(plot_df))

plt.figure(figsize=(12, 6))
# Subplot indices are 1-based, hence enumerate(..., 1).
for i, metric in enumerate(metrics_to_plot, 1):
    plt.subplot(1, len(metrics_to_plot), i)
    # hue duplicates x so that each bar gets its own palette color; the legend is suppressed.
    sns.barplot(x=plot_df["Model"], y=plot_df[metric], hue=plot_df["Model"], palette=colors, legend=False, edgecolor='black', linewidth=0.8)
    # Annotate each bar with its value, placed 0.01 above the bar top.
    for idx, val in enumerate(plot_df[metric]):
        plt.text(idx, val + 0.01, f"{val:.4f}", ha='center', fontsize=10, fontweight='bold')
    plt.title(metric, fontsize=14, fontweight="bold")
    plt.xlabel("Model")
    plt.ylabel(metric)
    plt.xticks(rotation=45)
    # Upper limit is only 0.02 above the tallest bar, which leaves little room for its label.
    plt.ylim(0, plot_df[metric].max() + 0.02)
    plt.grid(axis='y', linestyle='-', alpha=0.4)
# Reserve the top 5% of the figure for the suptitle added on the next line.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle("Model Performance Comparison", fontsize=16, fontweight="bold")
plt.show()

# Save Best Model and Vectorizer #

In [None]:
# Persist the winning classifier together with the vectorizer it was trained on
for artifact, filename in (
    (best_model, "best_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

print(f"Best model '{best_model_name}' saved successfully!")
print("TF-IDF vectorizer saved successfully!")

# Load Model & Vectorizer #

In [None]:
# Restore the saved artifacts from the working directory.
# joblib files are pickle-based: only load files you created or trust.
model, vectorizer = (
    joblib.load(filename)
    for filename in ("best_model.pkl", "tfidf_vectorizer.pkl")
)
print("Model and vectorizer loaded successfully!\n")

# Create Unseen Data and Predict #

In [None]:
# Hand-written reviews used as a small sanity-check set
positive_reviews = [
    "I love this product, it works great!",
    "Amazing quality and super fast delivery.",
    "Highly recommend, I'm very satisfied.",
    "Best purchase ever! Worth every penny.",
    "Excellent service and friendly staff.",
    "I'm so happy with my order, thank you!",
    "The design is beautiful and feels premium.",
    "Five stars! Would definitely buy again.",
    "Great experience overall, no issues at all.",
    "Perfect size, color, and quality!",
    "Really good, it exceeded my expectations.",
    "Totally worth it, great value for money.",
    "Everything arrived on time, thank you!",
    "Love it so much, very comfortable to use.",
    "Fast delivery and great customer support.",
    "Exactly what I needed, works perfectly.",
    "Very nice product, looks amazing.",
    "Satisfied with my purchase, thank you!",
    "Super easy to use and setup.",
    "Absolutely fantastic experience!",
]
negative_reviews = [
    "Terrible quality, broke after one use.",
    "Waste of money, very disappointed.",
    "Worst purchase I've made online.",
    "Item arrived damaged and dirty.",
    "Customer support was not helpful at all.",
    "Didn't work as described, useless.",
    "The color was different from the pictures.",
    "Late delivery and poor packaging.",
    "I don't recommend this to anyone.",
    "Not worth the price, too expensive.",
    "Bad quality and weird smell.",
    "Completely stopped working after two days.",
    "Cheap material, looks nothing like the image.",
    "Disappointed, expected much better.",
    "Wouldn't buy again, not satisfied.",
    "Horrible experience, waste of time.",
    "Very slow delivery, bad service.",
    "Feels cheap and fragile.",
    "Item was missing parts.",
    "Worst experience ever!",
]

# Positives first, then negatives; labels use 1 = positive, 0 = negative
data = {
    "review": positive_reviews + negative_reviews,
    "label": [1] * len(positive_reviews) + [0] * len(negative_reviews),
}

df_unseen = pd.DataFrame(data)
# A bare expression in the middle of a cell is not rendered by the notebook,
# so the sample is passed to display() explicitly.
display(df_unseen.sample(6))

# The vectorizer was fitted on text that went through clean_text,
# remove_stopwords and remove_custom_stopwords. Apply the same pipeline here
# so the unseen reviews produce the tokens and bigrams the model was trained
# on (e.g. "don't" -> "dont", stopwords removed before bigrams are formed).
unseen_clean = (
    df_unseen['review']
    .apply(clean_text)
    .apply(remove_stopwords)
    .apply(remove_custom_stopwords)
)

X_unseen = vectorizer.transform(unseen_clean)
y_true = df_unseen['label']

y_pred = model.predict(X_unseen)

# Evaluate Unseen Data #

In [None]:
# Headline metrics on the hand-written reviews. The section-header emoji had
# been corrupted into mojibake (UTF-8 bytes decoded with the wrong codec) and
# are restored here.
print("📊 Model Performance on Unseen Dataset:")
print(f"Accuracy:  {accuracy_score(y_true, y_pred):.2f}")
print(f"Precision: {precision_score(y_true, y_pred):.2f}")
print(f"Recall:    {recall_score(y_true, y_pred):.2f}")
print(f"F1-Score:  {f1_score(y_true, y_pred):.2f}\n")

# Per-class breakdown; target_names follow label order 0, 1
print("📋 Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Negative", "Positive"]))

# Show Example Predictions #

In [None]:
# Show the first few reviews next to their predicted and actual sentiment.
# The header emoji had been corrupted into mojibake and is restored here.
print("\n🔍 Example Predictions:")

# Pair values up by position rather than by index label, and slice instead of
# using a fixed range so fewer than five rows cannot raise.
examples = list(zip(df_unseen['review'], y_pred, y_true))[:5]
for review, predicted, actual in examples:
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {'Positive' if predicted == 1 else 'Negative'} | Actual: {'Positive' if actual == 1 else 'Negative'}\n")