# In [None]:
import pandas as pd
import numpy as np
import re
import spacy
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Load dataset
df = pd.read_csv("all_kindle_review.csv")
print("Dataset Loaded. Shape:", df.shape)

# Drop rows missing either the review text or the rating.
# A missing rating compares False against both the ">3" and "<3" thresholds
# used when sentiment labels are derived, so such rows would otherwise be
# silently labelled "neutral" instead of being excluded.
df.dropna(subset=["reviewText", "overall"], inplace=True)

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Return a normalised, lemmatised version of *text*.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted (digits and punctuation alike), and the result is
    run through the module-level spaCy pipeline ``nlp``. The lemmas of all
    tokens not flagged as stop words are joined with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_lemmas = []
    for token in nlp(letters_only):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

# Apply text preprocessing
df["processed_review"] = df["reviewText"].apply(preprocess_text)

# Convert ratings into sentiment labels:
# above 3 -> "positive", below 3 -> "negative", anything else -> "neutral".
df["sentiment"] = np.select(
    [df["overall"] > 3, df["overall"] < 3],
    ["positive", "negative"],
    default="neutral",
)
y = df["sentiment"]

# Split data BEFORE vectorizing. Fitting TF-IDF on the full corpus would let
# the held-out reviews influence the vocabulary and IDF weights (data
# leakage), which makes the test metrics optimistic.
train_text, test_text, y_train, y_test = train_test_split(
    df["processed_review"], y, test_size=0.2, random_state=42
)

# Vectorization using TF-IDF: learn vocabulary/IDF from the training text
# only, then apply that fitted transform unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Full feature matrix, kept under its original name for any later code that
# expects `X` aligned row-for-row with `y`. It contains the test rows, so it
# must not be used to fit anything that is then scored on X_test.
X = vectorizer.transform(df["processed_review"])

# Define models
# Candidate classifiers, keyed by the name shown in the comparison table.
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42, n_estimators=100),
    "Naïve Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
}

# Train and evaluate models
# (table column, key in the report's "weighted avg" entry)
summary_fields = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1-score"),
)

performance = {}
for name, model in models.items():
    # Fit on the training split, then score predictions on the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted = report["weighted avg"]

    scores = {"Accuracy": accuracy}
    scores.update((column, weighted[field]) for column, field in summary_fields)
    performance[name] = scores

# Convert performance metrics to DataFrame and display
# (one row per model, one column per metric)
performance_df = pd.DataFrame(performance).transpose()
print("\nModel Performance Comparison:", performance_df, sep="\n")

# Save the best model (Random Forest for now)
# NOTE: the choice is hard-coded here; it is not derived from performance_df.
best_model = models["Random Forest"]
for path, obj in (
    ("sentiment_model.pkl", best_model),
    ("vectorizer.pkl", vectorizer),
):
    with open(path, "wb") as f:
        pickle.dump(obj, f)

# Confusion Matrix for best model
# One fixed class order is shared by the matrix and both heatmap axes.
class_order = ["positive", "neutral", "negative"]
y_pred_best = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best, labels=class_order)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_order,
    yticklabels=class_order,
)
plt.title("Confusion Matrix - Best Model (Random Forest)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

print("\nSentiment Analysis Complete.")
