In [None]:
# Install dependencies.
# `datasets`, `tensorflow`, `seaborn` and `matplotlib` are imported by later
# cells, so they are listed here as well. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install pandas scikit-learn nltk torch transformers kaggle datasets tensorflow seaborn matplotlib


In [None]:
from google.colab import files

# Select kaggle.json from your computer when prompted.
# The result is bound to a name instead of being left as the cell's last
# expression: the returned mapping is keyed by file name and carries the file
# contents, so echoing it would write the Kaggle API key into the saved
# notebook output. Only the file names are printed.
uploaded = files.upload()
print("Uploaded:", list(uploaded))


In [None]:
# Create the Kaggle config directory in the home folder (-p: no error if it already exists)
!mkdir -p ~/.kaggle
# Move the uploaded credentials file from the working directory into that folder
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Sanity check: list the config directory to confirm kaggle.json was moved there
!ls ~/.kaggle



In [None]:
# Search Kaggle for datasets matching the phrase "amazon customer reviews"
!kaggle datasets list -s "amazon customer reviews"


In [None]:
# Download smaller dataset (~4 MB) into ./data
!kaggle datasets download -d thedevastator/amazon-customer-reviews-with-2013-2019-sentiment -p ./data

# Unzip it. -o overwrites existing files without asking, so re-running this
# cell does not stop at unzip's interactive "replace?" prompt.
!unzip -o ./data/amazon-customer-reviews-with-2013-2019-sentiment.zip -d ./data


In [None]:
import pandas as pd

# Load the review CSV from ./data (the directory the archive was unzipped into)
# as a DataFrame
df = pd.read_csv("./data/Amazon Review Data Web Scrapping - Amazon Review Data Web Scrapping.csv")

# Print the (rows, columns) shape, then show the first rows as the cell output
print("Dataset shape:", df.shape)
df.head()


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by clean_text below:
# the stop-word lists (for stopwords.words) ...
nltk.download("stopwords")
# ... and WordNet (for WordNetLemmatizer)
nltk.download("wordnet")

# Filled on the first clean_text call so the stop-word set and the lemmatizer
# are built once instead of once per token / once per review.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalise one raw review into a space-joined string of lemmatized tokens.

    Steps: strip HTML tags, replace every non-letter with a space, lowercase,
    split on whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as a missing review.

    Returns:
        The cleaned review as a single string. Empty string for missing input
        or when no token survives the filtering.
    """
    # A missing review would otherwise go through str() and come out as the
    # literal token "nan", which then pollutes the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if not _CLEAN_TEXT_RESOURCES:
        # A set gives O(1) membership tests; the original rebuilt the full
        # stop-word list for every single token.
        _CLEAN_TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    stop_words = _CLEAN_TEXT_RESOURCES["stop_words"]
    lemmatizer = _CLEAN_TEXT_RESOURCES["lemmatizer"]

    # Replace tags with a space (not ""), so "good<br/>bad" stays two words.
    text = re.sub(r"<.*?>", " ", str(text))   # remove HTML
    text = re.sub(r"[^a-zA-Z]", " ", text)    # keep only letters
    tokens = text.lower().split()
    return " ".join(lemmatizer.lemmatize(w) for w in tokens if w not in stop_words)

# Run every raw review through clean_text and keep the result as a new column
df["cleaned_review"] = df["Review_text"].apply(clean_text)

# Keep only the model input and its label; the label column is exposed as "sentiment"
df_final = df.loc[:, ["cleaned_review", "Own_Rating"]].rename(
    columns={"Own_Rating": "sentiment"}
)

# Show how many reviews fall into each label
label_counts = df_final["sentiment"].value_counts()
print(label_counts)

df_final.head()


In [None]:
from sklearn.model_selection import train_test_split

X, y = df_final["cleaned_review"], df_final["sentiment"]

# First cut: 70 % for training, 30 % held out, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Second cut: split the held-out part in half into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

for split_label, split_data in (
    ("Train size:", X_train),
    ("Validation size:", X_valid),
    ("Test size:", X_test),
):
    print(split_label, len(split_data))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Text -> TF-IDF features: the vocabulary (capped at 5000 terms) is fitted on
# the training split only and then applied to the validation and test splits
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(split) for split in (X_valid, X_test)
)

# Fit the logistic regression baseline on the training features
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_tfidf, y_train)

# Score the baseline on the validation split
y_pred = model_lr.predict(X_valid_tfidf)
validation_report = classification_report(y_valid, y_pred)
print("Validation Performance:\n", validation_report)


In [None]:
# Score the logistic regression baseline on the held-out test split
y_test_pred = model_lr.predict(X_test_tfidf)
test_report = classification_report(y_test, y_test_pred)
print("Test Performance:\n", test_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Random Forest: 200 trees, balanced class weights, fixed seed
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)
rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_valid_tfidf)
rf_report = classification_report(y_valid, rf_pred)
print("Random Forest Validation Performance:\n", rf_report)

# Linear SVM: balanced class weights, fixed seed
svm_model = LinearSVC(
    class_weight="balanced",
    random_state=42,
)
svm_model.fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_valid_tfidf)
svm_report = classification_report(y_valid, svm_pred)
print("SVM Validation Performance:\n", svm_report)


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable, in line with the random_state=42
# used by the splits and the scikit-learn models.
tf.keras.utils.set_random_seed(42)

# Shared by the tokenizer/padding and the Embedding layer; keeping them in one
# place prevents the vocabulary size or sequence length from drifting apart.
VOCAB_SIZE = 10000
MAX_LEN = 100

# Encode labels as integer class ids (fitted on the training labels only)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_valid_enc = le.transform(y_valid)
y_test_enc  = le.transform(y_test)

# Tokenize text: vocabulary is learned from the training split; words outside
# it map to the "<OOV>" token
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert each split to integer sequences padded/truncated to MAX_LEN
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(X_valid), maxlen=MAX_LEN)
X_test_seq  = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# Build LSTM model. The deprecated `input_length` argument is omitted; the
# sequence length is taken from the padded input when the model is first run.
model_lstm = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation="relu"),
    Dropout(0.5),
    # One softmax output per class seen by the label encoder
    Dense(len(le.classes_), activation="softmax")
])

# Sparse loss because the targets are integer class ids, not one-hot vectors
model_lstm.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train
history = model_lstm.fit(X_train_seq, y_train_enc,
                         validation_data=(X_valid_seq, y_valid_enc),
                         epochs=3, batch_size=64)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Class probabilities for every test sequence, reduced to the most likely class id
y_test_pred_probs = model_lstm.predict(X_test_seq)
y_test_pred = y_test_pred_probs.argmax(axis=1)

# Turn encoded ids (true and predicted) back into the original label values
y_test_labels = le.inverse_transform(y_test_enc)
y_pred_labels = le.inverse_transform(y_test_pred)

# Per-class precision / recall / F1
lstm_report = classification_report(y_test_labels, y_pred_labels)
print("LSTM Test Performance:\n", lstm_report)

# Confusion matrix, rows and columns ordered like the label encoder's classes
class_names = le.classes_
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)
heatmap_style = dict(annot=True, fmt="d", cmap="Blues")
sns.heatmap(cm, xticklabels=class_names, yticklabels=class_names, **heatmap_style)
plt.title("LSTM Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


In [None]:
import os
# Set the WANDB_DISABLED environment variable before the Trainer is created
# (presumably so no Weights & Biases run is started; the Trainer is also
# configured with report_to="none").
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset
import os

# Opt out of Hugging Face Hub telemetry. This variable is unrelated to W&B;
# W&B reporting is turned off separately via WANDB_DISABLED and report_to="none".
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Sample up to 10k rows for Colab speed. The size is capped at the number of
# available rows because DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling is without replacement).
df_small = df_final.sample(min(10000, len(df_final)), random_state=42)

# Map sentiment strings to the integer class ids expected by the model
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
df_small["label"] = df_small["sentiment"].map(label_map)

# Any sentiment value missing from label_map becomes NaN, which would only
# surface later as an obscure error inside training. Fail here instead.
unmapped = df_small.loc[df_small["label"].isna(), "sentiment"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Sentiment values without a label mapping: {unmapped.tolist()}")

# Tokenize
# NOTE: this rebinds `tokenizer`, which the LSTM cell bound to a Keras
# Tokenizer; re-running the LSTM cells after this one requires re-running
# the LSTM tokenizer cell first.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Encode the ``cleaned_review`` entries of a batch with the module-level tokenizer.

    Every review is padded to, and truncated at, 128 tokens. Returns whatever
    the tokenizer call returns for the batch.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    reviews = batch["cleaned_review"]
    return tokenizer(reviews, **encode_options)

# Wrap the sampled frame in a Hugging Face Dataset and tokenize it batch-wise
dataset = HFDataset.from_pandas(df_small).map(tokenize, batched=True)
# 80 % train / 20 % evaluation, with a fixed seed for a repeatable split
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Model: pretrained BERT with a 3-way classification head
# (one output per label id 0, 1, 2 assigned above)
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Training configuration: evaluate once per epoch, log every 50 steps,
# 2 epochs at learning rate 2e-5 with batch size 16 per device
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    report_to="none"   # no external experiment trackers (e.g. W&B)
)

# Trainer ties together the model, the configuration and the two dataset splits
trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],

)

# Fine-tune the model
trainer.train()


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions on the held-out 20 % split: logits -> most likely class id
preds = trainer.predict(train_test["test"])
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

# Class names in label-id order (0, 1, 2). The ids are passed explicitly as
# `labels=` below: without them scikit-learn only uses the classes that occur
# in y_true/y_pred, so a missing class would make classification_report raise
# on the 3 target names and would shrink the confusion matrix so that the
# tick labels no longer match its rows and columns.
class_names = ["Negative", "Neutral", "Positive"]
class_ids = list(range(len(class_names)))

# Classification report
print("BERT Test Performance:\n", classification_report(
    y_true, y_pred, labels=class_ids, target_names=class_names)
)

# Confusion matrix (always 3x3, rows = true class, columns = predicted class)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("BERT Confusion Matrix")
plt.show()
