**1️⃣ Project Setup**

In [None]:
# Install necessary libraries
!pip install pandas numpy scikit-learn xgboost tensorflow transformers torch sentence-transformers



In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

**2️⃣ Download the Dataset from Kaggle**

In [1]:
from google.colab import files

# 1. Upload kaggle.json (the Kaggle API credentials file) through the Colab file picker
files.upload()

# 2. Copy kaggle.json into ~/.kaggle and restrict it to owner read/write (mode 600)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 3. Download the Amazon Reviews dataset (bittlingmayer/amazonreviews) from Kaggle
!kaggle datasets download -d bittlingmayer/amazonreviews

# 4. Unzip the archive into the amazonreviews/ folder
!unzip amazonreviews.zip -d amazonreviews

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 94% 463M/493M [00:00<00:00, 394MB/s]
100% 493M/493M [00:01<00:00, 472MB/s]
Archive:  amazonreviews.zip
  inflating: amazonreviews/test.ft.txt.bz2  
  inflating: amazonreviews/train.ft.txt.bz2  


**1️⃣ Load the Dataset**

In [3]:
import bz2
import pandas as pd

def load_bz2_fasttext(filepath):
    """Load a bz2-compressed fastText-style file into a DataFrame.

    Every non-blank line is expected to look like ``__label__<N> <text>``.
    The numeric label is shifted down by one, so ``__label__1`` becomes 0
    (negative) and ``__label__2`` becomes 1 (positive).

    Args:
        filepath: Path to the ``.bz2`` file; it is read as UTF-8 text.

    Returns:
        pandas.DataFrame with columns ``review`` (text) and ``sentiment`` (int),
        one row per non-blank input line, in file order.

    Raises:
        ValueError: If a line's label token is not ``__label__`` followed by
            an integer.
    """
    data = []
    with bz2.open(filepath, 'rt', encoding='utf-8') as f:  # 'rt' = read text
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            # partition() instead of split()+unpack: a line that carries only a
            # label (no review text) yields an empty review rather than raising
            # "not enough values to unpack" and aborting the whole load.
            label, _, text = line.partition(' ')
            label = int(label.replace('__label__', '')) - 1  # 0=negative, 1=positive
            data.append((text, label))
    return pd.DataFrame(data, columns=['review', 'sentiment'])

# Parse both compressed splits (train first, then test) into DataFrames
train_df, test_df = (
    load_bz2_fasttext(split_path)
    for split_path in (
        'amazonreviews/train.ft.txt.bz2',
        'amazonreviews/test.ft.txt.bz2',
    )
)

# Report the sizes and preview the first training rows
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
train_df.head()

Training set shape: (3600000, 2)
Test set shape: (400000, 2)


Unnamed: 0,review,sentiment
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


**2️⃣ Preprocess the Reviews**

In [4]:
import re
import string

# Built once instead of on every call: preprocess_text runs once per review,
# and the punctuation table and digit pattern never change.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')


def preprocess_text(text):
    """Normalize a review string for bag-of-words vectorization.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, delete runs of digits, then trim leading and
    trailing whitespace. Whitespace *inside* the text is left untouched, so
    removing a token such as a number can leave a double space behind.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string.
    """
    text = text.lower()  # lowercase
    text = text.translate(_PUNCT_TABLE)  # remove punctuation
    text = _DIGIT_RUN.sub('', text)  # remove digits
    return text.strip()  # trim leading/trailing whitespace only

# Add a cleaned copy of every review next to the raw text, for both splits
train_df['clean_review'] = train_df['review'].map(preprocess_text)
test_df['clean_review'] = test_df['review'].map(preprocess_text)

**3️⃣ TF-IDF Feature Engineering**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets come straight from the parsed sentiment column
y_train, y_test = train_df['sentiment'], test_df['sentiment']

# Fit the TF-IDF vectorizer on the training reviews only (capped at 10,000 features),
# then reuse the fitted vectorizer to transform the test reviews
tfidf = TfidfVectorizer(max_features=10000)
X_train = tfidf.fit_transform(train_df['clean_review'])
X_test = tfidf.transform(test_df['clean_review'])

**4️⃣ Train Machine Learning Models**

A. Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train on the TF-IDF features; the iteration cap is raised to 500 for the large dataset
lr_model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Score the held-out test split
y_pred = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.9053875
              precision    recall  f1-score   support

           0       0.91      0.90      0.91    200000
           1       0.90      0.91      0.91    200000

    accuracy                           0.91    400000
   macro avg       0.91      0.91      0.91    400000
weighted avg       0.91      0.91      0.91    400000



B. XGBoost (Optional, more powerful)

In [1]:
from xgboost import XGBClassifier

# NOTE: this cell needs X_train / y_train / X_test / y_test from the TF-IDF cell and
# accuracy_score / classification_report from the imports cell. Run those first;
# on a fresh kernel it fails with "NameError: name 'X_train' is not defined".

# `use_label_encoder` is deprecated and ignored by current XGBoost releases (it only
# triggers a warning), and the labels here are already 0/1 integers, so it is omitted.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

NameError: name 'X_train' is not defined

**5️⃣ Save the Best Model (For Deployment)**

In [None]:
import joblib

# Save TF-IDF vectorizer (needed at inference time to turn text into features)
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Save Logistic Regression model
joblib.dump(lr_model, 'sentiment_lr_model.pkl')

# Optional: Save XGBoost model.
# The XGBoost section is optional, so only export it if that cell was actually run;
# otherwise this cell would stop with a NameError after the two dumps above.
if 'xgb_model' in globals():
    joblib.dump(xgb_model, 'sentiment_xgb_model.pkl')
else:
    print("xgb_model is not defined; skipping XGBoost export.")

**6️⃣ Optional: Embedding + Logistic Regression**

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding encoder
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the cleaned reviews of each split (train first, then test)
X_train_emb, X_test_emb = (
    model.encode(frame['clean_review'].tolist(), convert_to_tensor=False)
    for frame in (train_df, test_df)
)

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the embeddings and score the test split
lr_emb = LogisticRegression(max_iter=500).fit(X_train_emb, y_train)
y_pred_emb = lr_emb.predict(X_test_emb)
print("Embedding + LR Accuracy:", accuracy_score(y_test, y_pred_emb))

**7️⃣ Next Step: Deployment (Streamlit Web App)**

Build a Streamlit app where user inputs a review → outputs sentiment (Positive/Negative).

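Use the saved model (sentiment_lr_model.pkl) and TF-IDF vectorizer (tfidf_vectorizer.pkl) in the app. Apply the same `preprocess_text` cleaning to the user's input before calling the vectorizer, since the vectorizer was fitted on cleaned reviews.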

Deploy on Streamlit Cloud → get a public link for your resume.