# Load the data file

In [1]:
import pandas as pd

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns cannot end up with mixed types.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# a pandas DtypeWarning does -- confirm on a fresh run that the warning is gone.
df = pd.read_csv('data/1429_1.csv', low_memory=False)

# Basic info about the dataset: dimensions, column names, and a preview
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

Dataset shape: (34660, 21)
Columns: ['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username']

First few rows:
                     id                                               name  \
0  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
1  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
2  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
3  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
4  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   

        asins   brand                                         categories  \
0  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All T

  df = pd.read_csv('data/1429_1.csv')


# Examine the key columns for our tasks

In [6]:
# Overview of the columns that drive the three tasks: sentiment (review text
# and rating), clustering (categories) and product analysis (name, brand).
print("🔍 KEY COLUMNS ANALYSIS")
print("=" * 50)

# --- 1. Sentiment analysis inputs: review text and star rating ---
ratings = df['reviews.rating']
review_texts = df['reviews.text']

print("REVIEW TEXT (first 3 examples):")
for position in range(3):
    # Only the first 100 characters of each review are shown
    print(f"{position + 1}. Rating: {ratings.iloc[position]} - Text: {review_texts.iloc[position][:100]}...")

print("\nRATING DISTRIBUTION:")
print(ratings.value_counts().sort_index())

print("\nRating Statistics:")
print(ratings.describe())

# --- 2. Category clustering input: raw category strings ---
category_column = df['categories']
print("\nCATEGORIES (first 5 examples):")
for position in range(5):
    print(f"{position + 1}. {category_column.iloc[position]}")

print(f"\nUNIQUE CATEGORIES COUNT: {category_column.nunique()}")

# --- 3. Product analysis inputs: product name and brand ---
brand_column = df['brand']
print("\nPRODUCTS:")
print(f"Unique products: {df['name'].nunique()}")
print(f"Unique brands: {brand_column.nunique()}")
print("Top 5 brands:")
print(brand_column.value_counts().head())

# --- 4. Data quality: missing values in the columns used downstream ---
key_columns = ['reviews.text', 'reviews.rating', 'categories', 'name', 'brand']
missing_per_column = df[key_columns].isnull().sum()
print("\nDATA QUALITY:")
print("Missing values in key columns:")
for col in key_columns:
    missing = missing_per_column[col]
    print(f"  {col}: {missing} missing ({missing/len(df)*100:.1f}%)")

🔍 KEY COLUMNS ANALYSIS
REVIEW TEXT (first 3 examples):
1. Rating: 5.0 - Text: This product so far has not disappointed. My children love to use it and I like the ability to monit...
2. Rating: 5.0 - Text: great for beginner or experienced person. Bought as a gift and she loves it...
3. Rating: 5.0 - Text: Inexpensive tablet for him to use and learn on, step up from the NABI. He was thrilled with it, lear...

RATING DISTRIBUTION:
reviews.rating
1.0      410
2.0      402
3.0     1499
4.0     8541
5.0    23775
Name: count, dtype: int64

Rating Statistics:
count    34627.000000
mean         4.584573
std          0.735653
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: reviews.rating, dtype: float64

CATEGORIES (first 5 examples):
1. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
2. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
3. Electronics,iPad & Tablets,

# Key Insights:

- Sentiment Analysis: Rating distribution is mostly 4-5 stars, with very few missing reviews
- Category Clustering: 41 unique categories to reduce to 4-6 meta-categories
- Products: 48 unique products, dominated by Amazon products
- Data Quality: Very clean dataset, minimal missing values
- Imbalanced classes: very few 1.0 and 2.0 ratings in the dataset (most reviews are rated 5.0)

# Categories Deep Dive

In [5]:
# Categories for clustering task
print("CATEGORY ANALYSIS FOR CLUSTERING")
print("=" * 50)

from collections import Counter

# Flatten every comma-separated category string into individual,
# whitespace-trimmed terms (rows without categories are skipped)
all_categories = [
    term.strip()
    for cat_string in df['categories'].dropna()
    for term in cat_string.split(',')
]
category_counts = Counter(all_categories)

print("Most common category terms:")
for term, occurrences in category_counts.most_common(15):
    print(f"  {term}: {occurrences:,}")

# First ten distinct raw category strings, in order of first appearance
print("\nSAMPLE CATEGORY STRINGS:")
unique_categories = df['categories'].unique()[:10]
for rank, raw_categories in enumerate(unique_categories, start=1):
    print(f"{rank:2d}. {raw_categories}")

# First ten distinct non-missing product names, to see what is being reviewed
print("\nSAMPLE PRODUCT NAMES:")
unique_products = df['name'].dropna().unique()[:10]
for rank, product in enumerate(unique_products, start=1):
    print(f"{rank:2d}. {product}")

# Character count per review for preprocessing planning;
# a missing review text counts as length 0
df['text_length'] = df['reviews.text'].fillna('').str.len()
review_lengths = df['text_length']
print("\nTEXT LENGTH ANALYSIS:")
print(f"Average review length: {review_lengths.mean():.0f} characters")
print(f"Median review length: {review_lengths.median():.0f} characters")
print(f"Max review length: {review_lengths.max():,} characters")

print("\nData exploration complete!")
print("Ready for preprocessing and model building")

CATEGORY ANALYSIS FOR CLUSTERING
Most common category terms:
  Electronics: 42,291
  Computers & Tablets: 21,719
  Tablets: 21,383
  All Tablets: 18,413
  iPad & Tablets: 17,784
  Electronics Features: 16,926
  Fire Tablets: 16,303
  Home: 14,597
  Kindle Store: 12,886
  Amazon Devices: 12,691
  Featured Brands: 12,647
  TVs Entertainment: 11,682
  Holiday Shop: 11,682
  Frys: 11,615
  Tech Toys: 11,608

SAMPLE CATEGORY STRINGS:
 1. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
 2. eBook Readers,Kindle E-readers,Computers & Tablets,E-Readers & Accessories,E-Readers
 3. Electronics,eBook Readers & Accessories,Covers,Kindle Store,Amazon Device Accessories,Kindle E-Reader Accessories,Kindle (5th Generation) Accessories,Kindle (5th Generation) Covers
 4. Kindle Store,Amazon Devices,Electronics
 5. Tablets,Fire Tablets,Electronics,Computers,Computer Components,Hard Drives & Storage,Computers & Tablets,All Tablets
 6. Tablets,Fire Tablets,Computers & Tablets

# Create the meta-categories & start pre-processing
- Create 5 meaningful meta-categories based on your data
- Convert ratings to sentiment labels (negative/neutral/positive)
- Show the distribution of both

In [10]:
# Section banner for the meta-category step (title line, then a 50-char rule)
print("CREATING META-CATEGORIES", "=" * 50, sep="\n")

def categorize_product(category_string, product_name):
    """
    Classify a product into one of five meta-categories.

    The decision is based on the comma-separated category string only.
    ``product_name`` is accepted so existing callers keep working, but it is
    not consulted.

    Rules are tried in order and the first match wins (case-insensitive
    substring search on the category string):

    1. "E-Readers"   - mentions kindle / ebook / e-reader
    2. "Tablets"     - mentions tablet / ipad and no accessory term
    3. "Accessories" - mentions accessory/accessories, cover, cable,
       charger or adapter
    4. "Smart Home & Entertainment" - mentions echo / fire tv /
       entertainment / home
    5. "Other Electronics" - fallback, also used for a missing category

    NOTE(review): because rule 1 runs first, a category string that mentions
    both "kindle" and an accessory term is labelled "E-Readers"; confirm that
    this ordering is intended.

    Parameters
    ----------
    category_string : str or missing (None / NaN)
        Comma-separated category terms for the product.
    product_name : str or missing (None / NaN)
        Product name; currently unused.

    Returns
    -------
    str
        One of the five meta-category labels listed above.
    """
    if pd.isna(category_string):
        category_string = ""

    cat_lower = category_string.lower()

    def mentions(*terms):
        """Return True when any of ``terms`` occurs in the category string."""
        return any(term in cat_lower for term in terms)

    # 'accessor' is the shared stem of "accessory" and "accessories". The
    # previous term 'accessory' is not a substring of "accessories", so the
    # plural form (e.g. "Kindle E-Reader Accessories") was never recognised.
    is_accessory = mentions('accessor')

    # E-readers (Kindle devices)
    if mentions('kindle', 'ebook', 'e-reader'):
        return "E-Readers"

    # Tablets (Fire tablets, iPads), excluding tablet accessories
    if mentions('fire tablet', 'ipad', 'tablet') and not is_accessory:
        return "Tablets"

    # Accessories (covers, chargers, cables)
    if is_accessory or mentions('cover', 'cable', 'charger', 'adapter'):
        return "Accessories"

    # Smart Home/Entertainment (Echo, Fire TV)
    if mentions('echo', 'fire tv', 'entertainment', 'home'):
        return "Smart Home & Entertainment"

    # Default fallback
    return "Other Electronics"

# Label every row with its meta-category (row-wise, from categories + name)
df['meta_category'] = df.apply(
    lambda record: categorize_product(record['categories'], record['name']),
    axis=1,
)

# Share of rows per meta-category, largest first
meta_dist = df['meta_category'].value_counts()
row_total = len(df)
print("META-CATEGORY DISTRIBUTION:")
for category, count in meta_dist.items():
    share = count / row_total * 100
    print(f"  {category}: {count:,} ({share:.1f}%)")

# One example product per meta-category: the first non-missing name,
# or a placeholder when every name in that category is missing
print("\nEXAMPLES BY META-CATEGORY:")
for category in df['meta_category'].unique():
    named_products = df.loc[df['meta_category'] == category, 'name'].dropna()
    sample = named_products.iloc[0] if len(named_products) > 0 else "No product names available"
    print(f"  {category}: {sample}")

# Prepare sentiment labels (convert ratings to sentiment)
def rating_to_sentiment(rating):
    """Convert a 1-5 star rating to a sentiment label.

    Thresholds:
      * rating <= 2       -> "negative"
      * 2 < rating < 4    -> "neutral"
      * rating >= 4       -> "positive"

    Whole-star ratings map exactly as before (1-2 negative, 3 neutral,
    4-5 positive). Fractional ratings between 2 and 4 (e.g. 2.5 or 3.5) are
    now "neutral"; previously anything other than exactly 3 in that range
    fell through to "positive".

    Parameters
    ----------
    rating : int, float or missing (None / NaN)
        Star rating of a review.

    Returns
    -------
    str or None
        The sentiment label, or None when the rating is missing.
    """
    if pd.isna(rating):
        return None
    if rating <= 2:
        return "negative"
    if rating < 4:
        return "neutral"
    return "positive"

# Derive the sentiment label of every review from its star rating
df['sentiment'] = df['reviews.rating'].apply(rating_to_sentiment)

sentiment_dist = df['sentiment'].value_counts()
# Percentages are relative to the rows that actually received a label
labelled_rows = len(df.dropna(subset=['sentiment']))

print("\nSENTIMENT DISTRIBUTION:")
for sentiment, count in sentiment_dist.items():
    print(f"  {sentiment}: {count:,} ({count/labelled_rows*100:.1f}%)")

print("\nMeta-categories and sentiment labels created!")
print("Ready to start building models")

CREATING META-CATEGORIES
META-CATEGORY DISTRIBUTION:
  E-Readers: 18,958 (54.7%)
  Tablets: 15,653 (45.2%)
  Other Electronics: 34 (0.1%)
  Accessories: 8 (0.0%)
  Smart Home & Entertainment: 7 (0.0%)

EXAMPLES BY META-CATEGORY:
  Tablets: All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta
  E-Readers: Kindle Oasis E-reader with Leather Charging Cover - Merlot, 6 High-Resolution Display (300 ppi), Wi-Fi - Includes Special Offers,,
  Other Electronics: Brand New Amazon Kindle Fire 16gb 7 Ips Display Tablet Wifi 16 Gb Blue,,,
  Smart Home & Entertainment: No product names available
  Accessories: No product names available

SENTIMENT DISTRIBUTION:
  positive: 32,316 (93.3%)
  neutral: 1,499 (4.3%)
  negative: 812 (2.3%)

Meta-categories and sentiment labels created!
Ready to start building models


# The distributions appear to be imbalanced:
- Meta-category: Mostly E-Readers and Tablets (99.9%)
- Heavily biased: 93.3% are positive reviews

## Review the imbalances in more depth

In [14]:
# Create the processed directory first
import os
os.makedirs('data/processed', exist_ok=True)

# NOTE(review): `df_clean` is not created in any cell visible above this one.
# Presumably a cleaning cell between In [10] and In [14] builds it from `df`;
# confirm that cell is still in the notebook, otherwise a fresh
# "Restart & Run All" fails here with a NameError.

# Analyze the imbalance problem
print("ANALYZING CLASS IMBALANCE")
print("="*50)

# Current distribution of sentiment labels
print("CURRENT SENTIMENT DISTRIBUTION:")
sentiment_counts = df_clean['sentiment'].value_counts()
total = len(df_clean)
for sentiment, count in sentiment_counts.items():
    print(f"  {sentiment}: {count:,} ({count/total*100:.1f}%)")

# Check if imbalance varies by category (rows: meta-category, columns: sentiment)
print("\nSENTIMENT BY META-CATEGORY:")
sentiment_by_category = df_clean.groupby(['meta_category', 'sentiment']).size().unstack(fill_value=0)
print(sentiment_by_category)

# Look at the rating distribution in more detail
print("\nDETAILED RATING DISTRIBUTION:")
rating_counts = df_clean['reviews.rating'].value_counts().sort_index()
for rating, count in rating_counts.items():
    sentiment = rating_to_sentiment(rating)
    print(f"  {rating} stars -> {sentiment}: {count:,} ({count/total*100:.1f}%)")

# Options for handling imbalance
print("\nOPTIONS FOR HANDLING IMBALANCE:")
print("1. Use stratified sampling to balance classes")
print("2. Use class weights in the model")
print("3. Focus on binary classification (positive vs negative+neutral)")
print("4. Use different evaluation metrics (precision, recall, F1)")

# Let's create a more balanced sample for initial testing
print("\nCREATING BALANCED SAMPLE FOR TESTING:")

# Sample equal amounts from each class (limited by smallest class)
min_class_size = sentiment_counts.min()
print(f"Smallest class has {min_class_size} samples")

# Create balanced sample.
# Each sentiment group is sampled explicitly and the pieces are concatenated.
# This yields the same rows as groupby(...).apply(lambda x: x.sample(...)) but
# does not depend on apply() operating on the grouping column. pandas
# deprecates that behaviour, and once it is removed the 'sentiment' column
# would be missing from the result.
balanced_df = pd.concat(
    [
        group.sample(n=min(min_class_size, len(group)), random_state=42)
        for _, group in df_clean.groupby('sentiment')
    ]
).reset_index(drop=True)

print("\nBALANCED SAMPLE DISTRIBUTION:")
balanced_counts = balanced_df['sentiment'].value_counts()
balanced_total = len(balanced_df)
for sentiment, count in balanced_counts.items():
    print(f"  {sentiment}: {count:,} ({count/balanced_total*100:.1f}%)")

print(f"\nBalanced sample size: {len(balanced_df):,} (vs original {len(df_clean):,})")

# Save the data (full cleaned set and the balanced subset)
df_clean.to_csv('data/processed/cleaned_reviews.csv', index=False)
balanced_df.to_csv('data/processed/balanced_reviews.csv', index=False)
print("\nData saved successfully!")

ANALYZING CLASS IMBALANCE
CURRENT SENTIMENT DISTRIBUTION:
  positive: 32,315 (93.3%)
  neutral: 1,499 (4.3%)
  negative: 812 (2.3%)

SENTIMENT BY META-CATEGORY:
sentiment                   negative  neutral  positive
meta_category                                          
Accessories                        0        0         8
E-Readers                        363      614     17957
Other Electronics                  4        0        20
Smart Home & Entertainment         1        0         6
Tablets                          444      885     14324

DETAILED RATING DISTRIBUTION:
  1.0 stars -> negative: 410 (1.2%)
  2.0 stars -> negative: 402 (1.2%)
  3.0 stars -> neutral: 1,499 (4.3%)
  4.0 stars -> positive: 8,541 (24.7%)
  5.0 stars -> positive: 23,774 (68.7%)

OPTIONS FOR HANDLING IMBALANCE:
1. Use stratified sampling to balance classes
2. Use class weights in the model
3. Focus on binary classification (positive vs negative+neutral)
4. Use different evaluation metrics (precision, re

  balanced_df = df_clean.groupby('sentiment').apply(



Data saved successfully!


# Sentiment analysis model using LogisticRegression
(sample model below)

In [None]:
# Baseline sentiment classifier: TF-IDF features fed into logistic regression
print("BUILDING SENTIMENT ANALYSIS MODEL")
print("=" * 50)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# The first experiment uses the class-balanced sample
print("TESTING WITH BALANCED DATASET")
print("-" * 30)

# Texts and labels as plain Python lists
X_balanced = balanced_df['reviews.text_clean'].tolist()
y_balanced = balanced_df['sentiment'].tolist()

# 80/20 split, stratified so each class keeps its share in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=y_balanced,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# TF-IDF over unigrams and bigrams; terms occurring in fewer than 2 documents
# or in more than 95% of documents are ignored
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)

# The vocabulary is learned on the training texts only, then reused for the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

# Fit the classifier and predict the held-out texts
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1
print("\nMODEL EVALUATION (BALANCED DATASET):")
print(classification_report(y_test, y_pred))

# Confusion matrix with a fixed label order (rows: actual, columns: predicted)
print("\nCONFUSION MATRIX:")
cm = confusion_matrix(y_test, y_pred, labels=['negative', 'neutral', 'positive'])
print("           Predicted")
print("         neg  neu  pos")
for row_prefix, row in zip(("Actual neg", "       neu", "       pos"), cm):
    print(f"{row_prefix} {row[0]:3d} {row[1]:3d} {row[2]:3d}")

# Sanity check on three hand-written reviews
print("\nSAMPLE PREDICTIONS:")
sample_texts = [
    "This product is amazing! I love it so much.",
    "It's okay, nothing special but does the job.",
    "Terrible quality, waste of money. Don't buy this."
]

sample_tfidf = vectorizer.transform(sample_texts)
sample_predictions = model.predict(sample_tfidf)
sample_probabilities = model.predict_proba(sample_tfidf)

for number, (text, pred, prob) in enumerate(
    zip(sample_texts, sample_predictions, sample_probabilities), start=1
):
    # Three report lines followed by one blank line per sample
    print(
        f"{number}. Text: '{text}'",
        f"   Prediction: {pred}",
        f"   Probabilities: neg={prob[0]:.3f}, neu={prob[1]:.3f}, pos={prob[2]:.3f}",
        "",
        sep="\n",
    )

print("BASELINE MODEL COMPLETE!")
print("Next step: Test with class weights on full dataset")

BUILDING SENTIMENT ANALYSIS MODEL
TESTING WITH BALANCED DATASET
------------------------------
Training set: 1948 samples
Test set: 488 samples
TF-IDF matrix shape: (1948, 4809)

MODEL EVALUATION (BALANCED DATASET):
              precision    recall  f1-score   support

    negative       0.63      0.65      0.64       162
     neutral       0.54      0.55      0.54       163
    positive       0.73      0.71      0.72       163

    accuracy                           0.63       488
   macro avg       0.63      0.63      0.63       488
weighted avg       0.63      0.63      0.63       488


CONFUSION MATRIX:
           Predicted
         neg  neu  pos
Actual neg 105  42  15
       neu  46  89  28
       pos  15  33 115

SAMPLE PREDICTIONS:
1. Text: 'This product is amazing! I love it so much.'
   Prediction: positive
   Probabilities: neg=0.072, neu=0.089, pos=0.839

2. Text: 'It's okay, nothing special but does the job.'
   Prediction: neutral
   Probabilities: neg=0.105, neu=0.795, p

## The baseline model works on the balanced dataset (63% accuracy, versus roughly 33% for random guessing across three classes; good sample predictions)

# Test on full imbalanced dataset (with class weights)

In [16]:
# Same pipeline on the full, imbalanced dataset, with class weights
print("TESTING WITH FULL DATASET + CLASS WEIGHTS")
print("=" * 50)

from sklearn.utils.class_weight import compute_class_weight

# Texts and labels of the full cleaned dataset
X_full = df_clean['reviews.text_clean'].tolist()
y_full = df_clean['sentiment'].tolist()

# 80/20 stratified split
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
    stratify=y_full,
)

print(f"Full training set: {len(X_train_full):,} samples")
print(f"Full test set: {len(X_test_full):,} samples")

# 'balanced' weights computed from the training labels, keyed by class label
train_classes = np.unique(y_train_full)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_full)
class_weight_dict = dict(zip(train_classes, class_weights))
print(f"Class weights: {class_weight_dict}")

# Separate vectorizer for the full dataset: larger vocabulary cap and a
# higher minimum document frequency than the balanced-sample run
vectorizer_full = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
)

# Vocabulary is learned on the training texts only
X_train_full_tfidf = vectorizer_full.fit_transform(X_train_full)
X_test_full_tfidf = vectorizer_full.transform(X_test_full)

print(f"Full TF-IDF matrix shape: {X_train_full_tfidf.shape}")

# Weighted logistic regression, then predictions for the held-out texts
model_weighted = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight=class_weight_dict,
)
model_weighted.fit(X_train_full_tfidf, y_train_full)
y_pred_full = model_weighted.predict(X_test_full_tfidf)

# Per-class precision / recall / F1
print("\nMODEL EVALUATION (FULL DATASET WITH CLASS WEIGHTS):")
print(classification_report(y_test_full, y_pred_full))

# Share of each class among the true labels versus among the predictions
print("\nPREDICTION DISTRIBUTION:")
from collections import Counter
actual_dist = Counter(y_test_full)
pred_dist = Counter(y_pred_full)
n_actual = len(y_test_full)
n_predicted = len(y_pred_full)

print("Actual vs Predicted:")
for sentiment in ('negative', 'neutral', 'positive'):
    actual_pct = actual_dist[sentiment] / n_actual * 100
    pred_pct = pred_dist[sentiment] / n_predicted * 100
    print(f"  {sentiment}: {actual_pct:.1f}% actual, {pred_pct:.1f}% predicted")

# Re-score the hand-written sample reviews with the weighted model
print("\nSAMPLE PREDICTIONS (WEIGHTED MODEL):")
sample_tfidf_full = vectorizer_full.transform(sample_texts)
sample_pred_full = model_weighted.predict(sample_tfidf_full)
sample_prob_full = model_weighted.predict_proba(sample_tfidf_full)

for number, (text, pred, prob) in enumerate(
    zip(sample_texts, sample_pred_full, sample_prob_full), start=1
):
    # Three report lines followed by one blank line per sample
    print(
        f"{number}. Text: '{text}'",
        f"   Prediction: {pred}",
        f"   Probabilities: neg={prob[0]:.3f}, neu={prob[1]:.3f}, pos={prob[2]:.3f}",
        "",
        sep="\n",
    )

print("SENTIMENT ANALYSIS MODEL COMPLETE!")
print("Ready for next task: Category clustering")

TESTING WITH FULL DATASET + CLASS WEIGHTS
Full training set: 27,700 samples
Full test set: 6,926 samples
Class weights: {np.str_('negative'): np.float64(14.205128205128204), np.str_('neutral'): np.float64(7.700861829302196), np.str_('positive'): np.float64(0.3571750931620956)}
Full TF-IDF matrix shape: (27700, 10000)

MODEL EVALUATION (FULL DATASET WITH CLASS WEIGHTS):
              precision    recall  f1-score   support

    negative       0.28      0.56      0.38       162
     neutral       0.16      0.40      0.23       300
    positive       0.97      0.88      0.92      6464

    accuracy                           0.85      6926
   macro avg       0.47      0.61      0.51      6926
weighted avg       0.92      0.85      0.88      6926


PREDICTION DISTRIBUTION:
Actual vs Predicted:
  negative: 2.3% actual, 4.6% predicted
  neutral: 4.3% actual, 10.9% predicted
  positive: 93.3% actual, 84.5% predicted

SAMPLE PREDICTIONS (WEIGHTED MODEL):
1. Text: 'This product is amazing! I lov

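## The sentiment analysis model is working reasonably well. The class weights helped balance the predictions - it now predicts more negatives and neutrals instead of labelling everything as positive, although precision on those minority classes remains low (0.28 for negative, 0.16 for neutral). The sample predictions look very accurate.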