# Load the data file

In [1]:
import pandas as pd

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference can emit a mixed-type
# DtypeWarning; the saved output of this cell appears to contain the tail of
# such a warning (an echoed copy of the read_csv line) - confirm on re-run.
df = pd.read_csv('data/1429_1.csv', low_memory=False)

# Basic info about the dataset: dimensions, column names and a short preview.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

Dataset shape: (34660, 21)
Columns: ['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs', 'reviews.text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username']

First few rows:
                     id                                               name  \
0  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
1  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
2  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
3  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
4  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   

        asins   brand                                         categories  \
0  B01AHB9CN2  Amazon  Electronics,iPad & Tablets,All T

  df = pd.read_csv('data/1429_1.csv')


# Examine the key columns for our tasks

In [6]:
# Overview of the columns used by the downstream tasks (sentiment analysis,
# category clustering, product analysis) plus a missing-value check.
print("🔍 KEY COLUMNS ANALYSIS")
print("="*50)

# 1. SENTIMENT ANALYSIS - Check reviews.text and reviews.rating
# head() simply returns fewer rows on a short frame (no IndexError), and
# fillna('')/astype(str) keeps the preview from failing on a missing (NaN)
# review text, which cannot be sliced.
print("REVIEW TEXT (first 3 examples):")
preview_ratings = df['reviews.rating'].head(3)
preview_texts = df['reviews.text'].head(3).fillna('').astype(str)
for i, (rating, text) in enumerate(zip(preview_ratings, preview_texts), 1):
    print(f"{i}. Rating: {rating} - Text: {text[:100]}...")

print("\nRATING DISTRIBUTION:")
print(df['reviews.rating'].value_counts().sort_index())

print("\nRating Statistics:")
print(df['reviews.rating'].describe())

# 2. CATEGORY CLUSTERING - Check categories column
print("\nCATEGORIES (first 5 examples):")
for i, category_string in enumerate(df['categories'].head(5), 1):
    print(f"{i}. {category_string}")

print(f"\nUNIQUE CATEGORIES COUNT: {df['categories'].nunique()}")

# 3. PRODUCT ANALYSIS - Check name and brand
print("\nPRODUCTS:")
print(f"Unique products: {df['name'].nunique()}")
print(f"Unique brands: {df['brand'].nunique()}")
print("Top 5 brands:")
print(df['brand'].value_counts().head())

# 4. DATA QUALITY CHECK
print("\nDATA QUALITY:")
print("Missing values in key columns:")
key_columns = ['reviews.text', 'reviews.rating', 'categories', 'name', 'brand']
total_rows = len(df)
for col in key_columns:
    missing = df[col].isnull().sum()
    # Guard the percentage so an empty frame reports 0.0% instead of dividing by zero.
    missing_pct = missing / total_rows * 100 if total_rows else 0.0
    print(f"  {col}: {missing} missing ({missing_pct:.1f}%)")

🔍 KEY COLUMNS ANALYSIS
REVIEW TEXT (first 3 examples):
1. Rating: 5.0 - Text: This product so far has not disappointed. My children love to use it and I like the ability to monit...
2. Rating: 5.0 - Text: great for beginner or experienced person. Bought as a gift and she loves it...
3. Rating: 5.0 - Text: Inexpensive tablet for him to use and learn on, step up from the NABI. He was thrilled with it, lear...

RATING DISTRIBUTION:
reviews.rating
1.0      410
2.0      402
3.0     1499
4.0     8541
5.0    23775
Name: count, dtype: int64

Rating Statistics:
count    34627.000000
mean         4.584573
std          0.735653
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: reviews.rating, dtype: float64

CATEGORIES (first 5 examples):
1. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
2. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
3. Electronics,iPad & Tablets,

# Key Insights:

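- Sentiment Analysis: Ratings are heavily skewed toward 4-5 stars (about 93% of rated reviews), so the sentiment classes are imbalanced; very few reviews are missing a rating (33 of 34,660 rows)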
- Category Clustering: 41 unique categories to reduce to 4-6 meta-categories
- Products: 48 unique products, dominated by Amazon products
- Data Quality: Very clean dataset, minimal missing values

# Categories Deep Dive

In [5]:
# Categories for clustering task
# Import kept inside this cell so the cell runs on its own, but placed first
# so the dependency is visible before any statement executes.
from collections import Counter

print("CATEGORY ANALYSIS FOR CLUSTERING")
print("="*50)

# Split each comma-separated category string into individual, whitespace-
# trimmed terms. Rows with a missing category string are skipped.
all_categories = [
    term.strip()
    for cat_string in df['categories'].dropna()
    for term in cat_string.split(',')
]
category_counts = Counter(all_categories)

print("Most common category terms:")
for cat, count in category_counts.most_common(15):
    print(f"  {cat}: {count:,}")

# dropna() mirrors the product-name sample below, so a missing category is
# never listed as a literal "nan" entry.
print("\nSAMPLE CATEGORY STRINGS:")
unique_categories = df['categories'].dropna().unique()[:10]
for i, cat in enumerate(unique_categories, 1):
    print(f"{i:2d}. {cat}")

# Analyze product names to understand what we're working with
print("\nSAMPLE PRODUCT NAMES:")
unique_products = df['name'].dropna().unique()[:10]
for i, product in enumerate(unique_products, 1):
    print(f"{i:2d}. {product}")

# Check text length for preprocessing planning.
# NOTE: this adds a 'text_length' column to df itself; missing review texts
# are counted as length 0 because of fillna('').
df['text_length'] = df['reviews.text'].fillna('').str.len()
print("\nTEXT LENGTH ANALYSIS:")
print(f"Average review length: {df['text_length'].mean():.0f} characters")
print(f"Median review length: {df['text_length'].median():.0f} characters")
print(f"Max review length: {df['text_length'].max():,} characters")

print("\nData exploration complete!")
print("Ready for preprocessing and model building")

CATEGORY ANALYSIS FOR CLUSTERING
Most common category terms:
  Electronics: 42,291
  Computers & Tablets: 21,719
  Tablets: 21,383
  All Tablets: 18,413
  iPad & Tablets: 17,784
  Electronics Features: 16,926
  Fire Tablets: 16,303
  Home: 14,597
  Kindle Store: 12,886
  Amazon Devices: 12,691
  Featured Brands: 12,647
  TVs Entertainment: 11,682
  Holiday Shop: 11,682
  Frys: 11,615
  Tech Toys: 11,608

SAMPLE CATEGORY STRINGS:
 1. Electronics,iPad & Tablets,All Tablets,Fire Tablets,Tablets,Computers & Tablets
 2. eBook Readers,Kindle E-readers,Computers & Tablets,E-Readers & Accessories,E-Readers
 3. Electronics,eBook Readers & Accessories,Covers,Kindle Store,Amazon Device Accessories,Kindle E-Reader Accessories,Kindle (5th Generation) Accessories,Kindle (5th Generation) Covers
 4. Kindle Store,Amazon Devices,Electronics
 5. Tablets,Fire Tablets,Electronics,Computers,Computer Components,Hard Drives & Storage,Computers & Tablets,All Tablets
 6. Tablets,Fire Tablets,Computers & Tablets