# EDA for Amazon Stars vs Sentiment

---

This notebook explores the relationship between Amazon review star ratings and sentiment analysis scores. It loads a large sample of reviews, computes sentiment using DistilBERT, and visualizes the alignment (or divergence) between written sentiment and star ratings.

In [None]:
# ------------------------------------------------------------
# 0. Ensure the review slice exists, then load it
# ------------------------------------------------------------
import subprocess, pathlib, pandas as pd, sys, textwrap

# --- Configuration ----------------------------------------------------------
CATEGORY = "raw_review_Clothing_Shoes_and_Jewelry"   # ← keep the raw_review_ prefix
N_ROWS   = 1_000_000
# The slice is looked up relative to the current working directory.
PARQUET  = pathlib.Path("data") / f"{CATEGORY}_{N_ROWS}.parquet"

# --- Build the slice on demand ----------------------------------------------
if not PARQUET.exists():
    print("🔄 Parquet not found – running data/get_data.py …")
    import os

    # Use this file's directory when __file__ is defined; otherwise fall back
    # to the working directory.
    if "__file__" in globals():
        base_dir = os.path.dirname(__file__)
    else:
        base_dir = os.getcwd()

    # NOTE(review): the script is resolved one level ABOVE base_dir, while
    # PARQUET is resolved inside the working directory – confirm both agree
    # with the repository layout.
    script_path = os.path.abspath(
        os.path.join(base_dir, "..", "data", "get_data.py")
    )
    command = [
        sys.executable,          # same interpreter as the running kernel
        script_path,
        "--category", CATEGORY,
        "--rows", str(N_ROWS),
    ]

    try:
        # Output is captured, so the script's stderr is only shown on failure.
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        print("❌ get_data.py failed")
        print(textwrap.indent(err.stderr or "", "    "))
        raise

    # A zero exit code alone does not prove the expected file was written.
    if not PARQUET.exists():
        raise FileNotFoundError("Download script finished, but Parquet still missing.")

# --- Load --------------------------------------------------------------------
print(f"✅ Loading {PARQUET}")
df = pd.read_parquet(PARQUET)
df.head()


🔄 Parquet not found – running data/get_data.py …


## A. Load Parquet

In [None]:
# Nothing to do here: the Parquet file was already loaded in the first cell and is available as the DataFrame `df`.

## B. Plot star distribution → results/star_counts.png

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count of reviews per star rating, saved to results/star_counts.png.
# NOTE(review): assumes the loaded frame has a 'star_rating' column – confirm
# against the schema written by data/get_data.py.

# savefig() does not create missing directories, so make sure results/ exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
pathlib.Path('results').mkdir(parents=True, exist_ok=True)

# Explicit figure/axes instead of the implicit pyplot state, so this cell
# cannot accidentally draw onto a figure left over from another cell.
fig, ax = plt.subplots()
sns.countplot(x='star_rating', data=df, ax=ax)
ax.set_title('Star Rating Distribution')
ax.set_xlabel('Star rating')
ax.set_ylabel('Number of reviews')
fig.savefig('results/star_counts.png')
plt.show()

## C. DistilBERT SST-2 sentiment → sent_score column

In [None]:
from transformers import pipeline
# Score every review with DistilBERT (SST-2) and store a SIGNED polarity in
# df['sent_score']: +confidence for POSITIVE, -confidence for NEGATIVE.
#
# Why signed: the pipeline's 'score' is the confidence of whichever label was
# predicted, so a confident NEGATIVE and a confident POSITIVE both come back
# close to 1.0. Using the raw score would make negative and positive reviews
# indistinguishable in every downstream plot.

# Name the checkpoint explicitly so results do not silently change if the
# library's default sentiment model changes.
SENT_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
SENT_BATCH_SIZE = 64

sent_clf = pipeline('sentiment-analysis', model=SENT_MODEL)


def _signed_polarity(result):
    """Convert one pipeline result ({'label', 'score'}) to a value in [-1, 1]."""
    confidence = result['score']
    return confidence if result['label'].upper() == 'POSITIVE' else -confidence


# Reviews with a missing body cannot be scored; they keep NaN instead of
# crashing the whole run.
has_text = df['review_body'].notna()
review_texts = df.loc[has_text, 'review_body'].astype(str).tolist()

# One batched call instead of one model call per row (this still takes a while
# on the full slice). truncation=True clips reviews that exceed the model's
# maximum input length, which would otherwise raise an error.
raw_results = (
    sent_clf(review_texts, batch_size=SENT_BATCH_SIZE, truncation=True)
    if review_texts else []
)

df['sent_score'] = float('nan')
df.loc[has_text, 'sent_score'] = [_signed_polarity(r) for r in raw_results]

## D. Compute divergence metric; histogram → results/divergence_hist.png

In [None]:
# Divergence = sentiment score minus the star rating rescaled around 3 stars.
# (star_rating - 3) / 2 maps 1 → -1, 3 → 0 and 5 → +1.
# NOTE(review): the subtraction is only meaningful if sent_score is a signed
# polarity on that same -1..1 scale – verify how sent_score is computed.
df['divergence'] = df['sent_score'] - (df['star_rating'] - 3) / 2

# savefig() does not create missing directories, so make sure results/ exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
pathlib.Path('results').mkdir(parents=True, exist_ok=True)

# Draw on an explicit axes rather than relying on the implicit current figure.
fig, ax = plt.subplots()
df['divergence'].hist(ax=ax)
ax.set_title('Divergence Histogram')
ax.set_xlabel('Divergence (sentiment score − rescaled star rating)')
ax.set_ylabel('Number of reviews')
fig.savefig('results/divergence_hist.png')
plt.show()

## E. Boxplot polarity vs star rating → results/polarity_vs_rating.png

In [None]:
# Distribution of the sentiment score within each star rating, saved to
# results/polarity_vs_rating.png.

# savefig() does not create missing directories, so make sure results/ exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
pathlib.Path('results').mkdir(parents=True, exist_ok=True)

# Explicit figure/axes so the boxplot cannot land on a leftover figure.
fig, ax = plt.subplots()
sns.boxplot(x='star_rating', y='sent_score', data=df, ax=ax)
ax.set_title('Polarity vs Star Rating')
ax.set_xlabel('Star rating')
ax.set_ylabel('Sentiment score')
fig.savefig('results/polarity_vs_rating.png')
plt.show()

## (Optional) Helpful votes vs Divergence

In [None]:
# sns.scatterplot(x='helpful_votes', y='divergence', data=df)
# plt.savefig('results/helpful_vs_div.png')

## F. Save cleaned subset → data/clean_1M.parquet

In [None]:
# Write df, as it stands at this point in the notebook, to a Parquet file.
clean_path = 'data/clean_1M.parquet'
df.to_parquet(clean_path)
print('✅ Saved cleaned subset to data/clean_1M.parquet')