In [26]:
import pandas as pd
import altair as alt
from pathlib import Path
from superlinked_app.data_processing import process_amazon_dataset

# Raise pandas' display limits so long/wide frames are not truncated in cell output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

### load raw data

In [27]:
# Location of the sample dataset, relative to the notebook's working directory.
DATA_PATH = Path("data") / "sample.json"
# Fail fast with an actionable message rather than an opaque read error in the next cell.
assert DATA_PATH.exists(), (
    f"Dataset not found at '{DATA_PATH}'. "
    "Please run 'make download-and-process-sample-dataset' first to download and process the Amazon dataset."
)

In [None]:
# lines=True: the file is read as one JSON record per line.
df = pd.read_json(path_or_buf=DATA_PATH, lines=True)
# Cell output: the first two records.
df.head(n=2)

### explore the raw data

In [None]:
# Emit the header, the record count and the column-list label in one call;
# the column names themselves are the cell's displayed value.
print(
    "=== Dataset Overview ===",
    f"Number of records: {len(df)}",
    "\nColumns:",
    sep="\n",
)
df.columns.tolist()

In [None]:
# Section header for the dtype listing that follows.
print("\n=== Data Types ===")
# Last expression of the cell: the dtype of every column in df.
df.dtypes

In [None]:
# Number of rows in the raw frame, reported with thousands separators.
total_records = df.shape[0]
print("\n=== Total Records ===")
print(f"Total number of records in dataset: {total_records:,}")

In [None]:
print("\n=== Missing Values by Column ===")
# Count nulls once per column and reuse the result for every figure below.
missing_counts = df.isnull().sum()
missing_values = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': missing_counts,
    'Missing Percentage': (missing_counts / len(df) * 100).round(2).astype(str) + '%'
})
# Sort on the numeric count, not on the formatted percentage: the percentage
# column holds strings such as "9.5%", which sort lexicographically and would
# rank "9.5%" above "10.0%". Count and percentage share the same denominator,
# so ordering by count yields the intended "most missing first" order.
missing_values = missing_values[missing_values['Missing Count'] > 0].sort_values('Missing Count', ascending=False)
print(missing_values.to_string(index=False))

total_missing = missing_counts.sum()
total_cells = len(df) * len(df.columns)
# Guard against an empty frame, where the ratio would otherwise be 0/0.
total_missing_pct = (total_missing / total_cells * 100) if total_cells else 0.0
print(f"\nTotal missing values across all columns: {total_missing:,}")
print(f"Total missing percentage: {total_missing_pct:.2f}%")

In [None]:
# Frequency of each value in the "type" column, most common first.
type_dist = df["type"].value_counts()
print("\n=== Product Type Distribution ===", type_dist, sep="\n")

In [None]:
# Aggregate in pandas and give Altair only the per-type totals. Passing the
# whole raw frame would embed every column of every row in the chart spec and
# can hit Altair's default row limit (MaxRowsError) on larger datasets.
# dropna=False keeps rows with a missing type as their own group.
type_counts = (
    df["type"]
    .value_counts(dropna=False)
    .rename_axis("type")
    .reset_index(name="count")
)
chart = (
    alt.Chart(type_counts)
    .mark_bar()
    .encode(x=alt.X("type:N", title="Type"), y=alt.Y("count:Q", title="Count"))
    .properties(width=500, height=300, title="Distribution of Product Types")
)
chart.show()

In [None]:
# Frequency of each value in the "locale" column, most common first.
locale_dist = df["locale"].value_counts()
print("\n=== Locale Distribution ===", locale_dist, sep="\n")

In [None]:
# Tabulate the locale counts together with each locale's share of the total (in percent).
locale_df = pd.DataFrame(
    {"locale": locale_dist.index, "count": locale_dist.values}
).assign(percentage=lambda frame: frame["count"] / frame["count"].sum() * 100)

# Donut chart: arc size follows the count, colour identifies the locale,
# and the tooltip exposes the locale with its percentage.
donut_theta = alt.Theta(field="count", type="quantitative")
donut_color = alt.Color(field="locale", type="nominal")
chart = (
    alt.Chart(locale_df)
    .mark_arc(innerRadius=50)
    .encode(theta=donut_theta, color=donut_color, tooltip=["locale", "percentage"])
    .properties(width=400, height=400, title="Distribution of Locales")
)
chart.show()

In [None]:

print("\n=== Ratings Statistics ===")
# Extract the first run of digits from the free-text "ratings" field and make it numeric.
# - Raw string: "\d" in a plain string literal is an invalid escape sequence.
# - "(?:,\d+)*" accepts any number of comma-separated groups, so "1,234,567"
#   is captured whole instead of being cut off after the first group ("1,234").
# - regex=False: the comma is removed as a literal, independent of the pandas default.
# Rows without any digits yield NaN.
df["ratings_count"] = (
    df["ratings"]
    .str.extract(r"(\d+(?:,\d+)*)", expand=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)
print(df["ratings_count"].describe())

In [None]:
# Histogram of ratings_count: up to 30 bins on x, bin frequency on a log-scaled y axis.
ratings_x = alt.X("ratings_count:Q", bin=alt.Bin(maxbins=30), title="Number of Ratings")
ratings_y = alt.Y("count():Q", scale=alt.Scale(type="log"), title="Frequency")
chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=ratings_x, y=ratings_y)
    .properties(width=500, height=300, title="Distribution of Number of Ratings")
)
chart.show()

In [None]:
print("\n=== Star Ratings Statistics ===")
# Extract the first run of digits/dots from the free-text "stars" field and make it numeric.
# - Raw string: "\d" in a plain string literal is an invalid escape sequence.
# - expand=False returns a Series (not a one-column DataFrame), which is the
#   right shape for assigning to a single column.
# Rows without a match yield NaN.
df["stars_numeric"] = df["stars"].str.extract(r"([\d.]+)", expand=False).astype(float)
print(df["stars_numeric"].describe())

In [None]:

# Box plot of the numeric star rating, one box per product type.
stars_x = alt.X("type:N", title="Type")
stars_y = alt.Y("stars_numeric:Q", title="Star Rating")
chart = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(x=stars_x, y=stars_y)
    .properties(width=600, height=300, title="Star Ratings Distribution by Product Type")
)
chart.show()

In [None]:
# Columns shown in this preview; all other columns of the raw frame are left out of the view.
columns_to_keep = [
    "asin", "type", "category",
    "title", "description",
    "stars", "ratings", "price",
    "locale",
]
# First rows of the US-locale records, restricted to the columns above.
is_us_locale = df["locale"] == "us"
df.loc[is_us_locale, columns_to_keep].head()

### preprocessing the data

In [None]:
# Run the project's preprocessing routine over the raw frame.
# NOTE(review): at this point df also carries the "ratings_count" and
# "stars_numeric" helper columns added by the exploration cells above —
# confirm process_amazon_dataset tolerates (or ignores) them.
processed_df = process_amazon_dataset(df)
# Cell output: the first rows of the processed frame.
processed_df.head()

In [None]:
# Cell output: number of rows in the processed frame.
len(processed_df)

In [None]:
print("\n=== Missing Values by Column ===")
# Count nulls once per column and reuse the result for every figure below.
missing_counts = processed_df.isnull().sum()
missing_values = pd.DataFrame({
    'Column': processed_df.columns,
    'Missing Count': missing_counts,
    'Missing Percentage': (missing_counts / len(processed_df) * 100).round(2).astype(str) + '%'
})
# Sort on the numeric count, not on the formatted percentage: the percentage
# column holds strings such as "9.5%", which sort lexicographically and would
# rank "9.5%" above "10.0%". Count and percentage share the same denominator,
# so ordering by count yields the intended "most missing first" order.
missing_values = missing_values[missing_values['Missing Count'] > 0].sort_values('Missing Count', ascending=False)
print(missing_values.to_string(index=False))

total_missing = missing_counts.sum()
total_cells = len(processed_df) * len(processed_df.columns)
# Guard against an empty frame, where the ratio would otherwise be 0/0.
total_missing_pct = (total_missing / total_cells * 100) if total_cells else 0.0
print(f"\nTotal missing values across all columns: {total_missing:,}")
print(f"Total missing percentage: {total_missing_pct:.2f}%")

### explore the processed data

In [None]:
# Flatten the per-row category collections into one set of distinct values.
categories = {
    category
    for category_list in processed_df["category"]
    for category in category_list
}
categories

In [None]:
# Cell output: number of distinct categories collected above.
len(categories)

In [None]:
# Observed minimum and maximum of the numeric fields, taken from the describe() summary.
range_columns = ["price", "review_count", "review_rating"]
range_summary = processed_df[range_columns].describe()
range_summary.loc[["min", "max"]]

In [None]:
# Per-column count of NaN entries for the fields inspected above.
nan_check_columns = ["category", "price", "review_count", "review_rating"]
nan_counts = processed_df[nan_check_columns].isna().sum()
print("\nNaN value counts:")
print(nan_counts)

In [None]:
# Cell output: the first ten rows of the processed frame.
processed_df.head(10)