# Bookstore Data EDA and GroupBy Analysis

In [None]:
# ---- Configuration ----
# Connection settings for the MySQL server used by the cells below.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be written into (and saved with) the notebook.
# When a variable is not set, the original placeholder value is used.
import os

HOST = os.environ.get("BOOKSTORE_DB_HOST", "127.0.0.1")
USER = os.environ.get("BOOKSTORE_DB_USER", "root")
PASSWORD = os.environ.get("BOOKSTORE_DB_PASSWORD", "yourpassword")
DATABASE = os.environ.get("BOOKSTORE_DB_NAME", "your_database")


In [None]:
# ---- Imports & Connection ----
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt

%matplotlib inline

# Gather the connection settings in one mapping, then open the connection
# that the following cells query through and the last cell closes.
connection_settings = {
    "host": HOST,
    "user": USER,
    "password": PASSWORD,
    "database": DATABASE,
}
conn = mysql.connector.connect(**connection_settings)
print("✅ Connected to MySQL!")

In [None]:
# ---- Load tables ----
# Read every table in full into its own DataFrame, keyed by table name.
table_names = ("stores", "customers", "books", "transactions")
tables = {
    table_name: pd.read_sql(f"SELECT * FROM {table_name};", conn)
    for table_name in table_names
}

# Show the first rows of each table as a quick sanity check.
for table_name, frame in tables.items():
    print(f"--- {table_name.upper()} ---")
    display(frame.head())

In [None]:
# ---- Exploratory Analysis ----
# For every loaded table, print its structure, summary statistics for all
# columns, and the number of missing values per column.
for name, df in tables.items():
    print(f"--- {name.upper()} ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emits a stray "None".
    df.info()
    print(df.describe(include='all'))
    print("\nMissing values:")
    print(df.isnull().sum())
    print("\n")

In [None]:
# ---- Merge for analysis ----
# Attach customer, store and book columns to every transaction so the later
# cells can group by any of them. merge() defaults to an inner join, so a
# transaction without a matching customer, store or book row is dropped.
# The chain is wrapped in parentheses instead of using backslash
# continuations, which are a syntax error when anything follows the backslash.
merged = (
    tables['transactions']
    .merge(tables['customers'], on="customer_id")
    .merge(tables['stores'], on="store_id")
    .merge(tables['books'], on="book_id")
)
merged.head()

In [None]:
# ---- Average price per country ----
# Mean of the price column within each country, highest average first.
avg_price_country = (
    merged.groupby("country")["price"]
    .mean()
    .sort_values(ascending=False)
)
display(avg_price_country)

avg_price_country.plot.bar(title="Average Price per Country", figsize=(8, 5))
plt.show()

In [None]:
# ---- Total sales per store ----
# Sum of the price column for each store location, largest total first.
sales_store = (
    merged.groupby("store_location")["price"]
    .sum()
    .sort_values(ascending=False)
)
display(sales_store)

sales_store.plot.bar(title="Total Sales per Store", figsize=(10, 5))
plt.show()

In [None]:
# ---- Purchases per book category ----
# Count how many merged transaction rows fall into each book category.
category_column = merged["category"]
cat_count = category_column.value_counts()
display(cat_count)

cat_count.plot.bar(title="Purchases per Book Category", figsize=(10, 5))
plt.show()

In [None]:
# ---- Average customer age per category ----
# Mean of the age column within each book category, oldest average first.
avg_age_cat = (
    merged.groupby("category")["age"]
    .mean()
    .sort_values(ascending=False)
)
display(avg_age_cat)

avg_age_cat.plot.bar(title="Average Customer Age per Category", figsize=(10, 5))
plt.show()

## Join-based GroupBy Examples

In [None]:
# ---- Total sales per author ----
# Sum of the price column for each author, largest total first.
sales_author = (
    merged.groupby("author")["price"]
    .sum()
    .sort_values(ascending=False)
)

# Only the ten highest-selling authors are shown and plotted.
top_authors_by_sales = sales_author.head(10)
display(top_authors_by_sales)

top_authors_by_sales.plot.bar(title="Top 10 Authors by Sales", figsize=(10, 5))
plt.show()

In [None]:
# ---- Total sales per book category per country ----
# Sum prices for every (country, category) pair, then pivot the categories
# into columns; combinations with no rows become 0 instead of NaN.
sales_by_pair = merged.groupby(["country", "category"])["price"].sum()
sales_cat_country = sales_by_pair.unstack().fillna(0)
display(sales_cat_country)

sales_cat_country.plot.bar(
    stacked=True,
    figsize=(12, 6),
    title="Sales by Category per Country",
)
plt.show()

In [None]:
# ---- Average book price per author ----
# Mean of the price column for each author, highest average first.
avg_price_author = (
    merged.groupby("author")["price"]
    .mean()
    .sort_values(ascending=False)
)

# Only the ten authors with the highest average price are shown and plotted.
top_authors_by_avg_price = avg_price_author.head(10)
display(top_authors_by_avg_price)

top_authors_by_avg_price.plot.bar(title="Top 10 Authors by Avg Price", figsize=(10, 5))
plt.show()

## Time-based Analysis

In [None]:
# ---- Sales trend by year ----
# Parse the purchase dates, store the calendar year as a new 'year' column
# on merged, and total the price column per year.
purchase_dates = pd.to_datetime(merged['purchase_date'])
merged['year'] = purchase_dates.dt.year
sales_year = merged.groupby('year')['price'].sum()
display(sales_year)

sales_year.plot.line(marker='o', title="Total Sales by Year", figsize=(8, 5))
plt.show()

In [None]:
# ---- Sales trend by month (all years combined) ----
# Parse the purchase dates, store the month number as a new 'month' column
# on merged, and total the price column per month across every year.
purchase_dates = pd.to_datetime(merged['purchase_date'])
merged['month'] = purchase_dates.dt.month
sales_month = merged.groupby('month')['price'].sum()
display(sales_month)

sales_month.plot.bar(title="Total Sales by Month (all years combined)", figsize=(8, 5))
plt.show()

In [None]:
# ---- Cleanup ----
# Best-effort shutdown: any failure while closing (or while reporting the
# close) is printed rather than raised, so this cell never aborts the run.
try:
    conn.close()
    print("🔌 MySQL connection closed.")
except Exception as close_error:
    print("Connection close error:", close_error)