# EDA & Data Cleaning — Seoul Bike Sharing Demand

Goal:
1) Explore the raw dataset (overview, target distribution, missingness/outliers, feature–target relationships)
2) Clean the dataset using `clean_seoul_bike_data`
3) Explore and analyse the explanatory variables
4) Save a reproducible cleaned parquet to `data/processed/`

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Put `<repo>/src` at the front of sys.path so the project package imports.
# When the kernel was started inside `notebooks/`, the repository root is one
# level up; otherwise the current working directory is taken as the root.
repo_root = Path.cwd().resolve()
repo_root = repo_root.parent if repo_root.name == "notebooks" else repo_root
sys.path.insert(0, str(repo_root / "src"))

# Echo the resolved root as the cell output for a quick sanity check
repo_root

In [None]:
# Load the raw dataset through the project's loader. It is called without
# arguments, so the data source is decided inside `load_data` (not visible here).
from bike_demand.data.load_data import load_data

df_raw = load_data()
# Preview the first rows as the cell output
df_raw.head()

## Data overview
We inspect dataset size, column names, dtypes, and summary statistics.

In [None]:
# Dataset dimensions alongside the first ten column names
(df_raw.shape, list(df_raw.columns[:10]))

In [None]:
# info / dtypes: per-column dtype and non-null counts, used to spot missing
# values and columns that need type conversion
df_raw.info()

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column
df_raw.describe().transpose()

In [None]:
# Summary of the object/category-typed columns, transposed to one row per column
(
    df_raw
    .select_dtypes(include=["object", "category"])
    .describe()
    .transpose()
)

## Data cleaning plan 

Based on the raw data inspection above:

- The dataset contains 8,760 hourly observations (365 × 24) and no missing values.
- `Date` is stored as an object and should be parsed into a proper datetime.
- Column names contain spaces and units; we standardise them to snake_case for robust downstream use.
- Categorical variables (`Seasons`, `Holiday`, `Functioning Day`) are stored as strings; we convert them to `category`.
- We retain extreme weather values as they are plausible and informative for demand prediction.

In [None]:
# Apply the project's cleaning routine to the raw frame; the result,
# `df_clean`, is the frame every later cell works on.
from bike_demand.preprocessing import clean_seoul_bike_data

df_clean = clean_seoul_bike_data(df_raw)

# Sanity-check the cleaned schema: dimensions, dtypes / non-null counts, preview
print(df_clean.shape)
df_clean.info()
df_clean.head()

## Exploratory analysis on cleaned data

After standardising the dataset schema, we conduct exploratory analysis on the cleaned data to:
- understand the target distribution,
- identify potential outliers (without necessarily removing them),
- examine relationships between key features and rental demand,
- uncover temporal and seasonal patterns that may inform feature engineering and model choice.


In [None]:
# Target distribution
# Drawn by a project plotting helper; `rented_bike_count` is the target column.
from bike_demand.plotting import plot_target_distribution

plot_target_distribution(df_clean, target_col="rented_bike_count")
plt.show()

In [None]:
# Mean rentals for each hour of the day, drawn as a single line
import matplotlib.pyplot as plt

# observed=True restricts the groups to hour values that occur in the data
hourly = df_clean.groupby("hour", observed=True)["rented_bike_count"].mean()

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(hourly.index, hourly.to_numpy())
ax.set_title("Average bike rentals by hour of day")
ax.set_xlabel("Hour of day")
ax.set_ylabel("Average rented bike count")
# Tick every second hour to keep the axis readable
ax.set_xticks(range(0, 24, 2))
plt.show()

In [None]:
# Hourly rental trend per season, drawn by a project plotting helper
# (NOTE(review): the exact aggregation is defined inside the helper — confirm).
from bike_demand.plotting import plot_hourly_trend_by_season

plot_hourly_trend_by_season(df_clean)
plt.show()

## Distribution of key variables

We examine the marginal distributions of selected variables to assess
skewness, zero inflation, and scale differences that may affect model
choice and feature engineering.


In [None]:
# Marginal distributions of the six weather variables, drawn together by a
# project plotting helper.
from bike_demand.plotting import plot_combined_explanatory_distributions

# Rainfall and snowfall are not in this list; they are plotted in the next cell
vars_to_plot = [
    "temperature",
    "humidity",
    "wind_speed",
    "visibility",
    "dew_point_temp",
    "solar_radiation",
]

# bins / kde are forwarded to the helper (presumably histogram bins and a KDE
# overlay — confirm against the helper's signature)
plot_combined_explanatory_distributions(df_clean, vars_to_plot, bins=30, kde=True)
plt.show()

In [None]:
# Precipitation variables, plotted separately from the other weather variables.
# Uses `plot_combined_explanatory_distributions` imported in the previous cell.
vars_to_plot2 = [
    "rainfall",
    "snowfall",
]

plot_combined_explanatory_distributions(df_clean, vars_to_plot2, bins=30, kde=True)
plt.show()

In [None]:
# Share of hourly observations with any recorded rainfall.
# The 0/1 indicator is kept as a standalone Series rather than written into
# `df_clean`, so this exploratory helper does not become an extra column in the
# later cells that consume `df_clean` (correlation heatmap, parquet export).
# The Series is named "rain_binary" so the resulting table keeps that index name.
rain_binary = (df_clean["rainfall"] > 0).astype(int).rename("rain_binary")

# Counts per indicator value, with readable row labels
rain_occurrence_table = (
    rain_binary.value_counts()
    .rename(index={0: "No Rainfall", 1: "Rainfall > 0"})
    .to_frame(name="Count")
)

# Relative frequency of each row
rain_occurrence_table["Proportion"] = (
    rain_occurrence_table["Count"] / rain_occurrence_table["Count"].sum()
)

rain_occurrence_table

In [None]:
# Histogram (with KDE overlay) restricted to observations with rainfall > 0
rain_pos = df_clean["rainfall"].loc[lambda s: s > 0]

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(rain_pos, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of positive rainfall values")
ax.set_xlabel("Rainfall (mm)")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Share of hourly observations with any recorded snowfall.
# The 0/1 indicator is kept as a standalone Series rather than written into
# `df_clean`, so this exploratory helper does not become an extra column in the
# later cells that consume `df_clean` (correlation heatmap, parquet export).
# The Series is named "snow_binary" so the resulting table keeps that index name.
snow_binary = (df_clean["snowfall"] > 0).astype(int).rename("snow_binary")

# Counts per indicator value, with readable row labels
snow_occurrence_table = (
    snow_binary.value_counts()
    .rename(index={0: "No Snowfall", 1: "Snowfall > 0"})
    .to_frame(name="Count")
)

# Relative frequency of each row
snow_occurrence_table["Proportion"] = (
    snow_occurrence_table["Count"] / snow_occurrence_table["Count"].sum()
)

snow_occurrence_table

In [None]:
# Histogram (with KDE overlay) restricted to observations with snowfall > 0
snow_pos = df_clean["snowfall"].loc[lambda s: s > 0]

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(snow_pos, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of positive snowfall values")
ax.set_xlabel("Snowfall (cm)")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
from bike_demand.plotting import plot_categorical_frequency

# (column, explicit category order, figure title) for each calendar feature;
# one figure is drawn and shown per entry, in this order.
calendar_panels = [
    ("hour", list(range(24)), "Hour Frequency"),
    ("day_of_week", list(range(7)), "Day of Week Frequency"),
    ("month", list(range(1, 13)), "Month Frequency"),
]

for column, category_order, panel_title in calendar_panels:
    plot_categorical_frequency(
        df_clean, col=column, order=category_order, title=panel_title
    )
    plt.show()

In [None]:
# Frequency plot plus raw value counts for each nominal column:
# 1. seasons, 2. holiday, 3. functioning_day
nominal_panels = [
    ("seasons", "Distribution of seasons"),
    ("holiday", "Holiday frequency"),
    ("functioning_day", "Functioning day frequency"),
]

for column, panel_title in nominal_panels:
    plot_categorical_frequency(df_clean, col=column, title=panel_title, palette="Set2")

    # The counts are printed before the figure is rendered
    print(df_clean[column].value_counts())
    plt.show()

In [None]:
# Correlation of each candidate feature with the target, strongest positive first
num_cols = [
    # weather
    "temperature", "humidity", "wind_speed", "visibility",
    "dew_point_temp", "solar_radiation", "rainfall", "snowfall",
    # calendar
    "month", "day_of_week", "hour",
]

# NOTE(review): numeric_only=True leaves out any non-numeric (e.g. category-typed)
# column, so such columns would be missing from the result — confirm the
# calendar columns are numeric at this point.
corr_matrix = df_clean[[*num_cols, "rented_bike_count"]].corr(numeric_only=True)
corr = corr_matrix["rented_bike_count"].sort_values(ascending=False)
corr

In [None]:
# Target against each continuous weather variable, drawn by a project helper
from bike_demand.plotting import plot_target_vs_continuous

# The same eight weather columns that lead `num_cols` in the correlation cell
continuous_feature = [
    "temperature",
    "humidity",
    "wind_speed",
    "visibility",
    "dew_point_temp",
    "solar_radiation",
    "rainfall",
    "snowfall",
]


# No explicit plt.show() in this cell.
# NOTE(review): presumably relies on the notebook backend to render — confirm.
plot_target_vs_continuous(df_clean, continuous_feature)

In [None]:
# Target summarised per category for the calendar/seasonal columns, drawn by a
# project helper (the name suggests a per-category mean — confirm in the helper).
from bike_demand.plotting import plot_target_vs_categorical_mean

temporal_categorical_features = ["month", "hour", "seasons", "day_of_week"]

# Four features with ncols=2 (presumably a 2-column panel grid — confirm)
plot_target_vs_categorical_mean(df_clean, temporal_categorical_features, ncols=2)

In [None]:
# Same helper as the previous cell, applied to the indicator column.
# Only `functioning_day` is listed; `holiday` is not included here.
categorical_indicators = ["functioning_day"]

# ncols=2 is passed even though a single feature is plotted
plot_target_vs_categorical_mean(df_clean, categorical_indicators, ncols=2)

In [None]:
# Feature-to-feature correlation heatmap, drawn by a project helper
from bike_demand.plotting import plot_correlation_heatmap

# Exclude target and obvious identifiers
# NOTE: this rebinds `corr`, replacing the feature-vs-target Series computed in
# the earlier correlation cell (the helper presumably returns the plotted
# correlation matrix — confirm).
corr = plot_correlation_heatmap(
    df_clean,
    cmap="coolwarm",
    exclude_cols=["rented_bike_count", "date"],
)

In [None]:
# Persist the cleaned frame with the project's writer. No path is passed, so
# the destination is decided inside `save_processed_data` (presumably
# `data/processed/`, as stated in the notebook goal — confirm).
from bike_demand.preprocessing import save_processed_data

# Whatever columns `df_clean` holds when this cell runs are what gets written
save_processed_data(df_clean)