In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

* datetime - hourly date + timestamp 
* season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday
* weather - categorical weather code:
    * 1: Clear, Few clouds, Partly cloudy
    * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals

In [2]:
# Readable label for each integer season code used by the dataset.
SEASON_MAP = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}

# Short label for each integer weather code; higher codes mean worse weather.
WEATHER_MAP = {
    1: "Clear/Partly Cloudy",  # clear, few clouds, partly cloudy
    2: "Mist",  # mist, possibly with clouds
    3: "Snow/Rain",  # light snow or light rain
    4: "Heavy Rain/Snow + Fog",  # heavy rain, ice pellets, thunderstorm, snow, fog
}

In [3]:
# Load the training data and convert the "datetime" column from text to
# datetime64[ns] so it can be used as a real timestamp.
bikes = (
    pd.read_csv("data/train.csv")
    .astype({"datetime": "datetime64[ns]"})
)


In [4]:
# Decode the integer-coded categorical columns of `bikes` into readable labels.
#
# A column is decoded only while it is still numeric. Without that guard the
# cell is not re-runnable: a second run would raise KeyError when looking up an
# already-decoded season/weather label, and the holiday/workingday rule would
# turn every "Yes" into "No" because "Yes" != 1.
_LABEL_DECODERS = {
    "season": lambda code: SEASON_MAP[code],
    "holiday": lambda flag: "Yes" if flag == 1 else "No",
    "workingday": lambda flag: "Yes" if flag == 1 else "No",
    "weather": lambda code: WEATHER_MAP[code],
}

for _column, _decode in _LABEL_DECODERS.items():
    if pd.api.types.is_numeric_dtype(bikes[_column]):
        bikes[_column] = bikes[_column].apply(_decode)

In [5]:
def plot_count_boxplot_per_category(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw one boxplot of ``count`` for each distinct value of ``col``.

    A horizontal dash-dot line marks the median of ``count`` over the whole
    frame, so every category can be compared against it.

    Parameters
    ----------
    bikes : pd.DataFrame
        Frame holding a ``count`` column and the column named by ``col``.
    col : str
        Column whose values become the groups on the x axis.
    save : bool, default True
        Write the figure to ``eda_plots/count_boxplot_per_{col}.png``. The
        ``eda_plots`` directory is created when it does not exist.
    plot : bool, default False
        Keep the figure open so the notebook can display it. When False the
        figure is closed before returning.

    Returns
    -------
    plt.Figure
        The figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))

    overall_median = bikes["count"].quantile(.5)
    median_line = ax.axhline(
        overall_median, color="k", linestyle="-.", lw=4, label="count median"
    )

    sns.boxplot(x=col, y="count", data=bikes, ax=ax)
    # Pass the handle explicitly: a bare list of labels is matched to artists
    # by drawing order, which silently breaks if that order ever changes.
    ax.legend(handles=[median_line], fontsize=20)

    fig.suptitle(f"boxplot por categorias da coluna {col}", fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        # savefig does not create missing folders; make sure the target exists.
        Path("eda_plots").mkdir(parents=True, exist_ok=True)
        fig.savefig(f"eda_plots/count_boxplot_per_{col}.png")

    if not plot:
        # Close this figure specifically, not whichever one is "current".
        plt.close(fig)

    return fig


In [6]:
def plot_category_frequency(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw a bar chart with the number of rows for each value of ``col``.

    Parameters
    ----------
    bikes : pd.DataFrame
        Frame holding the column named by ``col``.
    col : str
        Column whose value frequencies are plotted.
    save : bool, default True
        Write the figure to ``eda_plots/frequency_{col}.png``. The
        ``eda_plots`` directory is created when it does not exist.
    plot : bool, default False
        Keep the figure open so the notebook can display it. When False the
        figure is closed before returning.

    Returns
    -------
    plt.Figure
        The figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    # Draw on the axes created above instead of relying on pyplot's implicit
    # "current axes".
    sns.countplot(x=col, data=bikes, ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel("count")
    ax.set_title(f"Frequency of {col}", fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        # savefig does not create missing folders; make sure the target exists.
        Path("eda_plots").mkdir(parents=True, exist_ok=True)
        fig.savefig(f"eda_plots/frequency_{col}.png")
    if not plot:
        # Close this figure specifically, not whichever one is "current".
        plt.close(fig)
    return fig

In [7]:
def plot_histogram(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw a histogram of ``col`` with a KDE curve and median/mean markers.

    The median is drawn as a red dash-dot vertical line and the mean as a
    black one.

    Parameters
    ----------
    bikes : pd.DataFrame
        Frame holding the column named by ``col``.
    col : str
        Numeric column to plot.
    save : bool, default True
        Write the figure to ``eda_plots/histogram_{col}.png``. The
        ``eda_plots`` directory is created when it does not exist.
    plot : bool, default False
        Keep the figure open so the notebook can display it. When False the
        figure is closed before returning.

    Returns
    -------
    plt.Figure
        The figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    sns.histplot(bikes[col], ax=ax, kde=True)

    # The axes were empty before histplot, so any line present now is the KDE
    # curve. Label it directly instead of relying on legend label order: with
    # positional labels, a missing KDE curve would shift "kde" onto the median.
    kde_lines = list(ax.lines)
    for kde_line in kde_lines:
        kde_line.set_label("kde")

    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax.set_title(f"Histogram of {col}", fontsize=20)

    median = bikes[col].quantile(.5)
    mean = bikes[col].mean()

    median_line = ax.axvline(median, color="r", linestyle="-.", lw=4, label="median")
    mean_line = ax.axvline(mean, color="k", linestyle="-.", lw=4, label="mean")

    ax.legend(handles=[*kde_lines, median_line, mean_line], fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        # savefig does not create missing folders; make sure the target exists.
        Path("eda_plots").mkdir(parents=True, exist_ok=True)
        fig.savefig(f"eda_plots/histogram_{col}.png")
    if not plot:
        # Close this figure specifically, not whichever one is "current".
        plt.close(fig)
    return fig

In [8]:
def create_bins(series: pd.Series, n_bins = 10) -> pd.Series:
    """Assign every value of ``series`` to one of ``n_bins`` equal-width bins.

    The bins span the range from the series minimum to its maximum. Because
    ``pd.cut`` builds intervals that are open on the left, the lowest edge is
    moved 1e-7 below the minimum so the smallest value still lands in the
    first bin.

    Returns the result of ``pd.cut``: one interval per input value.
    """
    lowest_edge = series.min() - 1e-7
    highest_edge = series.max()
    edges = np.linspace(lowest_edge, highest_edge, n_bins + 1)
    return pd.cut(series, bins=edges)

In [9]:
def plot_count_boxplot_per_bin(
    bikes: pd.DataFrame, col: str, n_bins: int = 10, save: bool = True, plot: bool = False
) -> plt.Figure:
    """Draw one boxplot of ``count`` for each equal-width bin of ``col``.

    ``col`` is split into ``n_bins`` bins with ``create_bins``. Bins that
    contain no rows are not drawn. A dashed horizontal line marks the median
    of ``count`` over the whole frame.

    Parameters
    ----------
    bikes : pd.DataFrame
        Frame holding a ``count`` column and the numeric column ``col``.
    col : str
        Numeric column to bin.
    n_bins : int, default 10
        Number of equal-width bins.
    save : bool, default True
        Write the figure to ``eda_plots/count_boxplot_per_{col}_bin.png``.
        The ``eda_plots`` directory is created when it does not exist.
    plot : bool, default False
        Keep the figure open so the notebook can display it. When False the
        figure is closed before returning.

    Returns
    -------
    plt.Figure
        The figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))

    overall_median = bikes["count"].quantile(0.5)

    # Drop the bins that hold no rows so that no empty box is drawn.
    binned = create_bins(bikes[col], n_bins=n_bins).cat.remove_unused_categories()
    intervals = list(binned.cat.categories)

    # Group on each bin's ordinal position rather than on its truncated left
    # edge: truncating to int merges distinct bins whenever they are narrower
    # than one unit (or straddle zero), which misaligns boxes and labels.
    data = pd.DataFrame({"bin_position": binned.cat.codes, "count": bikes["count"]})
    # Code -1 marks rows whose value in `col` is missing; they belong to no bin.
    data = data[data["bin_position"] >= 0]

    sns.boxplot(x="bin_position", y="count", data=data, ax=ax)
    ax.axhline(overall_median, color="k", linestyle="--")

    # Prefer compact integer edges for the tick labels; fall back to three
    # significant digits when integer edges would not tell the bins apart.
    int_lefts = [int(interval.left) for interval in intervals]
    int_rights = [int(interval.right) for interval in intervals]
    edges_distinct = (
        len(set(int_lefts)) == len(intervals)
        and len(set(int_rights)) == len(intervals)
    )
    if edges_distinct:
        xlabels = [f"{left}-{right}" for left, right in zip(int_lefts, int_rights)]
    else:
        xlabels = [f"{interval.left:.3g}-{interval.right:.3g}" for interval in intervals]

    # Fix the tick positions before assigning labels so each label is tied to
    # exactly one box.
    ax.set_xticks(range(len(xlabels)))
    ax.set_xticklabels(xlabels, fontsize=20)
    ax.set_xlabel(f"{col} bins", fontsize=20)
    ax.yaxis.label.set_size(20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        # savefig does not create missing folders; make sure the target exists.
        Path("eda_plots").mkdir(parents=True, exist_ok=True)
        fig.savefig(f"eda_plots/count_boxplot_per_{col}_bin.png")

    if not plot:
        # Close this figure specifically, not whichever one is "current".
        plt.close(fig)

    return fig

In [10]:
# Save a frequency chart for each categorical column.
for categorical_col in ("season", "holiday", "workingday", "weather"):
    plot_category_frequency(bikes, categorical_col)

In [11]:
# Save a histogram for each numeric column, including the target "count".
for numeric_col in ("temp", "atemp", "humidity", "windspeed", "count"):
    plot_histogram(bikes, numeric_col)

In [12]:
# Save a boxplot of "count" split by each categorical column.
for categorical_col in ("season", "holiday", "workingday", "weather"):
    plot_count_boxplot_per_category(bikes, categorical_col)

In [13]:
# Save a boxplot of "count" per bin of each continuous column (10 bins each).
for continuous_col in ("temp", "atemp", "humidity", "windspeed"):
    plot_count_boxplot_per_bin(bikes, continuous_col, n_bins=10)