In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's "whitegrid" theme for every figure created in this notebook.
sns.set_theme(style="whitegrid")

* datetime - hourly date + timestamp 
* season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday
* weather - weather conditions, coded as:
  * 1: Clear, Few clouds, Partly cloudy
  * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
  * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals

In [2]:
# Lookup tables translating the dataset's integer codes into the Portuguese
# labels shown on the plots.
INDEX_TO_SEASON = {
    1: 'Primavera',
    2: 'Verão',
    3: 'Outono',
    4: 'Inverno'
}
INDEX_TO_WEATHER = {
    1: "Limpo/Parcialmente Nublado",
    2: "Neblina",
    3: "Neve/Chuvoso",
    4: "Chuva Forte/Neve + Névoa",
}

# Legacy names kept for existing callers. Despite what the names suggest, they
# also map code -> label. They are derived from the tables above (as independent
# copies) instead of repeating the literals, so the two can no longer drift apart.
SEASON_TO_INDEX = dict(INDEX_TO_SEASON)
WEATHER_TO_INDEX = dict(INDEX_TO_WEATHER)

In [3]:
# Load the training set, then cast the "datetime" column to a datetime dtype.
bikes = pd.read_csv("data/train.csv")
bikes = bikes.astype({"datetime": "datetime64[ns]"})

In [4]:
# Replace the numeric codes with the Portuguese labels used on the plot axes.
#
# Each column is converted only while it is still numeric, which makes the cell
# safe to re-run: previously a second execution raised KeyError for
# season/weather (the values were already labels) and silently turned every
# holiday/workingday value into "Não" (because "Sim" != 1).
for _column, _labels in (("season", SEASON_TO_INDEX), ("weather", WEATHER_TO_INDEX)):
    if pd.api.types.is_numeric_dtype(bikes[_column]):
        # __getitem__ keeps the strict behaviour: an unknown code raises KeyError.
        bikes[_column] = bikes[_column].map(_labels.__getitem__)

for _column in ("holiday", "workingday"):
    if pd.api.types.is_numeric_dtype(bikes[_column]):
        # Same rule as before: exactly 1 means "Sim", anything else means "Não".
        bikes[_column] = bikes[_column].eq(1).map({True: "Sim", False: "Não"})

In [5]:
def plot_count_boxplot_per_category(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw one boxplot of ``count`` for each category of ``col``.

    A horizontal dash-dot line marks the median of ``count`` over the whole
    frame so every category can be compared against it.

    Args:
        bikes: Frame containing ``col`` and a ``count`` column.
        col: Name of the categorical column used on the x axis.
        save: When True, write the figure to
            ``eda_plots/count_boxplot_per_{col}.png`` (the folder is created
            if it does not exist).
        plot: When False, the figure is closed before returning so it is not
            kept open by pyplot.

    Returns:
        The matplotlib figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))

    median_count = bikes["count"].quantile(.5)
    median_line = ax.axhline(median_count, color="k", linestyle="-.", lw=4)

    sns.boxplot(x=col, y="count", data=bikes, ax=ax)
    # Bind the legend entry to the median line explicitly instead of relying on
    # the order in which artists were added to the axes.
    ax.legend(handles=[median_line], labels=["Mediana da Contagem"], fontsize=20)

    fig.suptitle(f"Boxplot por categorias da coluna {col}", weight='bold', fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.set_ylabel('Contagem')
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        out_path = Path(f"eda_plots/count_boxplot_per_{col}.png")
        # savefig does not create missing folders, so make sure it exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path)

    if not plot:
        # Close this specific figure; a bare plt.close() closes whichever
        # figure pyplot currently considers "current".
        plt.close(fig)

    return fig


In [6]:
def plot_category_frequency(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw a bar chart with the number of rows in each category of ``col``.

    Args:
        bikes: Frame containing ``col``.
        col: Name of the categorical column to count.
        save: When True, write the figure to ``eda_plots/frequency_{col}.png``
            (the folder is created if it does not exist).
        plot: When False, the figure is closed before returning so it is not
            kept open by pyplot.

    Returns:
        The matplotlib figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    # Draw on the axes created above rather than on pyplot's implicit
    # "current" axes.
    sns.countplot(x=col, data=bikes, ax=ax)
    ax.set_xlabel(col)
    ax.set_title(f"Frequência de {col}", weight='bold', fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.set_ylabel('Contagem')
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        out_path = Path(f"eda_plots/frequency_{col}.png")
        # savefig does not create missing folders, so make sure it exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path)
    if not plot:
        # Close this specific figure, not whichever one is "current".
        plt.close(fig)
    return fig

In [7]:
def plot_histogram(bikes: pd.DataFrame, col: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw a histogram of ``col`` with a KDE curve, its median and its mean.

    The median is drawn as a red dash-dot vertical line and the mean as a
    black one.

    Args:
        bikes: Frame containing ``col``.
        col: Name of the numeric column to plot.
        save: When True, write the figure to ``eda_plots/histogram_{col}.png``
            (the folder is created if it does not exist).
        plot: When False, the figure is closed before returning so it is not
            kept open by pyplot.

    Returns:
        The matplotlib figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    sns.histplot(bikes[col], ax=ax, kde=True)
    # Any line on the axes at this point was added by histplot, i.e. the KDE
    # curve. Capture it now so the legend can reference it explicitly.
    kde_lines = list(ax.lines)

    ax.set_xlabel(col)
    ax.set_ylabel("Frequência")
    ax.set_title(f"Histograma de {col}", weight='bold', fontsize=20)

    median_value = bikes[col].quantile(.5)
    mean_value = bikes[col].mean()

    median_line = ax.axvline(median_value, color="r", linestyle="-.", lw=4)
    mean_line = ax.axvline(mean_value, color="k", linestyle="-.", lw=4)

    # Pair every label with its artist instead of relying on artist order, so
    # "mediana"/"média" stay correct even if no KDE curve was drawn.
    handles = [median_line, mean_line]
    labels = ["mediana", "média"]
    if kde_lines:
        handles.insert(0, kde_lines[0])
        labels.insert(0, "EDK")
    ax.legend(handles=handles, labels=labels, fontsize=20)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        out_path = Path(f"eda_plots/histogram_{col}.png")
        # savefig does not create missing folders, so make sure it exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path)
    if not plot:
        # Close this specific figure, not whichever one is "current".
        plt.close(fig)
    return fig

In [8]:
def create_bins(series: pd.Series, n_bins = 10) -> pd.Series:
    """Assign every value of ``series`` to one of ``n_bins`` equal-width intervals.

    The edges run from just below the minimum to the maximum of the series.
    ``pd.cut`` intervals are open on the left, so the lowest edge is shifted
    down by 1e-7 to keep the minimum value inside the first interval.

    Args:
        series: Numeric values to partition.
        n_bins: Number of intervals to create.

    Returns:
        The result of ``pd.cut``: for each input value, the interval it falls in.
    """
    lowest_edge = series.min() - 1e-7
    highest_edge = series.max()
    edges = np.linspace(lowest_edge, highest_edge, n_bins + 1)
    return pd.cut(series, bins=edges)

In [9]:
def plot_count_boxplot_per_bin(
    bikes: pd.DataFrame, col: str, n_bins: int = 10, save: bool = True, plot: bool = False
) -> plt.Figure:
    """Draw one boxplot of ``count`` per equal-width bin of the numeric ``col``.

    ``col`` is partitioned with ``create_bins``; only bins that contain at
    least one row get a box. Tick labels show each bin as ``left-right`` with
    both edges truncated to integers. A dashed horizontal line marks the
    median of ``count`` over the whole frame.

    Args:
        bikes: Frame containing ``col`` and a ``count`` column.
        col: Name of the numeric column to partition.
        n_bins: Number of equal-width bins.
        save: When True, write the figure to
            ``eda_plots/count_boxplot_per_{col}_bin.png`` (the folder is
            created if it does not exist).
        plot: When False, the figure is closed before returning so it is not
            kept open by pyplot.

    Returns:
        The matplotlib figure that was drawn.
    """
    fig, ax = plt.subplots(figsize=(18, 8))
    ax.set_title(f"Boxplot do atributo count por partições de {col}", weight='bold', fontsize=20)

    median_count = bikes["count"].quantile(0.5)

    # Keep only the bins that actually occur, in ascending order.
    bins = create_bins(bikes[col], n_bins=n_bins).cat.remove_unused_categories()
    # Build each label from its own interval. Truncating and de-duplicating the
    # left and right edges independently (as was done before) can merge narrow
    # bins or pair a left edge with the wrong right edge.
    xlabels = [f"{int(iv.left)}-{int(iv.right)}" for iv in bins.cat.categories]

    # Group by the bin's position rather than by its truncated left edge; a
    # code of -1 marks rows whose value fell in no bin (e.g. missing values).
    data = pd.DataFrame({"bin": bins.cat.codes.astype(int), "count": bikes["count"]})
    data = data[data["bin"] >= 0]

    sns.boxplot(x="bin", y="count", data=data, ax=ax)
    ax.axhline(median_count, color="k", linestyle="--")

    ax.set_xticklabels(xlabels, fontsize=20)
    ax.set_xlabel(f"Partições de {col}", fontsize=20)
    ax.yaxis.label.set_size(20)
    ax.set_ylabel('Contagem')
    ax.yaxis.set_tick_params(labelsize=20)

    if save:
        out_path = Path(f"eda_plots/count_boxplot_per_{col}_bin.png")
        # savefig does not create missing folders, so make sure it exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path)

    if not plot:
        # Close this specific figure, not whichever one is "current".
        plt.close(fig)

    return fig

In [68]:
# Save one frequency bar chart per categorical column. The trailing [-1] keeps
# the last figure as the cell's value, so the notebook still renders it.
[
    plot_category_frequency(bikes, name)
    for name in ("season", "holiday", "workingday", "weather")
][-1]

In [69]:
# Save one histogram per numeric column. The trailing [-1] keeps the last
# figure as the cell's value, so the notebook still renders it.
[
    plot_histogram(bikes, name)
    for name in ("temp", "atemp", "humidity", "windspeed", "count")
][-1]

In [70]:
# Save one boxplot of `count` per categorical column. The trailing [-1] keeps
# the last figure as the cell's value, so the notebook still renders it.
[
    plot_count_boxplot_per_category(bikes, name)
    for name in ("season", "holiday", "workingday", "weather")
][-1]

In [71]:
# Save one binned boxplot of `count` per numeric column (10 bins each). The
# trailing [-1] keeps the last figure as the cell's value, so the notebook
# still renders it.
[
    plot_count_boxplot_per_bin(bikes, name, n_bins=10)
    for name in ("temp", "atemp", "humidity", "windspeed")
][-1]

In [10]:
def plot_multivariable_scatterplot(bikes: pd.DataFrame, hue: str, save: bool = True, plot: bool = False) -> plt.Figure:
    """Draw a scatter-plot matrix of the numeric columns, coloured by ``hue``.

    Args:
        bikes: Frame containing ``hue`` and the columns ``temp``, ``atemp``,
            ``humidity``, ``windspeed``, ``casual``, ``registered`` and
            ``count``.
        hue: Name of the column used to colour the points.
        save: When True, write the figure to
            ``eda_plots/matrix_scatterplot_{hue}.png`` (the folder is created
            if it does not exist).
        plot: When False, the figure is closed before returning so it is not
            kept open by pyplot.

    Returns:
        The matplotlib figure that holds the pair plot.
    """
    # Plot the frame that was passed in (the previous version read a global
    # `df`, which fails on a fresh kernel).
    grid = sns.pairplot(
        bikes,
        hue=hue,
        vars=['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
    )
    # pairplot builds its own figure; title, save and return that one instead
    # of a separate, empty plt.subplots() figure.
    fig = grid.figure
    # y > 1 places the title above the top row of panels.
    fig.suptitle(f"Matriz Scatter Plot com separação por '{hue}'", weight='bold', fontsize=20, y=1.02)

    if save:
        out_path = Path(f"eda_plots/matrix_scatterplot_{hue}.png")
        # savefig does not create missing folders, so make sure it exists first.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # "tight" keeps the title (drawn above the panels) inside the image.
        fig.savefig(out_path, bbox_inches="tight")
    if not plot:
        plt.close(fig)
    return fig

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,Primavera,Não,Não,Limpo/Parcialmente Nublado,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,Primavera,Não,Não,Limpo/Parcialmente Nublado,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,Primavera,Não,Não,Limpo/Parcialmente Nublado,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,Primavera,Não,Não,Limpo/Parcialmente Nublado,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,Primavera,Não,Não,Limpo/Parcialmente Nublado,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,Inverno,Não,Sim,Limpo/Parcialmente Nublado,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,Inverno,Não,Sim,Limpo/Parcialmente Nublado,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,Inverno,Não,Sim,Limpo/Parcialmente Nublado,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,Inverno,Não,Sim,Limpo/Parcialmente Nublado,13.94,17.425,61,6.0032,12,117,129


In [None]:
# Re-read the CSV so `df` holds the columns exactly as stored in the file (the
# label mapping applied to `bikes` is not applied here).
df = pd.read_csv("data/train.csv").astype({"datetime": "datetime64[ns]"})

# One scatter-plot matrix per hue column, all over the same numeric columns.
# The trailing [-1] keeps the last grid as the cell's value, as before.
pairplot_columns = ['temp','atemp','humidity','windspeed','casual','registered','count']
[
    sns.pairplot(df, hue=hue_column, vars=pairplot_columns)
    for hue_column in ("holiday", "workingday", "weather")
][-1]