### Data information

- hour.csv : bike sharing counts aggregated on hourly basis. Records: 17379 hours
- day.csv - bike sharing counts aggregated on daily basis. Records: 731 days

### Dataset Characteristics

- instant: record index
- dteday : date
- season : 
    - 1: spring, 
    - 2: summer, 
    - 3: fall, 
    - 4: winter
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot  as plt

from sklearn.preprocessing import OneHotEncoder

sns.set_theme(style = "whitegrid")

In [2]:
bikes = pd.read_csv("data/hour.csv")
bikes["dteday"] = pd.to_datetime(bikes["dteday"])
bikes.set_index('dteday', inplace=True)
bikes.drop(columns=["instant","casual","registered","atemp"], inplace=True)

In [3]:
# Count Nans
print(bikes.shape)
print(bikes.isnull().sum(axis=0))

(17379, 12)
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
hum           0
windspeed     0
cnt           0
dtype: int64


In [4]:
# Lagging
bikes["cnt_lag_1"] = bikes["cnt"].shift(-1)
bikes["cnt_lag_2"] = bikes["cnt"].shift(-2)
bikes = bikes.dropna()

In [5]:
# One-Hot Encoder for categorical columns
categorical_columns = ["season", "weathersit", "holiday", "workingday", "mnth", "weekday"]

for col in categorical_columns:
    ohe = OneHotEncoder(sparse=False)
    new_cols = ohe.fit_transform(bikes[[col]])
    bikes[[f"{col}={c}" for c in ohe.categories_[0]]] = new_cols

# Drop original categorical columns
bikes = bikes.drop(columns=categorical_columns)

In [6]:
bikes

Unnamed: 0_level_0,yr,hr,temp,hum,windspeed,cnt,cnt_lag_1,cnt_lag_2,season=1,season=2,...,mnth=10,mnth=11,mnth=12,weekday=0,weekday=1,weekday=2,weekday=3,weekday=4,weekday=5,weekday=6
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01,0,0,0.24,0.81,0.0000,16,40.0,32.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0,1,0.22,0.80,0.0000,40,32.0,13.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0,2,0.22,0.80,0.0000,32,13.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0,3,0.24,0.75,0.0000,13,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0,4,0.24,0.75,0.0000,1,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31,1,17,0.26,0.48,0.0896,164,122.0,119.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1,18,0.26,0.48,0.1343,122,119.0,89.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1,19,0.26,0.60,0.1642,119,89.0,90.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1,20,0.26,0.60,0.1642,89,90.0,61.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Normalização de valores (min max)
normalized_bikes = (bikes - bikes.min()) / (bikes.max() - bikes.min())

# Box plot para análise de outliers (em variáveis não categóricas)
def plot_boxplot(bikes: pd.DataFrame, save=True, plot=False):
    fig, ax = plt.subplots(figsize=(18, 8))
    
    ax.set_title('Boxplot dos dados não categóricos normalizados', weight='bold', fontsize=25)
    data = pd.melt(bikes)
    data.rename(columns={"variable": "atributos", "value": "valores"}, inplace=True)
    sns.boxplot(x="atributos", y="valores",data=data)

    ax.xaxis.label.set_size(20)
    ax.yaxis.label.set_size(20)
    ax.xaxis.set_tick_params(labelsize=20)
    ax.yaxis.set_tick_params(labelsize=20)
    
    if save:
        fig.savefig(f"preproc/pre_outlier_boxplot.png")
    
    if not plot:
        plt.close()

    return fig


In [8]:
normalized_bikes

Unnamed: 0_level_0,yr,hr,temp,hum,windspeed,cnt,cnt_lag_1,cnt_lag_2,season=1,season=2,...,mnth=10,mnth=11,mnth=12,weekday=0,weekday=1,weekday=2,weekday=3,weekday=4,weekday=5,weekday=6
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01,0.0,0.000000,0.224490,0.81,0.000000,0.015369,0.039959,0.031762,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0.0,0.043478,0.204082,0.80,0.000000,0.039959,0.031762,0.012295,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0.0,0.086957,0.204082,0.80,0.000000,0.031762,0.012295,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0.0,0.130435,0.224490,0.75,0.000000,0.012295,0.000000,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2011-01-01,0.0,0.173913,0.224490,0.75,0.000000,0.000000,0.000000,0.001025,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31,1.0,0.739130,0.244898,0.48,0.105325,0.167008,0.123975,0.120902,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1.0,0.782609,0.244898,0.48,0.157870,0.123975,0.120902,0.090164,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1.0,0.826087,0.244898,0.60,0.193018,0.120902,0.090164,0.091189,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2012-12-31,1.0,0.869565,0.244898,0.60,0.193018,0.090164,0.091189,0.061475,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
training_data = bikes[:"2012-11-01"]                # Training
testing_data = bikes["2012-11-01":]                 # Evaluation

In [10]:
training_data.to_csv("data/training.csv")
testing_data.to_csv("data/testing.csv")