In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [77]:
# Lookup tables turning the integer codes of the "season" and "weather"
# columns into Portuguese display labels. Codes are 1-based, hence start=1.
INDEX_TO_SEASON = dict(enumerate(
    ['Primavera', 'Verão', 'Outono', 'Inverno'],
    start=1,
))
INDEX_TO_WEATHER = dict(enumerate(
    [
        "Limpo/Parcialmente Nublado",
        "Neblina",
        "Neve/Chuvoso",
        "Chuva Forte/Neve + Névoa",
    ],
    start=1,
))

In [78]:
# Read the training CSV, index it by timestamp and discard the
# "casual" / "registered" columns.
bikes = (
    pd.read_csv("data/train.csv")
    .astype({"datetime": "datetime64[ns]"})
    .set_index("datetime")
    .drop(columns=["casual", "registered"])
)

# Swap numeric codes for readable labels. An unknown season/weather code
# raises KeyError; flag columns map 1 -> "Sim" and anything else -> "Não".
bikes["season"] = bikes["season"].map(lambda code: INDEX_TO_SEASON[code])
for flag_col in ("holiday", "workingday"):
    bikes[flag_col] = bikes[flag_col].map(lambda flag: "Sim" if flag == 1 else "Não")
bikes["weather"] = bikes["weather"].map(lambda code: INDEX_TO_WEATHER[code])


In [79]:
# Split by month before forward-filling: calling asfreq on the whole frame
# would insert many NaN rows (the last days of each month).
bikes_monthly = {}
groups = []
for month_end, month_rows in bikes.groupby(pd.Grouper(freq='1M')):
    month_key = (month_end.year, month_end.month)

    # Reindex this month onto a regular hourly grid; missing hours become NaN rows.
    hourly = month_rows.asfreq("H")
    bikes_monthly[month_key] = hourly

    # Report how many rows of the month contain at least one NaN.
    missing_rows = hourly.isna().any(axis=1).sum()
    if missing_rows > 0:
        print(f"{month_key}: {missing_rows} NaNs")

    # Fill each gap with the last observed values within the same month.
    groups.append(hourly.ffill())

bikes = pd.concat(groups)

(2011, 1): 25 NaNs
(2011, 2): 10 NaNs
(2011, 3): 10 NaNs
(2011, 4): 1 NaNs
(2011, 9): 3 NaNs
(2011, 10): 1 NaNs
(2012, 1): 3 NaNs
(2012, 2): 1 NaNs
(2012, 3): 1 NaNs
(2012, 4): 2 NaNs
(2012, 11): 1 NaNs


In [80]:
# Cast "count" and "humidity" to integers.
# NOTE(review): presumably they became floats when asfreq introduced NaN rows — confirm.
bikes = bikes.astype(dict.fromkeys(("count", "humidity"), int))

In [81]:
# One-hot encode the categorical columns: each category becomes its own
# "<column>=<category>" indicator column and the original column is dropped.
categorical_columns = ["season", "weather"]

for col in categorical_columns:
    # Keep the encoder's default sparse output and densify it explicitly.
    # The `sparse=False` keyword was renamed to `sparse_output` in newer
    # scikit-learn releases, so passing either name ties the cell to a
    # version range; `.toarray()` works on all of them.
    ohe = OneHotEncoder()
    new_cols = ohe.fit_transform(bikes[[col]]).toarray()
    # ohe.categories_[0] lists the categories of the single encoded column,
    # in the same order as the columns of new_cols.
    bikes[[f"{col}={c}" for c in ohe.categories_[0]]] = new_cols

bikes = bikes.drop(columns=categorical_columns)

In [82]:
# Show the current state of the frame (the notebook renders a cell's last bare expression).
bikes

Unnamed: 0_level_0,holiday,workingday,temp,atemp,humidity,windspeed,count,season=Inverno,season=Outono,season=Primavera,season=Verão,weather=Chuva Forte/Neve + Névoa,weather=Limpo/Parcialmente Nublado,weather=Neblina,weather=Neve/Chuvoso
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,Não,Não,9.84,14.395,81,0.0000,16,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2011-01-01 01:00:00,Não,Não,9.02,13.635,80,0.0000,40,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2011-01-01 02:00:00,Não,Não,9.02,13.635,80,0.0000,32,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2011-01-01 03:00:00,Não,Não,9.84,14.395,75,0.0000,13,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2011-01-01 04:00:00,Não,Não,9.84,14.395,75,0.0000,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,Não,Sim,15.58,19.695,50,26.0027,336,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2012-12-19 20:00:00,Não,Sim,14.76,17.425,57,15.0013,241,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2012-12-19 21:00:00,Não,Sim,13.94,15.910,61,15.0013,168,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2012-12-19 22:00:00,Não,Sim,13.94,17.425,61,6.0032,129,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
