## **Rede Neural - Regressão** <br> COC361 - Inteligência Computacional (2021.2)
### Alunos: <br> Henrique Chaves (DRE 119025571) <br> Pedro Boechat (DRE 119065050)
<hr>

### • Importação das bibliotecas

In [1]:
# Bibliotecas padrão
from os import (
    listdir,
    makedirs
)
import pickle
from typing import Tuple

# Bibliotecas do Jupyter
from IPython.display import display

# Bibliotecas para manipulação dos dados
import kaggle
import numpy as np
import pandas as pd

# SKLearn
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Tensorflow/Keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from scikeras.wrappers import KerasRegressor
from tensorflow.config import list_physical_devices
from keras.engine.sequential import Sequential as TypeSequential

# Bibliotecas para plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Carregamento das variáveis de ambiente
from dotenv import load_dotenv
load_dotenv()

# Número de GPUs disponíveis para o Tensorflow/Keras
print("Número de GPUs disponíveis: ", len(list_physical_devices('GPU')))

Número de GPUs disponíveis:  1


### • Download do dataset ([Link](https://www.kaggle.com/contactprad/bike-share-daily-data?select=bike_sharing_daily.csv))

In [2]:
# Cria pasta de destino, caso não exista
makedirs("./data/regression", exist_ok=True)

# Se a pasta de destino estiver vazia, baixa os dados
if len(listdir("./data/regression/")) == 0:
    kaggle.api.dataset_download_file(
        "contactprad/bike-share-daily-data",
        "bike_sharing_daily.csv",
        "./data/regression/"
    )

### • Carregamento do dataset

In [3]:
# Carregamento dos dados
df = pd.read_csv("./data/regression/bike_sharing_daily.csv")

### • Estudo do dataset
```
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered
```

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [5]:
df.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [6]:
display(df.head())
display(df.tail())

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.59,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.2424,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
730,731,2012-12-31,1,1,12,0,1,1,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


### • Análise Exploratória dos Dados

### • Limpeza dos dados

#### 1. Remover coluna `instant`

In [7]:
# Remove coluna `instant` se for igual ao índice do dataset
if np.all(df.index == df["instant"] - 1):
    df = df.drop("instant", axis=1)

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 15)


Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
24,2011-01-25,1,0,1,0,2,1,2,0.223478,0.234526,0.616957,0.129796,186,1799,1985
262,2011-09-20,3,0,9,0,2,1,2,0.561667,0.532217,0.88125,0.134954,438,3203,3641
359,2011-12-26,1,0,12,1,1,0,1,0.321739,0.315535,0.506957,0.239465,430,887,1317
73,2011-03-15,1,0,3,0,2,1,2,0.317391,0.318178,0.655652,0.184309,289,1767,2056
363,2011-12-30,1,0,12,0,5,1,1,0.311667,0.318812,0.636667,0.134337,491,2508,2999


#### 2. Converter coluna `season` para variáveis dummies

In [8]:
dummies_season = pd.get_dummies(df["season"], drop_first=True)
dummies_season = dummies_season.rename(
    columns={
        2: "is_summer",
        3: "is_fall",
        4: "is_winter"
    }
)

print("dummies_season shape:", dummies_season.shape)
dummies_season.sample(5)

dummies_season shape: (731, 3)


Unnamed: 0,is_summer,is_fall,is_winter
605,0,1,0
717,0,0,1
181,0,1,0
131,1,0,0
512,1,0,0


In [9]:
df = df.drop("season", axis=1)
df = pd.concat([df, dummies_season], axis=1)
print("df shape:", df.shape)
df.sample(5)

df shape: (731, 17)


Unnamed: 0,dteday,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter
78,2011-03-20,0,3,0,0,0,1,0.3325,0.32575,0.47375,0.207721,1047,1424,2471,0,0,0
363,2011-12-30,0,12,0,5,1,1,0.311667,0.318812,0.636667,0.134337,491,2508,2999,0,0,0
704,2012-12-05,1,12,0,3,1,1,0.438333,0.428012,0.485,0.324021,331,5398,5729,0,0,1
196,2011-07-16,0,7,0,6,0,1,0.686667,0.638263,0.585,0.208342,2418,3505,5923,0,1,0
437,2012-03-13,1,3,0,2,1,1,0.565,0.542929,0.6175,0.23695,762,5085,5847,0,0,0


#### 3. Converter colunas `mnth` e `weekday` para variáveis cíclicas usando `sin` e `cos`

In [10]:
df["mnth_cos"] = np.cos(df["mnth"]*np.pi/6)
df["mnth_sin"] = np.cos(df["mnth"]*np.pi/6)
df["weekday_cos"] = np.cos((df["weekday"]+1)*2*np.pi/7)
df["weekday_sin"] = np.cos((df["weekday"]+1)*2*np.pi/7)

df = df.drop(["mnth", "weekday"], axis=1)
print("df shape:", df.shape)
df.sample(5)

df shape: (731, 19)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
40,2011-02-10,0,0,1,1,0.144348,0.149548,0.437391,0.221935,47,1491,1538,0,0,0,0.5,0.5,-0.222521,-0.222521
546,2012-06-30,1,0,0,1,0.765,0.687508,0.60125,0.161071,1455,4232,5687,0,1,0,-1.0,-1.0,1.0,1.0
609,2012-09-01,1,0,0,2,0.753333,0.702654,0.638333,0.113187,2352,3788,6140,0,1,0,-1.83697e-16,-1.83697e-16,1.0,1.0
418,2012-02-23,1,0,1,1,0.454167,0.444446,0.554583,0.190913,516,4546,5062,0,0,0,0.5,0.5,-0.222521,-0.222521
125,2011-05-06,0,0,1,1,0.479167,0.474117,0.59,0.228246,894,3714,4608,1,0,0,-0.8660254,-0.8660254,0.62349,0.62349


#### 4. Desnormalizar colunas `temp`, `atemp`, `hum` e `windspeed`

In [11]:
df["temp"] *= 41
df["atemp"] *= 50
df["hum"] *= 100
df["windspeed"] *= 67

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 19)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
407,2012-02-12,1,0,0,1,5.2275,5.0829,46.4583,27.417204,73,1456,1529,0,0,0,0.5,0.5,0.62349,0.62349
186,2011-07-06,0,0,1,1,29.52,34.28165,74.3333,10.042161,784,3845,4629,0,1,0,-0.8660254,-0.8660254,-0.900969,-0.900969
80,2011-03-22,0,0,1,1,18.108347,22.0321,62.4583,15.12525,460,2243,2703,1,0,0,6.123234000000001e-17,6.123234000000001e-17,-0.900969,-0.900969
14,2011-01-15,0,0,0,2,9.566653,12.4056,49.875,10.583521,222,1026,1248,0,0,0,0.8660254,0.8660254,1.0,1.0
540,2012-06-24,1,0,0,1,30.476653,33.7127,47.9167,9.750175,2551,4340,6891,0,1,0,-1.0,-1.0,0.62349,0.62349


#### 5. Remover colunas `casual` e `registered` pois a soma delas é igual a `cnt` (variável alvo)

In [12]:
if np.all(df["casual"] + df["registered"] == df["cnt"]):
    df = df.drop(["casual", "registered"], axis=1)

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 17)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
259,2011-09-17,0,0,0,2,20.158347,23.9256,71.8333,12.708225,4511,0,1,0,-1.83697e-16,-1.83697e-16,1.0,1.0
562,2012-07-16,1,0,1,1,31.296653,36.20625,64.5,11.000529,6830,0,1,0,-0.8660254,-0.8660254,-0.222521,-0.222521
647,2012-10-09,1,0,1,2,18.313347,21.9056,76.1667,12.7501,6392,0,0,1,0.5,0.5,-0.900969,-0.900969
324,2011-11-21,0,0,1,3,18.3475,22.2531,91.0,9.249618,2765,0,0,1,0.8660254,0.8660254,-0.222521,-0.222521
623,2012-09-15,1,0,0,1,24.941653,29.29335,50.1667,16.583907,8714,0,1,0,-1.83697e-16,-1.83697e-16,1.0,1.0


### • Salvar dataset limpo

In [13]:
df.to_csv("./data/regression/df_clean.csv", index=False)

### • Definindo `features` e  `targets`

In [14]:
features = ["yr", "holiday", "workingday", "weathersit",
            "temp", "atemp", "hum", "windspeed",
            "is_summer", "is_fall", "is_winter",
            "mnth_cos", "mnth_sin", "weekday_cos", "weekday_sin"]

targets = ["cnt"]

In [15]:
df_X = df[features]

print("df_X shape:", df_X.shape)
df_X.sample(5)

df_X shape: (731, 15)


Unnamed: 0,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
171,0,0,1,2,27.914153,31.8823,77.0417,11.458675,0,1,0,-1.0,-1.0,-0.900969,-0.900969
426,1,0,1,2,14.486653,17.9921,65.7083,9.708568,0,0,0,6.123234000000001e-17,6.123234000000001e-17,0.62349,0.62349
323,0,0,0,2,18.996653,22.8529,68.4583,12.45865,0,0,1,0.8660254,0.8660254,0.62349,0.62349
15,0,0,0,1,9.498347,11.71085,48.375,12.625011,0,0,0,0.8660254,0.8660254,0.62349,0.62349
605,1,0,1,1,29.861653,33.3025,62.0,12.791975,0,1,0,-0.5,-0.5,-0.900969,-0.900969


In [16]:
df_y = df[targets]

print("df_y shape:", df_y.shape)
df_y.sample(5)

df_y shape: (731, 1)


Unnamed: 0,cnt
332,2914
685,5698
377,3214
125,4608
74,2192


### • Normalização dos dados

In [17]:
# Definição do scaler
scaler = MinMaxScaler

# Instância do scaler para X e Y
X_scaler = scaler()
y_scaler = scaler()

In [18]:
X = X_scaler.fit_transform(df_X)
y = y_scaler.fit_transform(df_y)


print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (731, 15)
y shape: (731, 1)


### • Definição de callbacks da rede

In [19]:
def create_model(
    n_hidden_layers: int,
    n_neurons: int,
    dropout_rate: float,
    dropout_last_layer: bool,
    learning_rate: float = 0.001,
    input_shape: Tuple[int, ] = (X.shape[1], )
) -> TypeSequential:
    """Função que retorna o modelo compilado a partir dos parâmetros.
    Args:
        n_layers (int): Número de camadas da rede. 2 por padrão.
        n_neurons (int): Número de neurônios da rede. 32 por padrão.
        dropout_rate (float): Taxa de dropout. 0.2 por padrão.
        dropout_last_layer (bool): Se terá dropout na última camada.
        False por padrão.
        learning_rate (float): Learning rate do modelo. 0.001 por padrão.
        input_shape (List[int]): Forma da entrada. [99] por padrão.
    """
    # Criação do modelo sequencial
    model = Sequential()

    # Número de variáveis de entrada
    model.add(
            Dense(
                n_neurons,
                activation='relu',
                input_shape=input_shape
            )
        )

    for i in range(n_hidden_layers):
        # Camada de adensamento com ativação RELU
        model.add(
            Dense(
                n_neurons,
                activation='relu'
            )
        )

        # Camada de dropout
        if dropout_rate > 0.0:
            if (i == n_hidden_layers - 1) and (not dropout_last_layer):
                continue
            model.add(
                Dropout(
                    dropout_rate
                )
            )

    # Camada de adensamento com ativação LINEAR
    model.add(Dense(1, activation='linear'))

    # Otimizador Adam
    # optimizer = Adam(learning_rate=learning_rate)

    # Compilação do modelo
    model.compile(
        optimizer="adam",
        loss='mse',
        metrics=['mae']
    )

    return model

In [20]:
# Reduz a learning rate caso o modelo esteja estagnado
lr_reduce = ReduceLROnPlateau(
    min_delta=1e-5,
    patience=5,
    verbose=1
)

# Lista contendo os checkpoints definidos
callbacks = [
    lr_reduce
]

### • Definição das camadas da rede

In [21]:
# Criação do regressor com wrapper do SKLearn
regressor = KerasRegressor(
    model=create_model,
    n_hidden_layers=1,
    n_neurons=32,
    dropout_rate=0.0,
    dropout_last_layer=False,
    batch_size=32,
    epochs=100,
)

In [22]:
# Parâmetros para o Grid Search
param_grid = {
    "n_hidden_layers": [1, 2, 3],
    "n_neurons": [32, 64, 128],
    "dropout_rate": [0.0, 0.2],
    "dropout_last_layer": [False, True]
}

In [23]:
# Instância do Grid Search
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=1
)

### • Treino da rede

In [25]:
# Treino do modelo
grid_result = grid_search.fit(
    X, y,
    callbacks=callbacks,
    verbose=0
)







In [26]:
with open("./data/regression/grid_result.pkl", "wb") as f:
    pickle.dump(grid_result, f)

INFO:tensorflow:Assets written to: C:\Users\pedro\AppData\Local\Temp\tmpyl5ip_t5\assets


In [None]:
# history = grid_result.best_estimator_.model.model.history.history

### • Avaliação da rede

In [None]:
# Definição dos subplots
fig, ax = plt.subplots(figsize=(15, 16), nrows=2)

# Gráfico do MAE do modelo por época
ax[0].plot(history['mse'])
ax[0].plot(history['val_mae'])
ax[0].set_title('MAE do modelo por época', fontsize=18)
ax[0].set_ylabel('MAE', fontsize=14)
ax[0].set_xlabel('Época', fontsize=14)
ax[0].legend(['Treino', 'Validação'], loc='upper left', fontsize=16)

# Gráfico da loss do modelo por época
ax[1].plot(history['loss'])
ax[1].plot(history['val_loss'])
ax[1].set_title('Loss (MSE) do modelo por época', fontsize=18)
ax[1].set_ylabel('Loss (MSE)', fontsize=14)
ax[1].set_xlabel('Época', fontsize=14)
ax[1].legend(['Treino', 'Validação'], loc='upper left', fontsize=16)

# Ajuste do layout do plot
plt.tight_layout()