## **Rede Neural - Regressão** <br> COC361 - Inteligência Computacional (2021.2)
### Alunos: <br> Henrique Chaves (DRE 119025571) <br> Pedro Boechat (DRE 119065050)
<hr>

### • Importação das bibliotecas

In [1]:
# Bibliotecas padrão
from os import (
    listdir,
    makedirs
)
from typing import List

# Bibliotecas do Jupyter
from IPython.display import display

# Bibliotecas para manipulação dos dados
import kaggle
import numpy as np
import pandas as pd

# SKLearn
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler

# Tensorflow/Keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    InputLayer
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    ModelCheckpoint,
    ReduceLROnPlateau
)
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.config import list_physical_devices
from keras.engine.sequential import Sequential as TypeSequential

# Bibliotecas para plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Carregamento das variáveis de ambiente
from dotenv import load_dotenv
load_dotenv()

# Número de GPUs disponíveis para o Tensorflow/Keras
print("Número de GPUs disponíveis: ", len(list_physical_devices('GPU')))

Número de GPUs disponíveis:  1


### • Download do dataset ([Link](https://www.kaggle.com/contactprad/bike-share-daily-data?select=bike_sharing_daily.csv))

In [2]:
# Cria pasta de destino, caso não exista
makedirs("./data/regression", exist_ok=True)

# Se a pasta de destino estiver vazia, baixa os dados
if len(listdir("./data/regression/")) == 0:
    kaggle.api.dataset_download_file(
        "contactprad/bike-share-daily-data",
        "bike_sharing_daily.csv",
        "./data/regression/"
    )

### • Carregamento do dataset

In [3]:
# Carregamento dos dados
df = pd.read_csv("./data/regression/bike_sharing_daily.csv")

### • Estudo do dataset
```
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered
```

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [5]:
df.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [6]:
display(df.head())
display(df.tail())

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.59,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.2424,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
730,731,2012-12-31,1,1,12,0,1,1,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


### • Limpeza dos dados

#### 1. Remover coluna `instant`

In [7]:
# Remove coluna `instant` se for igual ao índice do dataset
if np.all(df.index == df["instant"] - 1):
    df = df.drop("instant", axis=1)

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 15)


Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
428,2012-03-04,1,1,3,0,0,0,1,0.325833,0.303021,0.403333,0.334571,710,2713,3423
275,2011-10-03,4,0,10,0,1,1,2,0.384167,0.392046,0.760833,0.083346,330,3240,3570
503,2012-05-18,2,1,5,0,5,1,1,0.564167,0.551121,0.523333,0.136817,1521,6118,7639
729,2012-12-30,1,1,12,0,0,0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
511,2012-05-26,2,1,5,0,6,0,1,0.6925,0.642696,0.7325,0.198992,2855,3681,6536


#### 2. Converter coluna `season` para variáveis dummies

In [8]:
dummies_season = pd.get_dummies(df["season"], drop_first=True)
dummies_season = dummies_season.rename(
    columns={
        2: "is_summer",
        3: "is_fall",
        4: "is_winter"
    }
)

print("dummies_season shape:", dummies_season.shape)
dummies_season.sample(5)

dummies_season shape: (731, 3)


Unnamed: 0,is_summer,is_fall,is_winter
318,0,0,1
287,0,0,1
494,1,0,0
508,1,0,0
34,0,0,0


In [9]:
df = df.drop("season", axis=1)
df = pd.concat([df, dummies_season], axis=1)
print("df shape:", df.shape)
df.sample(5)

df shape: (731, 17)


Unnamed: 0,dteday,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter
423,2012-02-28,1,2,0,2,1,1,0.359167,0.353525,0.395833,0.193417,229,4134,4363,0,0,0
41,2011-02-11,0,2,0,5,1,1,0.189091,0.213509,0.506364,0.10855,149,1597,1746,0,0,0
221,2011-08-10,0,8,0,3,1,1,0.766667,0.684983,0.424167,0.200258,884,3896,4780,0,1,0
467,2012-04-12,1,4,0,4,1,1,0.3975,0.387604,0.46625,0.290429,663,4746,5409,1,0,0
568,2012-07-22,1,7,0,0,0,2,0.6675,0.623125,0.7625,0.093921,2544,4866,7410,0,1,0


#### 3. Converter colunas `mnth` e `weekday` para variáveis cíclicas usando `sin` e `cos`

In [10]:
df["mnth_cos"] = np.cos(df["mnth"]*np.pi/6)
df["mnth_sin"] = np.cos(df["mnth"]*np.pi/6)
df["weekday_cos"] = np.cos((df["weekday"]+1)*2*np.pi/7)
df["weekday_sin"] = np.cos((df["weekday"]+1)*2*np.pi/7)

df = df.drop(["mnth", "weekday"], axis=1)
print("df shape:", df.shape)
df.sample(5)

df shape: (731, 19)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
223,2011-08-12,0,0,1,1,0.708333,0.654042,0.415,0.125621,1051,3854,4905,0,1,0,-0.5,-0.5,0.62349,0.62349
374,2012-01-10,1,0,1,1,0.308696,0.318826,0.646522,0.187552,173,3425,3598,0,0,0,0.8660254,0.8660254,-0.900969,-0.900969
582,2012-08-05,1,0,0,1,0.769167,0.731079,0.6525,0.290421,1707,3757,5464,0,1,0,-0.5,-0.5,0.62349,0.62349
636,2012-09-28,1,0,1,2,0.619167,0.585863,0.69,0.164179,1045,6370,7415,0,0,1,-1.83697e-16,-1.83697e-16,0.62349,0.62349
49,2011-02-19,0,0,0,1,0.399167,0.391404,0.187917,0.507463,532,1103,1635,0,0,0,0.5,0.5,1.0,1.0


#### 4. Desnormalizar colunas `temp`, `atemp`, `hum` e `windspeed`

In [11]:
df["temp"] *= 41
df["atemp"] *= 50
df["hum"] *= 100
df["windspeed"] *= 67

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 19)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
450,2012-03-26,1,0,1,1,18.279153,21.9375,47.7917,25.917007,795,4763,5558,1,0,0,6.123234000000001e-17,6.123234000000001e-17,-0.222521,-0.222521
333,2011-11-30,0,0,1,1,13.325,15.56105,61.3333,18.167586,188,3425,3613,0,0,1,0.8660254,0.8660254,-0.900969,-0.900969
627,2012-09-19,1,0,1,1,22.6525,27.0202,53.6667,14.416725,788,6803,7591,0,1,0,-1.83697e-16,-1.83697e-16,-0.900969,-0.900969
133,2011-05-14,0,0,0,2,21.354153,25.03145,92.25,9.04165,902,2507,3409,1,0,0,-0.8660254,-0.8660254,1.0,1.0
174,2011-06-24,0,0,1,1,29.690847,32.82915,57.3333,14.875675,969,4022,4991,0,1,0,-1.0,-1.0,0.62349,0.62349


#### 5. Remover colunas `casual` e `registered` pois a soma delas é igual a `cnt` (variável alvo)

In [12]:
if np.all(df["casual"] + df["registered"] == df["cnt"]):
    df = df.drop(["casual", "registered"], axis=1)

print("df shape:", df.shape)
df.sample(5)

df shape: (731, 17)


Unnamed: 0,dteday,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
98,2011-04-09,0,0,0,2,14.0425,17.07645,87.75,8.916561,2455,1,0,0,-0.5,-0.5,1.0,1.0
341,2011-12-08,0,0,1,1,10.899153,12.8469,58.0,16.083886,3322,0,0,1,1.0,1.0,-0.222521,-0.222521
505,2012-05-20,1,0,0,1,25.454153,29.19835,53.0417,17.042589,7129,1,0,0,-0.866025,-0.866025,0.62349,0.62349
534,2012-06-18,1,0,1,2,23.301653,27.2421,77.7917,11.707982,5099,1,0,0,-1.0,-1.0,-0.222521,-0.222521
604,2012-08-27,1,0,1,1,28.836653,32.7344,73.0417,8.625111,6917,0,1,0,-0.5,-0.5,-0.222521,-0.222521


### • Salvar dataset limpo

In [13]:
df.to_csv("./data/regression/df_clean.csv", index=False)

### • Definindo `features` e  `targets`

In [14]:
features = ["yr", "holiday", "workingday", "weathersit",
            "temp", "atemp", "hum", "windspeed",
            "is_summer", "is_fall", "is_winter",
            "mnth_cos", "mnth_sin", "weekday_cos", "weekday_sin"]

targets = ["cnt"]

In [15]:
df_X = df[features]

print("df_X shape:", df_X.shape)
df_X.sample(5)

df_X shape: (731, 15)


Unnamed: 0,yr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,is_summer,is_fall,is_winter,mnth_cos,mnth_sin,weekday_cos,weekday_sin
573,1,0,1,1,32.048347,36.71085,59.4583,10.250464,0,1,0,-0.866025,-0.866025,0.62349,0.62349
91,0,0,0,2,12.915,15.78185,65.375,13.208782,1,0,0,-0.5,-0.5,1.0,1.0
32,0,0,1,2,10.66,12.72085,77.5417,17.708636,0,0,0,0.5,0.5,-0.900969,-0.900969
307,0,0,1,2,16.536653,20.1696,62.25,18.209193,0,0,1,0.866025,0.866025,0.62349,0.62349
211,0,0,0,1,33.039153,36.4898,48.0833,11.042471,0,1,0,-0.866025,-0.866025,0.62349,0.62349


In [16]:
df_y = df[targets]

print("df_y shape:", df_y.shape)
df_y.sample(5)

df_y shape: (731, 1)


Unnamed: 0,cnt
54,1807
192,4258
391,3456
53,1917
728,1341


### • Normalização dos dados

In [17]:
# Definição do scaler
scaler = MinMaxScaler

# Instância do scaler para X e Y
X_scaler = scaler()
y_scaler = scaler()

In [18]:
X = X_scaler.fit_transform(df_X)
y = y_scaler.fit_transform(df_y)


print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (731, 15)
y shape: (731, 1)


### • Definição de callbacks da rede

In [19]:
def create_model(
    n_layers: int = 2,
    n_neurons: int = 32,
    dropout_rate: float = 0.2,
    dropout_last_layer: bool = False,
    learning_rate: float = 0.001,
    input_shape: tuple[int, ] = (X.shape[1], )
) -> TypeSequential:
    """Função que retorna o modelo compilado a partir dos parâmetros.
    Args:
        n_layers (int): Número de camadas da rede. 2 por padrão.
        n_neurons (int): Número de neurônios da rede. 32 por padrão.
        dropout_rate (float): Taxa de dropout. 0.2 por padrão.
        dropout_last_layer (bool): Se terá dropout na última camada.
        False por padrão.
        learning_rate (float): Learning rate do modelo. 0.001 por padrão.
        input_shape (List[int]): Forma da entrada. [99] por padrão.
    """
    # Criação do modelo sequencial
    model = Sequential()

    # Número de variáveis de entrada
    model.add(InputLayer(input_shape=input_shape))

    for i in range(n_layers):
        # Camada de adensamento com ativação RELU
        model.add(
            Dense(
                n_neurons,
                activation='relu'
            )
        )

        # Camada de dropout
        if dropout_rate > 0:
            if (i == n_layers - 1) and not dropout_last_layer:
                continue
            model.add(
                Dropout(
                    dropout_rate
                )
            )

    # Camada de adensamento com ativação LINEAR
    model.add(Dense(1, activation='linear'))

    # Otimizador Adam
    optimizer = Adam(learning_rate=learning_rate)

    # Compilação do modelo
    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=['mae']
    )

    return model

In [20]:
# Reduz a learning rate caso o modelo esteja estagnado
lr_reduce = ReduceLROnPlateau(
    monitor="loss",
    factor=0.1,
    min_delta=1e-5,
    patience=5,
    verbose=1
)

# Lista contendo os checkpoints definidos
callbacks = [
    lr_reduce
]

### • Definição das camadas da rede

In [21]:
# Criação do regressor com wrapper do SKLearn
regressor = KerasRegressor(build_fn=create_model)

  regressor = KerasRegressor(build_fn=create_model)


In [22]:
# Parâmetros para o Grid Search
param_grid = {
    "n_layers": [1, 2, 3, 4],
    "n_neurons": [32, 64, 128],
    "dropout_rate": [0.1, 0.2, 0.3],
    "dropout_last_layer": [False, True],
    "learning_rate": np.logspace(-4, -2, 3),
    "batch_size": [16, 32, 64, 128],
}

In [23]:
# Instância do Grid Search
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    n_jobs=4,
    cv=10
)

### • Treino da rede

In [24]:
# Treino do modelo
history = grid_search.fit(
    X, y,
    callbacks=callbacks,
    epochs=100,
    njobs=1,
    verbose=1
)

exception calling callback for <Future at 0x2c45e333310 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py", line 794, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\pedro\AppData\Roaming\Python\Python39\site-packages\jo

KeyboardInterrupt: 

In [None]:
# history = grid_result.best_estimator_.model.model.history.history

### • Avaliação da rede

In [None]:
# Definição dos subplots
fig, ax = plt.subplots(figsize=(15, 16), nrows=2)

# Gráfico do MAE do modelo por época
ax[0].plot(history['mse'])
ax[0].plot(history['val_mae'])
ax[0].set_title('MAE do modelo por época', fontsize=18)
ax[0].set_ylabel('MAE', fontsize=14)
ax[0].set_xlabel('Época', fontsize=14)
ax[0].legend(['Treino', 'Validação'], loc='upper left', fontsize=16)

# Gráfico da loss do modelo por época
ax[1].plot(history['loss'])
ax[1].plot(history['val_loss'])
ax[1].set_title('Loss (MSE) do modelo por época', fontsize=18)
ax[1].set_ylabel('Loss (MSE)', fontsize=14)
ax[1].set_xlabel('Época', fontsize=14)
ax[1].legend(['Treino', 'Validação'], loc='upper left', fontsize=16)

# Ajuste do layout do plot
plt.tight_layout()

### • Predição usando a rede treinada

In [None]:
y_test_pred = np.argmax(model.predict(X_test), axis=1)