### Importando as bibliotecas e pacotes


In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
import pandas as pd
import numpy as np

### Carregando e visualizando o dataset


In [2]:
path = "../data/dados_consumo_agua.pkl"

df = pd.read_pickle(path)
df.head()

Unnamed: 0,ano,mes,dia,hora,quantidade_pessoas,regiao,consumo_agua_m3,padrao_consumo
0,2020,1,Quarta,0,3,Sul,0.540039,Alto
1,2020,1,Quarta,1,3,Sul,0.300049,Normal
2,2020,1,Quarta,2,3,Sul,0.529785,Alto
3,2020,1,Quarta,3,3,Sul,0.620117,Alto
4,2020,1,Quarta,4,3,Sul,0.23999,Normal


In [3]:
df['padrao_consumo'].value_counts(normalize=True)

padrao_consumo
Normal    0.79843
Alto      0.20157
Name: proportion, dtype: float64

### Pré-processamento

In [6]:
# Divisão em features e labels
X, y = df.drop(columns=["ano", "padrao_consumo"]), df["padrao_consumo"]

# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transformar variáveis categóricas em numéricas
columns_category = X_train.select_dtypes(include="category").columns
for column in columns_category:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])


X_train.head()

Index(['dia', 'regiao'], dtype='object')


Unnamed: 0,mes,dia,hora,quantidade_pessoas,regiao,consumo_agua_m3
9284,1,2,20,3,0,0.199951
14314,8,2,10,3,0,0.75
5094,7,4,6,3,0,1.089844
33269,10,1,5,3,0,0.280029
9994,2,5,10,3,0,1.950195


## RandomForestClassifier


### Criando o modelo RandomForest


In [5]:
modelo_random_forest = RandomForestClassifier()

### Parametros para o RandomForest


In [6]:
dicionario_random_forest = {
    "n_estimators"      : np.arange(10, 110, 10),# de 10 a 100
    "criterion"         : ["gini", "entropy", "log_loss"],
    "max_depth"         : np.arange(2, 8, 1),# de 2 a 7
    "min_samples_split" : np.arange(2, 11, 1),# de 2 a 10
    "min_samples_leaf"  : np.arange(1, 6, 1),# de 1 a 5
    "max_features"      : ["sqrt", "log2", None],
    "max_leaf_nodes"    : [5, 10, 15, 20, None],
    "n_jobs"            : [-1],
    "random_state"      : [semente]
}

NameError: name 'semente' is not defined

### Utilizando o GridSearchCV


In [None]:
grid_search_randomForest = GridSearchCV(
    estimator = modelo_random_forest,
    param_grid = dicionario_random_forest,
    cv = 5
)

### Treinando o GridSearch para obter os melhores parâmetros


In [None]:
grid_search_randomForest.fit(x_train, y_train.ravel())
grid_search_randomForest.best_params_

### Imprimindo o Score do melhor modelo


In [None]:
grid_search_randomForest.score(x_train, y_train)

## KNN


### Criando o modelo KNN


In [None]:
modelo_knn = NearestNeighbors()

### Parametros para o KNN


In [None]:
dicionario_knn = {
    "n_neighbors" : np.arange(3, 17, 2),# de 3 a 15
    "radius"      : np.arange(1.0, 3.5, 0.5),# de 1.0 a 3.0
    "p"           : np.arange(1.0, 2.1, 0.1),# de 1.0 a 2.0
    "n_jobs"      : [-1]
}

### Utilizando o GridSearchCV


In [None]:
grid_search_KNN = GridSearchCV(
    estimator = modelo_knn,
    param_grid = dicionario_knn,
    cv = 5
)

### Treinando o GridSearch para obter os melhores parâmetros


In [None]:
grid_search_KNN.fit(x_train, y_train.ravel())
grid_search_KNN.best_params_

### Imprimindo o Score do melhor modelo


In [None]:
grid_search_KNN.score(x_train, y_train)

## SVC


### Criando o modelo SVM


In [None]:
modelo_svc = SVC()

### Parametros para o SVM


In [None]:
dicionario_svc = {
    "C"                         : np.logspace(-3, 3, 7),
    "kernel"                    : ["linear", "poly", "rbf", "sigmoid"],
    "degree"                    : np.arange(1, 6, 1),
    "tol"                       : np.logspace(-5, -1, 5),
    "max_iter"                  : [100, 200, 300, 400, 500, 600, -1],
    "decision_function_shape"   : ["ovr", "ovo"],
    "random_state"              :  [semente]
}

### Utilizando o GridSearchCV


In [None]:
grid_search_svc = GridSearchCV(
    estimator = modelo_svc,
    param_grid = dicionario_svc,
    cv = 5
)

### Treinando o GridSearch para obter os melhores parâmetros


In [None]:
grid_search_svc.fit(x_train, y_train.ravel())
grid_search_svc.best_params_

### Imprimindo o Score do melhor modelo


In [None]:
grid_search_svc.score(x_train, y_train)