### Importando as bibliotecas e pacotes


In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

### Carregando e visualizando o dataset


In [2]:
path = "../data/dados_consumo_agua.pkl"

df = pd.read_pickle(path)
df.head()

Unnamed: 0,ano,mes,dia,hora,quantidade_pessoas,regiao,consumo_agua_m3,padrao_consumo
0,2020,1,Quarta,0,3,Sul,0.540039,Alto
1,2020,1,Quarta,1,3,Sul,0.300049,Normal
2,2020,1,Quarta,2,3,Sul,0.529785,Alto
3,2020,1,Quarta,3,3,Sul,0.620117,Alto
4,2020,1,Quarta,4,3,Sul,0.23999,Normal


In [3]:
df['padrao_consumo'].value_counts(normalize=True)

padrao_consumo
Normal    0.79843
Alto      0.20157
Name: proportion, dtype: float64

### Pré-processamento

In [4]:
# Divisão em features e labels
X, y = df.drop(columns=["ano", "padrao_consumo"]), df["padrao_consumo"]

# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Transformar variáveis categóricas em numéricas
columns_category = X_train.select_dtypes(include="category").columns
for column in columns_category:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])

# Normalização
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balanceamento dos dados
X_train, y_train = RandomUnderSampler().fit_resample(X_train, y_train)

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

print("\n", y_train.value_counts(normalize=True))
print("\n", y_test.value_counts(normalize=True))

X_train shape: (11246, 6), y_train shape: (11246,)
X_test shape: (7008, 6), y_test shape: (7008,)

 padrao_consumo
Alto      0.5
Normal    0.5
Name: proportion, dtype: float64

 padrao_consumo
Normal    0.794521
Alto      0.205479
Name: proportion, dtype: float64


### Treinamento e avaliação dos modelos

In [5]:
models = [
    RandomForestClassifier(),
    SVC(),
    KNeighborsClassifier()
]

accuracies = []
precisions = []
recalls = []
f1s = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracies.append(accuracy_score(y_test, y_pred) * 100)
    precisions.append(precision_score(y_test, y_pred, average="weighted"))
    recalls.append(recall_score(y_test, y_pred, average="weighted"))
    f1s.append(f1_score(y_test, y_pred, average="weighted"))

df_score = pd.DataFrame({
    "Model": ["RandomForest", "SVC", "KNN"],
    "Accuracy": accuracies,
    "Precision": precisions,
    "Recall": recalls,
    "F1": f1s
})

df_score

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,RandomForest,99.543379,0.995429,0.995434,0.995426
1,SVC,97.60274,0.976244,0.976027,0.976113
2,KNN,98.07363,0.981246,0.980736,0.980883
