### Importando as bibliotecas e pacotes


In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import dump
import pandas as pd
import numpy as np
import pickle

### Carregando e visualizando o dataset


In [2]:
path = "../data/dados_consumo_agua.pkl"

df = pd.read_pickle(path)
df.head()

Unnamed: 0,ano,mes,dia,hora,quantidade_pessoas,cidade,bairro,consumo_agua_m3,padrao_consumo
0,2020,1,Quarta,0,3,Joinville,Centro,0.040009,Normal
1,2020,1,Quarta,1,3,Joinville,Centro,0.26001,Normal
2,2020,1,Quarta,2,3,Joinville,Centro,0.290039,Normal
3,2020,1,Quarta,3,3,Joinville,Centro,0.180054,Normal
4,2020,1,Quarta,4,3,Joinville,Centro,0.580078,Alto


In [3]:
df['padrao_consumo'].value_counts(normalize=True)

padrao_consumo
Normal    0.794035
Alto      0.205965
Name: proportion, dtype: float64

### Pré-processamento

In [4]:
# VERIFICANDO AS CLASSES ORIGINAIS
column_list = [
    "dia",
    "cidade",
    "bairro",
    "padrao_consumo"
]
for colunm in column_list:
    print("COLUNA:", colunm, "\nVALORES:", list(df[colunm].unique()), end = "\n")

COLUNA: dia 
VALORES: ['Quarta', 'Quinta', 'Sexta', 'Sábado', 'Domingo', 'Segunda', 'Terça']
COLUNA: cidade 
VALORES: ['Joinville']
COLUNA: bairro 
VALORES: ['Centro']
COLUNA: padrao_consumo 
VALORES: ['Normal', 'Alto']


In [14]:
# Divisão em features e labels
X, y = df.drop(columns=["ano", "padrao_consumo"]), df["padrao_consumo"]

# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Transformar variáveis categóricas em numéricas
columns_category = X_train.select_dtypes(include="category").columns

arquivo_dia = open("../model/label_for_dia.sav", "wb")
arquivo_cidade = open("../model/label_for_cidade.sav", "wb")
arquivo_bairro = open("../model/label_for_dia.bairro.sav", "wb")
arquivo_padrao_consumo = open("../model/label_for_padrao_consumo.sav", "wb")

for column in columns_category:
    le = LabelEncoder()
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])
    
    # SALVANDO OS LabelEncoder'S EM VARIOS ARQUIVOS .sav
    match column:
        case "dia":
            pickle.dump(le, arquivo_dia)
            arquivo_dia.close()
        case "cidade":
            pickle.dump(le, arquivo_cidade)
            arquivo_cidade.close()
        case "bairro":
            pickle.dump(le, arquivo_bairro)
            arquivo_bairro.close()
        case _:
            pickle.dump(le, arquivo_padrao_consumo)
            arquivo_padrao_consumo.close()

# Normalização
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balanceamento dos dados
X_train, y_train = RandomUnderSampler().fit_resample(X_train, y_train)

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

print("\n", y_train.value_counts(normalize=True))
print("\n", y_test.value_counts(normalize=True))

X_train shape: (11574, 7), y_train shape: (11574,)
X_test shape: (7008, 7), y_test shape: (7008,)

 padrao_consumo
Alto      0.5
Normal    0.5
Name: proportion, dtype: float64

 padrao_consumo
Normal    0.795947
Alto      0.204053
Name: proportion, dtype: float64


### Treinamento e avaliação dos modelos

In [6]:
models = [
    RandomForestClassifier(),
    SVC(),
    KNeighborsClassifier()
]

accuracies = []
precisions = []
recalls = []
f1s = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracies.append(accuracy_score(y_test, y_pred) * 100)
    precisions.append(precision_score(y_test, y_pred, average="weighted") * 100)
    recalls.append(recall_score(y_test, y_pred, average="weighted") * 100)
    f1s.append(f1_score(y_test, y_pred, average="weighted") * 100)

df_score = pd.DataFrame({
    "Model": ["RandomForest", "SVC", "KNN"],
    "Accuracy": accuracies,
    "Precision": precisions,
    "Recall": recalls,
    "F1": f1s
})

df_score

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,RandomForest,99.28653,99.288378,99.28653,99.287258
1,SVC,95.476598,95.643066,95.476598,95.532307
2,KNN,97.188927,97.28838,97.188927,97.218293


### Salvar o modelo

In [7]:
dump(models[0], "../model/random_forest.joblib")

['../model/random_forest.joblib']