In [11]:
import sys

# Put the parent directory first on the module search path so that imports are
# resolved there before anywhere else (presumably where the local `knn` and
# `best_model` modules imported below live — confirm against the repo layout).
sys.path.insert(0, "..")

In [12]:
import pandas as pd
from knn import perform_knn
import plotly.express as px
from best_model import create_best_model_df

In [13]:
# Load the dataset used throughout the notebook from the processed_tables directory.
csv_path = "../../processed_tables/merged_no_damns_standardized.csv"
df = pd.read_csv(csv_path)

## KNN

Esta vez utilizaremos `sklearn` para hacer regresión con KNN, variando hiperparámetros como el número de vecinos, la métrica de distancia y el esquema de ponderación de los vecinos. También variamos el número de lags (de 1 a 11).


In [14]:
# Cutoff date handed to perform_knn (presumably the train/test split point —
# confirm in knn.perform_knn).
fecha_corte = "2023-07-01"

# Hyperparameter search space handed to perform_knn: number of neighbours,
# neighbour weighting scheme, and `p` (the distance-metric parameter).
param_dist = dict(
    n_neighbors=range(1, 30),
    weights=["uniform", "distance"],
    p=[1, 2],
)

In [15]:
# Run the KNN search once per lag count (1 through 11) and keep the test-set
# error metrics of each run, one record per lag count.
metrics = []

for n_lags in range(1, 12):
    knn_args = {
        "df": df,
        "param_dist": param_dist,
        "fecha_corte": fecha_corte,
        "lags": n_lags,
    }
    results = perform_knn(knn_args)

    # Keep only the two error metrics, tagged with the lag count that produced them.
    row = {name: results[name] for name in ("mape", "mse_test")}
    row["lags"] = n_lags
    metrics.append(row)

Best parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 5}
Mean Squared Error on Test Data: 32241.110950580136
Mean Absolute Percentage Error: 0.1967518084598995
Best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 11}
Mean Squared Error on Test Data: 38179.60389840255
Mean Absolute Percentage Error: 0.21071669712951746
Best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 10}
Mean Squared Error on Test Data: 37907.64326626772
Mean Absolute Percentage Error: 0.2237386891783524
Best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 12}
Mean Squared Error on Test Data: 38684.49527416618
Mean Absolute Percentage Error: 0.2383551287725106
Best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 7}
Mean Squared Error on Test Data: 45427.01935242872
Mean Absolute Percentage Error: 0.25926660862280404
Best parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 15}
Mean Squared Error on Test Data: 46274.184544485965
Mean Absolute Percentage Erro

In [16]:
# Tabulate the per-lag metrics (one row per lag count) and plot the test MSE
# against the number of lags.
df_pruebas = pd.DataFrame(data=metrics)

fig_mse = px.line(
    data_frame=df_pruebas,
    x="lags",
    y="mse_test",
    title="MSE de la data de test vs Lags",
)
fig_mse.show()

In [17]:
# Plot the test MAPE against the number of lags.
# `df_pruebas` is the per-lag metrics table already built from `metrics` in the
# MSE plotting cell, so it is reused here instead of being rebuilt.
# The figure gets its own name (`fig_mape`): it was previously stored in
# `fig_mse`, which mislabelled it and overwrote the MSE figure.
fig_mape = px.line(
    df_pruebas, x="lags", y="mape", title="MAPE de la data de test vs Lags"
)

fig_mape.show()

In [18]:
# Preview the first five rows of the per-lag metrics table.
df_pruebas.head(n=5)

Unnamed: 0,mape,mse_test,lags
0,0.196752,32241.110951,1
1,0.210717,38179.603898,2
2,0.223739,37907.643266,3
3,0.238355,38684.495274,4
4,0.259267,45427.019352,5


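A partir de estas gráficas concluimos que el mejor modelo KNN es el que usa un solo lag (`lags = 1`), ya que es el que presenta el menor error en el conjunto de prueba (MSE ≈ 32 241, MAPE ≈ 0.197).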


## Exportación del mejor modelo


In [19]:
# Re-run the KNN search with a single lag and pull out the dates, targets and
# predictions for both splits, which the export step below consumes.
results = perform_knn(
    dict(df=df, param_dist=param_dist, fecha_corte=fecha_corte, lags=1)
)

(
    X_train_dates,
    y_train,
    y_pred_train,
    y_test,
    X_test_dates,
    y_pred,
) = (
    results[key]
    for key in (
        "X_train_dates",
        "y_train",
        "y_pred_train",
        "y_test",
        "X_test_dates",
        "y_pred",
    )
)

Best parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 5}
Mean Squared Error on Test Data: 32241.110950580136
Mean Absolute Percentage Error: 0.1967518084598995


In [20]:
# Bundle the train/test dates, true values and predictions of the single-lag
# model together with the two output paths, then hand them to
# create_best_model_df (presumably writes the two .pkl files — confirm in
# best_model.create_best_model_df).
best_model_inputs = dict(
    dates_train=X_train_dates,
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_test=y_test,
    dates_test=X_test_dates,
    y_pred_test=y_pred,
    output_path_train="KNN_with_lags_no_criteria_best_model_train.pkl",
    output_path_test="KNN_with_lags_no_criteria_best_model_test.pkl",
)
create_best_model_df(best_model_inputs)