In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from lazypredict.Supervised import LazyRegressor

# Cargar datos
train = pd.read_csv("dengue_features_train.csv")
train_labels = pd.read_csv("dengue_labels_train.csv")
test = pd.read_csv("dengue_features_test.csv")

# Unir etiquetas al conjunto de entrenamiento
train = train.merge(train_labels, on=["city", "year", "weekofyear"])

# Convertir fecha a datetime y eliminar la columna
train["week_start_date"] = pd.to_datetime(train["week_start_date"])
test["week_start_date"] = pd.to_datetime(test["week_start_date"])
train.drop(columns=["week_start_date"], inplace=True)
test.drop(columns=["week_start_date"], inplace=True)

# Seleccionar solo las columnas numéricas para la imputación (excluyendo city, year, weekofyear, total_cases)
num_features = train.columns.difference(["city", "year", "weekofyear", "total_cases"])

# Imputar valores faltantes con la media solo en las columnas numéricas
imputer = SimpleImputer(strategy="mean")
train[num_features] = imputer.fit_transform(train[num_features])
test[num_features] = imputer.transform(test[num_features])

# Separar características y etiquetas
X = train.drop(columns=["total_cases"])
y = train["total_cases"]

# Normalizar datos solo en las columnas numéricas
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])
test[num_features] = scaler.transform(test[num_features])

# Dividir datos en entrenamiento y prueba
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Lazy Predict para evaluar modelos automáticamente
lazy_reg = LazyRegressor()
models, predictions = lazy_reg.fit(X_train[num_features], X_val[num_features], y_train, y_val)
print(models)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train[num_features], y_train)
y_pred_nb = nb.predict(X_val[num_features])
mae_nb = mean_absolute_error(y_val, y_pred_nb)
print(f"MAE Naive Bayes: {mae_nb}")

# KNN
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train[num_features], y_train)
y_pred_knn = knn.predict(X_val[num_features])
mae_knn = mean_absolute_error(y_val, y_pred_knn)
print(f"MAE KNN: {mae_knn}")

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train[num_features], y_train)
y_pred_rf = rf.predict(X_val[num_features])
mae_rf = mean_absolute_error(y_val, y_pred_rf)
print(f"MAE Random Forest: {mae_rf}")

# GridSearch en KNN
param_grid = {"n_neighbors": [3, 5, 7, 9, 11]}
grid_search_knn = GridSearchCV(KNeighborsRegressor(), param_grid, scoring="neg_mean_absolute_error", cv=3)
grid_search_knn.fit(X_train[num_features], y_train)
print("Mejor K para KNN:", grid_search_knn.best_params_)

# RandomizedSearch en Random Forest
param_dist = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
random_search_rf = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist, n_iter=10, scoring="neg_mean_absolute_error", cv=3, random_state=42)
random_search_rf.fit(X_train[num_features], y_train)
print("Mejores hiperparámetros para Random Forest:", random_search_rf.best_params_)

# Generar predicciones finales
best_rf = random_search_rf.best_estimator_
final_predictions = best_rf.predict(test[num_features])

# Crear DataFrame de envío
submission = test[["city", "year", "weekofyear"]].copy()
submission["total_cases"] = np.round(final_predictions).astype(int)

# Guardar CSV
submission.to_csv("submission.csv", index=False)


100%|██████████| 42/42 [00:13<00:00,  3.08it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4412
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 20
[LightGBM] [Info] Start training from score 23.116838
                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
GradientBoostingRegressor                    0.39       0.43 42.63        1.43
RandomForestRegressor                        0.37       0.42 43.24        2.24
AdaBoostRegressor                            0.34       0.39 44.43        0.77
XGBRegressor                                 0.34       0.38 44.57        0.56
HistGradientBoostingRegressor                0.33       0.38 44.71        0.61
BaggingRegressor                             0.33       0.37 44.87        0.52
LGBMRegressor             




MAE Naive Bayes: 29.86986301369863
MAE KNN: 23.84931506849315
MAE Random Forest: 22.15398503750815
Mejor K para KNN: {'n_neighbors': 11}
Mejores hiperparámetros para Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20}
