In [29]:
# Standard library
import os
import pickle

# Core third-party libraries
import numpy as np
import pandas as pd
import yaml

# Machine learning libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, ElasticNet,LogisticRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Model selection and evaluation libraries
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Feature transformation libraries
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectFromModel

In [13]:
# Load the dataset.
# Path of the Excel workbook, relative to the notebook's working directory.
ruta_archivo = "data/tapas.xlsx"

# Parse the workbook into a DataFrame.
df = pd.read_excel(io=ruta_archivo)

# Print a plain-text preview of the first five rows.
print(df.head(5))


   nacimiento  genero bebida_favorita  tapa_favorita  comunidad_autonoma
0         NaN     NaN     Vino blanco            NaN                 NaN
1         NaN     NaN      Vino tinto            NaN                 NaN


In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Name of the column the models are trained to predict.
target = "tapa_favorita"

In [19]:
# Predictor columns; all of them are handled as categorical variables.
columnas_categoricas = [
    'nacimiento',
    'genero',
    'bebida_favorita',
    'comunidad_autonoma',
]
# `features` is bound to the very same list object (an alias, not a copy).
features = columnas_categoricas

In [20]:
# Label-encode every feature column.
# One encoder per column is fitted on the values found in the train and test
# sets together, so transforming either split never meets an unseen category.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so
# the integer codes can be decoded later or applied to new rows; without
# this the encoder was overwritten on every iteration and the mapping lost.
train_set_encoded = train_set.copy()
test_set_encoded = test_set.copy()
label_encoders = {}
for feature in features:
    all_values = pd.concat([train_set[feature], test_set[feature]], axis=0).unique()
    label_encoder = LabelEncoder()
    label_encoder.fit(all_values)
    train_set_encoded[feature] = label_encoder.transform(train_set[feature])
    test_set_encoded[feature] = label_encoder.transform(test_set[feature])
    label_encoders[feature] = label_encoder

# Show the encoded DataFrames
print(train_set_encoded.head())
print(test_set_encoded.head())

   nacimiento  genero  bebida_favorita  tapa_favorita  comunidad_autonoma
0           0       0                0            NaN                   0
   nacimiento  genero  bebida_favorita  tapa_favorita  comunidad_autonoma
1           0       0                1            NaN                   0


In [24]:
# Split each encoded set into the feature matrix (X) and the target vector (y).
# The column name comes from `target` instead of a repeated string literal,
# so the target column is defined in exactly one place.
X_train = train_set_encoded.drop(columns=[target])
y_train = train_set_encoded[target]
X_test = test_set_encoded.drop(columns=[target])
y_test = test_set_encoded[target]

In [22]:
# Candidate estimators, keyed by a human-readable display name.
# Stochastic estimators receive random_state=42 for reproducible runs.
# NOTE(review): this collection mixes regressors and classifiers, while the
# cross-validation cell scores with "balanced_accuracy" and the evaluation
# cell computes classification metrics -- confirm the intended task type.
modelos = dict([
    ("LightGBM", LGBMRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
    ("Elastic Net", ElasticNet(random_state=42)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("Decision Tree Classifier", DecisionTreeClassifier(max_depth=10, random_state=42)),
])

# Hyperparameter grids to search, keyed by the same display names as `modelos`.
# Each inner dict maps a constructor argument to the list of values to try.
parametros = {
    "LightGBM": {'max_depth': [10, 20], 'n_estimators': [100, 200]},
    "XGBoost": {'max_depth': [10, 20], 'n_estimators': [100, 200]},
    "Gradient Boosting": {'max_depth': [10, 20], 'n_estimators': [100, 200]},
    "Linear Regression": {'fit_intercept': [True, False]},
    "Random Forest": {'max_depth': [10, 20], 'n_estimators': [100, 200]},
    "Decision Tree Regressor": {'max_depth': [10, 20], 'min_samples_split': [2, 10]},
    "Elastic Net": {'alpha': [0.1, 1.0, 10.0], 'l1_ratio': [0.1, 0.5, 0.9]},
    "KNN": {'n_neighbors': [3, 5, 7]},
    "Decision Tree Classifier": {'max_depth': [10, 20], 'min_samples_split': [2, 10]},
}

In [None]:
# Cross-validate every candidate with 5 folds and report its mean balanced accuracy.
for tipo, modelo in modelos.items():
    # The model name is printed first, so it is visible even if scoring fails.
    print(f"{tipo}: ", end=" ")
    scores = cross_val_score(
        estimator=modelo,
        X=X_train,
        y=y_train,
        cv=5,
        scoring="balanced_accuracy",
    )
    print(scores.mean().round(2))
print("******")

In [26]:
# Grid search: tune every candidate over its grid in `parametros` (3-fold CV,
# all CPU cores) and keep the best refitted estimator per model name.
# Scoring uses balanced accuracy, the same metric as the cross-validation
# cell, because the evaluation further down is a classification report;
# R^2 is a regression metric and is not meaningful for class labels.
resultados = {}
for nombre_modelo, modelo in modelos.items():
    print(f"Entrenando modelo: {nombre_modelo}")
    grid_search = GridSearchCV(
        estimator=modelo,
        param_grid=parametros[nombre_modelo],
        cv=3,
        n_jobs=-1,
        scoring='balanced_accuracy',
    )
    grid_search.fit(X_train, y_train)
    print(f"Mejores parámetros para {nombre_modelo}: {grid_search.best_params_}")
    print(f"Mejor score para {nombre_modelo}: {grid_search.best_score_}")
    resultados[nombre_modelo] = grid_search.best_estimator_

# Print the best estimator found for each model
for nombre_modelo, mejor_modelo in resultados.items():
    print(f"Mejor modelo para {nombre_modelo}: {mejor_modelo}")

Entrenando modelo: LightGBM


ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: n_samples=1.

In [None]:
# Manual model selection (a Random Forest for now, as a placeholder choice).
# A classifier is used because the evaluation cell computes accuracy,
# precision, recall, F1 and a confusion matrix, which all require discrete
# class predictions rather than a regressor's continuous output.
# Random Forest estimators are sized with `n_estimators`; they do not accept
# `max_iter`, and passing it makes the constructor raise TypeError.
modelo_seleccionado = RandomForestClassifier(random_state=42)

# Fit the model on the training data
modelo_seleccionado.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = modelo_seleccionado.predict(X_test)

In [27]:
# Evaluation metrics on the test set.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 share the same call shape, so they are computed
# in order with the 'weighted' averaging mode and unpacked into their names.
precision, recall, f1 = (
    puntuar(y_test, y_pred, average='weighted')
    for puntuar in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Confusion matrix of true labels versus predictions
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


NameError: name 'y_pred' is not defined

In [None]:
# Directory and file path where the trained model is stored.
models_dir = 'models'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(models_dir, exist_ok=True)

results_file_path = os.path.join(models_dir, 'modelo.pkl')

# Serialize the selected model to disk.
# NOTE: loading a pickle can execute arbitrary code; only load this file
# back from a trusted location.
with open(results_file_path, 'wb') as file:
    pickle.dump(modelo_seleccionado, file)

print(f"Modelo guardado en {results_file_path}")