In [11]:
import pandas as pd
import plotly.express as px
import joblib

In [12]:
data=pd.read_csv("../data_restaurantes_definitivo.csv", sep=";", encoding="utf-8")

In [13]:
data.columns

Index(['Unnamed: 0', 'Michelin', 'Slug', 'Bookable', 'Street', 'ZipCode',
       'Locality', 'Country', 'Longitude', 'Latitude', 'Cantidad_metodos_pago',
       'Reservas_today', 'Reservas_last_week', 'Rate_Distinction',
       'Numero_fotos', 'Is_Affiliated', 'Name', 'Average_Price',
       'Numero_awards', 'Review_count', 'Rating_count', 'Food_rating',
       'Service_rating', 'Ambience_rating', 'Tipo_comida', 'Provincia',
       'Poblacion', 'Salario Medio Anual', 'comunidad_autonoma',
       'Michelin numérico', 'Rate_Distinction_numérico'],
      dtype='object')

Eliminación de columnas

In [14]:
columnas_borrar=["Unnamed: 0","Slug","Street","Locality","Country","Michelin","Name","Rate_Distinction_numérico","comunidad_autonoma","ZipCode","Poblacion","Salario Medio Anual"]
data.drop(columnas_borrar, axis=1, inplace=True)

In [15]:
data.columns

Index(['Bookable', 'Longitude', 'Latitude', 'Cantidad_metodos_pago',
       'Reservas_today', 'Reservas_last_week', 'Rate_Distinction',
       'Numero_fotos', 'Is_Affiliated', 'Average_Price', 'Numero_awards',
       'Review_count', 'Rating_count', 'Food_rating', 'Service_rating',
       'Ambience_rating', 'Tipo_comida', 'Provincia', 'Michelin numérico'],
      dtype='object')

Tratamiento de valores nulos

In [16]:
ls_rate_distinction=[]
exchange={"Excellent":3,"Fabulous":2,"Very good":1}
for i in data["Rate_Distinction"]:
    if i in exchange.keys():
        ls_rate_distinction.append(exchange[i])
    else:
        ls_rate_distinction.append(0)
data["Rate_Distinction"]=ls_rate_distinction

In [17]:
# Imputamos la mediana a los valores nulos numéricos
for i in data.columns:
    if data[i].dtype=="float64":
        data[i].fillna(data[i].median(), inplace=True)

Recodificación de variables categóricas

In [18]:
data

Unnamed: 0,Bookable,Longitude,Latitude,Cantidad_metodos_pago,Reservas_today,Reservas_last_week,Rate_Distinction,Numero_fotos,Is_Affiliated,Average_Price,Numero_awards,Review_count,Rating_count,Food_rating,Service_rating,Ambience_rating,Tipo_comida,Provincia,Michelin numérico
0,True,-3.693348,40.436288,5,0,32,3,17,True,125.0,1,255.0,479.0,9.7,9.9,9.5,Colombian,madrid,1
1,True,-3.692337,40.427143,7,0,115,3,14,True,23.0,0,3318.0,7977.0,9.3,9.5,9.3,Fusion,madrid,0
2,True,-3.685201,40.425980,3,7,57,3,8,True,30.0,0,24.0,99.0,9.0,9.3,9.1,Asian,madrid,0
3,True,-3.690170,40.428110,6,0,140,3,11,True,55.0,0,824.0,2590.0,9.6,9.3,9.4,Asian,madrid,0
4,True,-3.692898,40.454068,7,9,234,3,14,True,25.0,0,907.0,3047.0,9.1,9.3,9.2,Mexican,madrid,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7603,True,-3.714279,40.423297,3,0,3,3,13,True,155.0,1,31.0,73.0,9.5,9.4,8.9,Mediterranean,madrid,1
7604,True,-3.671614,40.417718,5,2,41,3,17,True,22.0,0,2633.0,6059.0,9.4,9.3,9.1,Indian,madrid,0
7605,True,-3.700328,40.430676,7,11,139,3,14,True,22.0,0,99.0,301.0,9.0,9.0,9.0,International,madrid,0
7606,True,-3.689049,40.427816,4,1,2,2,36,True,40.0,0,18.0,72.0,8.6,8.9,8.4,French,madrid,0


MODELO 1

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np

df=data.copy()
# Suponiendo que tienes un DataFrame llamado df con tus datos
X = df.drop(columns=['Average_Price'])
y = df['Average_Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), ['Tipo_comida', 'Provincia'])] #, 'Poblacion'
)

# Define el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Entrena el modelo
pipeline.fit(X_train, y_train)

# Evalúa el modelo
mse = pipeline.score(X_test, y_test)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

joblib.dump(pipeline, 'random_forest_model.pkl')

Mean Squared Error: 0.10762963461699149
Root Mean Squared Error: 0.3280695575895324


['random_forest_model.pkl']

In [20]:
from sklearn.model_selection import GridSearchCV

# Define el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define los parámetros a ajustar
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Realiza la búsqueda de hiperparámetros
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Imprime los mejores parámetros encontrados
print("Best parameters found: ", grid_search.best_params_)

# Evalúa el mejor modelo
best_model = grid_search.best_estimator_
mse = -grid_search.best_score_
rmse = np.sqrt(mse)
print("Best Mean Squared Error:", mse)
print("Best Root Mean Squared Error:", rmse)

# Guarda el mejor modelo
import joblib
joblib.dump(best_model, 'random_forest_model_best.pkl')


Best parameters found:  {'regressor__max_depth': 30, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}
Best Mean Squared Error: 312.56428164702555
Best Root Mean Squared Error: 17.679487595714576


['random_forest_model_best.pkl']

In [24]:
X.columns

Index(['Bookable', 'Longitude', 'Latitude', 'Cantidad_metodos_pago',
       'Reservas_today', 'Reservas_last_week', 'Rate_Distinction',
       'Numero_fotos', 'Is_Affiliated', 'Numero_awards', 'Review_count',
       'Rating_count', 'Food_rating', 'Service_rating', 'Ambience_rating',
       'Tipo_comida', 'Provincia', 'Michelin numérico'],
      dtype='object')

In [27]:
feature_importance = pipeline.named_steps['regressor'].feature_importances_

# Obtener el nombre de las características después de la transformación
encoded_feature_names = pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(input_features=['Tipo_comida', 'Provincia'])

# Crear un DataFrame para visualizar la importancia de las características
feature_importance_df = pd.DataFrame({'Feature': encoded_feature_names, 'Importance': feature_importance})

# Ordenar el DataFrame por importancia en orden descendente
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Imprimir el DataFrame con la importancia de las características
print(feature_importance_df)

                           Feature  Importance
40              Tipo_comida_Fusion    0.098501
72             Tipo_comida_Spanish    0.048629
128              Provincia_vizcaya    0.048016
4             Tipo_comida_American    0.046198
50             Tipo_comida_Italian    0.045341
..                             ...         ...
62         Tipo_comida_Palestinian    0.000051
39   Tipo_comida_From the Pyrenees    0.000029
77             Tipo_comida_Tibetan    0.000025
75              Tipo_comida_Syrian    0.000022
70           Tipo_comida_Siciliano    0.000020

[131 rows x 2 columns]


MODELO 2

In [32]:
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder

df=data.copy()
X = df.drop(columns=['Average_Price'])
y = df['Average_Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['Tipo_comida', 'Provincia'])]
)

# Define el pipeline
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

# Entrena el modelo
pipeline2.fit(X_train, y_train)

# Evalúa el modelo
mse = pipeline2.score(X_test, y_test)
print("Mean Squared Error:", mse)
joblib.dump(pipeline2, 'xgboost_model.pkl')


Mean Squared Error: 0.08697328707900254


['xgboost_model.pkl']

MODELO 3

In [8]:
import joblib

In [19]:
from sklearn.linear_model import LinearRegression
from category_encoders import CountEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np


# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', CountEncoder(), ['Tipo_comida', 'Provincia'])]
)

# Define el pipeline
pipeline3 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Entrena el modelo
pipeline3.fit(X_train, y_train)

# Evalúa el modelo
mse = pipeline3.score(X_test, y_test)
print("Mean Squared Error:", mse)
joblib.dump(pipeline3, 'linear_regression_model.pkl')


# Obtener los coeficientes de regresión del modelo de regresión lineal
coefficients = pipeline3.named_steps['regressor'].coef_

# Obtener el nombre de las características después de la transformación
encoded_feature_names = pipeline3.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(input_features=['Tipo_comida', 'Provincia'])

# Crear un DataFrame para visualizar los coeficientes de regresión
coefficients_df = pd.DataFrame({'Feature': encoded_feature_names, 'Coefficient': coefficients})

# Ordenar el DataFrame por coeficientes en valor absoluto en orden descendente
coefficients_df['Absolute_Coefficient'] = abs(coefficients_df['Coefficient'])
coefficients_df = coefficients_df.sort_values(by='Absolute_Coefficient', ascending=False)

# Imprimir el DataFrame con los coeficientes de regresión
print(coefficients_df)


Mean Squared Error: 0.006463577559170264
       Feature  Coefficient  Absolute_Coefficient
0  Tipo_comida     0.002964              0.002964
1    Provincia    -0.001894              0.001894


In [32]:
len(X_train.columns)

22

Guadar archivos como pkl

In [None]:
#MOdelo 1

In [36]:
import joblib
from sklearn.linear_model import LinearRegression
from category_encoders import CountEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', CountEncoder(), ['Tipo_comida', 'Provincia', 'Poblacion', 'comunidad_autonoma'])]
)

# Define el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Entrena el modelo
pipeline.fit(X_train, y_train)

# Guarda el modelo en un archivo pkl
joblib.dump(pipeline, 'linear_regression_model.pkl')




['linear_regression_model.pkl']

In [92]:
import joblib
import pandas as pd

# Función para cargar el modelo y predecir
def load_model_and_predict3(input_data):
    # Carga el modelo desde el archivo pkl
    loaded_model = joblib.load('linear_regression_model.pkl')
    
    # Realiza la predicción
    prediction = loaded_model.predict(input_data)
    return prediction

# Ejemplo de uso: predice el valor para la primera fila de X_train
first_row = X_train.iloc[-4:-3]
predicted_value = load_model_and_predict3(first_row)
print("Predicted value for the first row of X_train:", predicted_value[0])

Predicted value for the first row of X_train: 34.693974005943524


In [76]:
# modelo 2

import joblib
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['Tipo_comida', 'Provincia', 'Poblacion', 'comunidad_autonoma'])]
)

# Define el pipeline
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

# Entrena el modelo
pipeline2.fit(X_train, y_train)

# Guarda el modelo en un archivo pkl
joblib.dump(pipeline2, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [77]:
import joblib
import pandas as pd

# Función para cargar el modelo y predecir
def load_model_and_predict2(input_data):
    # Carga el modelo desde el archivo pkl
    loaded_model = joblib.load('xgboost_model.pkl')
    
    # Realiza la predicción
    prediction = loaded_model.predict(input_data)
    return prediction

# Ejemplo de uso: predice el valor para la primera fila de X_train
first_row = X_train.iloc[:5]
predicted_value = load_model_and_predict2(first_row)
print("Predicted value for the first row of X_train:", predicted_value)


Predicted value for the first row of X_train: [16.781967 33.83176  21.931383 21.070848 32.30745 ]


In [69]:
y_test

2644    25.0
2227    20.0
5698    25.0
132     25.0
3172    18.0
        ... 
5924    19.0
1835    20.0
506     25.0
3590    14.0
5313    26.0
Name: Average_Price, Length: 1522, dtype: float64

In [70]:
# modelo 1

import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np

# Suponiendo que tienes un DataFrame llamado df con tus datos
df = data.copy()
X = df.drop(columns=['Average_Price'])
y = df['Average_Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define el preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), ['Tipo_comida', 'Provincia', 'Poblacion', 'comunidad_autonoma'])]
)

# Define el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Entrena el modelo
pipeline.fit(X_train, y_train)

# Guarda el modelo en un archivo pkl
joblib.dump(pipeline, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [82]:
import joblib
import pandas as pd

# Función para cargar el modelo y predecir
def load_model_and_predict1(input_data):
    # Carga el modelo desde el archivo pkl
    loaded_model = joblib.load('random_forest_model.pkl')
    
    # Realiza la predicción
    prediction = loaded_model.predict(input_data)
    return prediction

# Ejemplo de uso: predice el valor para la primera fila de X_train
first_row = X_train.iloc[0:1]
predicted_value = load_model_and_predict1(first_row)
print("Predicted value for the first row of X_train:", predicted_value)


Predicted value for the first row of X_train: [16.59431885]


In [86]:
def predict_general(input_data):
    p1=load_model_and_predict1(input_data)[0]
    p2=load_model_and_predict2(input_data)[0]
    p3=load_model_and_predict3(input_data)[0]
    
    return np.mean([p1,p2,p3])

In [85]:
load_model_and_predict1(first_row)[0]

16.594318852324395

In [95]:
for i in range(5):
    first_row = X_train.iloc[i:i+1]
    print("Predicted value for the first row of X_train:", predict_general(first_row))

Predicted value for the first row of X_train: 19.758642244582457
Predicted value for the first row of X_train: 32.61540858598364
Predicted value for the first row of X_train: 23.630801410889855
Predicted value for the first row of X_train: 23.963628271498177
Predicted value for the first row of X_train: 30.08532201909536


In [1]:
y_test

NameError: name 'y_test' is not defined