In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  make_scorer, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import pickle

In [2]:
def mapear_a_puntaje(valor):
    if valor == -1:
        return 0  # Descalificado
    elif valor == 0:
        return 62.5  # Calidad Media (promedio de 50-74)
    elif valor == 1:
        return 87.5  # Calidad Alta (promedio de 75-100)

In [3]:
df = pd.read_csv('Datos_Train_Calidad_Lead_Clean.csv')
df.head()

Unnamed: 0,Cuota_Inicial__c,Presupuesto_inmueble_a_comprar__c,ingresos_totales__c,Ingresos_familiares__c,Calidad_de_lead__c,picklist_ciudad__c
0,7000000.0,300000000.0,5000000.0,5000000.0,-1,1
1,2000000.0,200000000.0,2800000.0,2800000.0,-1,1
2,4000000.0,0.0,1000000.0,1000000.0,-1,1
3,1000000.0,18000000.0,1800000.0,1800000.0,-1,1
4,0.0,170000000.0,3000000.0,3000000.0,-1,1


In [4]:
# Mapear la columna 'Calidad_de_lead__c' a 'Calidad_de_lead_score'
df['Puntaje_de_calidad'] = df['Calidad_de_lead__c'].apply(mapear_a_puntaje)

In [5]:
# Separar las columnas de entrada y la salida
X = df.drop(['Calidad_de_lead__c', 'Puntaje_de_calidad'], axis=1)
y = df['Puntaje_de_calidad']

In [6]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Normalizar los datos
escalador = StandardScaler()
X_train = escalador.fit_transform(X_train)
X_test = escalador.transform(X_test)

In [8]:
# Definir los modelos
modelos = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'SVR': SVR()
}

In [9]:
# Definir los hiperparámetros a probar
parametros_grid = {
    'RandomForestRegressor': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    'GradientBoostingRegressor': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 0.9, 1.0]
    },
    'LinearRegression': {
        'fit_intercept': [True, False],
    },
    'SVR': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'epsilon': [0.1, 0.2, 0.5]
    }
}

# Definir un scorer basado en mean_absolute_error
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Buscar los mejores hiperparámetros para cada modelo
mejores_modelos = {}
for nombre_modelo, modelo in modelos.items():
    print(f"Buscando mejores hiperparámetros para {nombre_modelo}...")
    grid_search = GridSearchCV(estimator=modelo, param_grid=parametros_grid[nombre_modelo], scoring=mae_scorer, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    mejores_modelos[nombre_modelo] = grid_search.best_estimator_
    print(f"Mejores hiperparámetros para {nombre_modelo}: {grid_search.best_params_}")
    print(f"Mejor puntuación (MAE): {abs(grid_search.best_score_)}\n")

# Evaluar los mejores modelos en el conjunto de prueba
for nombre_modelo, modelo in mejores_modelos.items():
    y_pred = modelo.predict(X_test)
    mse = mean_absolute_error(y_test, y_pred)
    print(f"MAE para el mejor {nombre_modelo}: {mse}")

Buscando mejores hiperparámetros para RandomForestRegressor...
Mejores hiperparámetros para RandomForestRegressor: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Mejor puntuación (MAE): 6.495865743918249

Buscando mejores hiperparámetros para GradientBoostingRegressor...
Mejores hiperparámetros para GradientBoostingRegressor: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}
Mejor puntuación (MAE): 7.359079182281867

Buscando mejores hiperparámetros para LinearRegression...
Mejores hiperparámetros para LinearRegression: {'fit_intercept': True}
Mejor puntuación (MAE): 23.980685180130553

Buscando mejores hiperparámetros para SVR...
Mejores hiperparámetros para SVR: {'C': 100, 'epsilon': 0.2, 'gamma': 'auto', 'kernel': 'rbf'}
Mejor puntuación (MAE): 11.178183431453764

MAE para el mejor RandomForestRegressor: 6.820132013201319
MAE para el mejor GradientB

In [10]:
rfr = make_pipeline(RobustScaler(), RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False
))

In [11]:
gbr = make_pipeline(RobustScaler(), GradientBoostingRegressor(
    learning_rate= 0.1, 
    max_depth= 7, 
    min_samples_leaf= 1, 
    min_samples_split= 2, 
    n_estimators= 100, 
    subsample= 1.0
))

In [12]:
lreg = make_pipeline(RobustScaler(), LinearRegression(
    fit_intercept= True
))

In [13]:
svr = make_pipeline(RobustScaler(), SVR(
    C= 100, 
    epsilon= 0.2, 
    gamma= 'auto', 
    kernel= 'rbf'
    ))

In [14]:
rfr.fit(X_train, y_train)
gbr.fit(X_train, y_train)
lreg.fit(X_train, y_train)
svr.fit(X_train, y_train)

In [31]:
models_list = [rfr, gbr, lreg, svr]
model_weights = [5, 5, 1, 1]

In [32]:
def blend_models_predict(x, models, weights=[]):
    if not weights:
        weights = [1/(len(models)) for i in range(len(models))]
    if len(models) != len(weights):
        return(print('Diferentes longitudes'))
    s = sum(weights)
    weights = [float(i)/s for i in weights]
    res = np.zeros(len(x))
    for i in range(len(models)):
        res += weights[i] * models[i].predict(x)
    return(res)

In [33]:
y_pred = blend_models_predict(X_train, models_list, model_weights)

In [34]:
print(y_pred)

[ 4.22017622e+00  1.26942022e+00  1.34612414e+00  8.58300621e-01
  3.69489848e+00  4.67056450e+00  2.26492917e+00  7.01337152e-01
  8.66982994e+01  8.24804104e-01  1.51546688e+00 -1.59751123e+00
  2.43903631e+00 -8.39962129e-01  3.09154599e+00 -4.03220010e-01
  7.57041263e-01  8.45943390e+01  8.22511063e+01  5.99806626e+01
  7.70752984e+01  1.10416832e+00  8.33993212e+01 -1.13988395e+00
  5.59876190e-01  1.21706457e+00  9.35733314e-01  8.41538464e+01
  9.07248018e+00  4.45375896e-01  7.10920566e-01  1.35706319e+00
  7.45012821e+01  1.53174628e+00 -2.68798392e-01  8.01981393e+01
  1.93791823e+00  4.21687192e+00  2.09304570e+00  5.48393107e+01
  4.54224824e+00  2.10510069e+00 -3.41857815e-01  7.93108172e-01
  1.93363890e+00  7.77290101e-01  1.00712752e+00 -1.34384572e+00
  9.70606225e-01  1.03527880e+00  8.25638871e+01  7.98099083e+01
 -1.32848640e+00  5.61744839e+01 -2.79167458e+00  1.68970541e+00
  6.72195965e-01  5.72613140e+01  9.22875382e-01  1.04528252e+00
  2.88839567e+00  8.49315

In [35]:
df['y_pred'] = pd.Series(y_pred)

In [36]:
df

Unnamed: 0,Cuota_Inicial__c,Presupuesto_inmueble_a_comprar__c,ingresos_totales__c,Ingresos_familiares__c,Calidad_de_lead__c,picklist_ciudad__c,Puntaje_de_calidad,y_pred
0,7000000.0,300000000.0,5000000.0,5000000.0,-1,1,0.0,4.220176
1,2000000.0,200000000.0,2800000.0,2800000.0,-1,1,0.0,1.269420
2,4000000.0,0.0,1000000.0,1000000.0,-1,1,0.0,1.346124
3,1000000.0,18000000.0,1800000.0,1800000.0,-1,1,0.0,0.858301
4,0.0,170000000.0,3000000.0,3000000.0,-1,1,0.0,3.694898
...,...,...,...,...,...,...,...,...
666,1000000.0,450.0,6800000.0,6800000.0,-1,1,0.0,
667,90000000.0,600000000.0,30000000.0,0.0,-1,1,0.0,
668,37000000.0,160000000.0,4200000.0,4200000.0,-1,0,0.0,
669,40000000.0,172000000.0,8000000.0,8000000.0,-1,0,0.0,
