In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, TweedieRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load the cleaned dataset; the first CSV column is used as the index.
path_datos = os.path.join('..', '..', 'Datos', 'Limpios')
df2 = pd.read_csv(os.path.join(path_datos, 'datos2_limpios.csv'), index_col=0)

# Drop the columns that are excluded from the modelling frame.
df2_limpio = df2.drop(
    columns=[
        'Porcentaje Ahorro (%)',
        'Ahorro Actual (€)',
        'Gasto Mensual (€)',
        'Vida Esperada (años)',
        'Edad de Jubilación',
    ]
)

In [3]:

# Separate the target (y) from the predictors (X): every remaining column
# except the target is used as an input feature.
y = df2_limpio['Años Hasta Jubilación']
X = df2_limpio.drop('Años Hasta Jubilación', axis=1)

In [4]:

# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [5]:
# Candidate regressors to compare. The order of this list matters: the grid
# search later pairs each estimator with a hyperparameter grid by position.
models = [
    LinearRegression(),
    Lasso(max_iter=1000),
    Ridge(max_iter=1000),
    ElasticNet(max_iter=1000),
    TweedieRegressor(max_iter=10000),
]

In [6]:
# Baseline evaluation: 5-fold cross-validation of each model on the training split.
# NOTE: the scorer is 'neg_mean_absolute_error', i.e. MAE with its sign flipped,
# so the value printed under "MAE" is negative and closer to zero is better.
for model in models:
    cv_score = cross_validate(
        model,
        X_train,
        y_train,
        cv=5,
        scoring=['neg_mean_absolute_error', 'r2'],
    )

    # Average each metric over the folds.
    mean_mae = cv_score['test_neg_mean_absolute_error'].mean()
    mean_r2 = cv_score['test_r2'].mean()

    print(f"MAE for model {model} = {mean_mae}")
    print(f"R2 for model {model} = {mean_r2}")

MAE for model LinearRegression() = -1.42721674158758
R2 for model LinearRegression() = 0.9823301306663744
MAE for model Lasso() = -1.4272285104350628
R2 for model Lasso() = 0.9822915385607898
MAE for model Ridge(max_iter=1000) = -1.427216746228429
R2 for model Ridge(max_iter=1000) = 0.9823301306523066
MAE for model ElasticNet() = -1.4272290838346038
R2 for model ElasticNet() = 0.9822918540011575
MAE for model TweedieRegressor(max_iter=10000) = -1.4272296523901655
R2 for model TweedieRegressor(max_iter=10000) = 0.9822921665059073


found 0 physical cores < 1
  File "c:\BDATA_2\reto07_verde_oscuro\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [7]:
# Hyperparameter grids for GridSearchCV, one entry per model.
# Insertion order is significant: the grid search matches these entries to the
# estimators in `models` by position.
model_hyperparameters = dict(
    # Plain linear regression has nothing to tune.
    lin_reg={},
    lasso_reg={'alpha': [0.5, 1, 5, 10, 20]},
    ridge_reg={'alpha': [0.5, 1, 5, 10, 20]},
    elastic_reg={
        'alpha': [0.5, 1, 5, 10, 20],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    tweedie_reg={
        'power': [0, 1, 2, 3],
        'alpha': [0, 0.5, 1, 5, 10, 20],
        'link': ['log', 'identity'],
    },
)


In [8]:
# Prepare the grid-search containers: the grid names in declaration order and
# an empty list that will collect one summary per model.
model_keys = [*model_hyperparameters]
result = list()

In [9]:
# Hyperparameter optimisation with GridSearchCV.
# `result` is re-created here so re-running this cell never appends duplicates.
result = []
model_keys = list(model_hyperparameters.keys())

# Estimators and grids are paired purely by position, so fail fast on a length
# mismatch instead of raising IndexError part-way through the (expensive)
# search or silently leaving a grid unused.
if len(models) != len(model_keys):
    raise ValueError(
        f"{len(models)} models but {len(model_keys)} hyperparameter grids; "
        "both must have the same length and order"
    )

for model, key in zip(models, model_keys):
    params = model_hyperparameters[key]

    # Both metrics are scored for every candidate; `refit` selects the winner
    # by negated MAE, so `best_score_` is negative and closer to zero is better.
    regressor = GridSearchCV(
        model,
        params,
        cv=5,
        scoring=['neg_mean_absolute_error', 'r2'],
        refit='neg_mean_absolute_error',
    )
    regressor.fit(X_train, y_train)

    # Keep one summary row per model for the results table.
    result.append({
        'model_used': model,
        'highest_score': regressor.best_score_,
        'best_hyperparameters': regressor.best_params_
    })

ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n

In [None]:
# Tabulate the grid-search summaries (one row per model) and print the table.
df_results = pd.DataFrame(data=result)
print(df_results)

                         model_used  highest_score  \
0                LinearRegression()      -1.427217   
1                           Lasso()      -1.427223   
2              Ridge(max_iter=1000)      -1.427217   
3                      ElasticNet()      -1.427223   
4  TweedieRegressor(max_iter=10000)      -1.427217   

                           best_hyperparameters  
0                                            {}  
1                                {'alpha': 0.5}  
2                                {'alpha': 0.5}  
3               {'alpha': 0.5, 'l1_ratio': 0.9}  
4  {'alpha': 0, 'link': 'identity', 'power': 0}  
