In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Load the training data from CSV.
df2 = pd.read_csv('/mnt/data/datos2_limpios.csv')

# Keep only the single predictor ('Edad') and the target column so that a
# simplified one-feature model can be trained on them.
df2_limpio = df2.loc[:, ['Edad', 'Años Hasta Jubilación']]

In [None]:
# Separate the target vector from the feature matrix. The list selector for
# 'Edad' keeps X two-dimensional (a one-column DataFrame rather than a Series).
y = df2_limpio.loc[:, 'Años Hasta Jubilación']
X = df2_limpio.loc[:, ['Edad']]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Random Forest regressor tuned with an exhaustive hyperparameter grid search.
# Candidate values tried for each hyperparameter (3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

rf = RandomForestRegressor(random_state=0)

# 5-fold cross-validation scored by negated MAE (so that higher is better).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator selected by the search for evaluation and prediction.
best_model = grid_search.best_estimator_


In [None]:
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'MAE: {mae}, R2: {r2}')


In [None]:

# Load the second dataset, i.e. the rows to generate predictions for.
df = pd.read_csv(filepath_or_buffer='/mnt/data/datos1_limpios.csv')

In [None]:

# Predict using age as the only input feature.
# The model was fitted on a DataFrame whose column is named 'Edad', while this
# dataset spells it 'EDAD'. scikit-learn validates feature names at predict
# time (a warning in 1.0/1.1, a ValueError from 1.2 onwards), so the column is
# renamed on a temporary frame; `df` itself keeps its original 'EDAD' column.
df['Años Hasta Jubilación'] = best_model.predict(
    df[['EDAD']].rename(columns={'EDAD': 'Edad'})
)

# Retirement age = current age + predicted years remaining until retirement.
df['Edad de Jubilación'] = df['EDAD'] + df['Años Hasta Jubilación']

# Show the first rows as the cell's output.
df.head()