In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the basic dataset.
df_basico = pd.read_csv("../dataset_basico.csv")

# --- 1. Split the frame into the target (y) and the feature matrix (X) ---
y = df_basico["Precio_usd"].values
X = df_basico.drop(columns=["Precio_usd"]).values

# --- 2. Hold out 20% of the rows for validation (fixed seed for reproducibility) ---
X_basico, X_basico_val, y_basico, y_basico_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3. Base estimator and the regularization strengths to try ---
# NOTE(review): Ridge's penalty is scale-sensitive and the features are used as
# loaded; confirm they were standardized upstream.
modelo_basico = Ridge()
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}

# --- 4. 5-fold cross-validated grid search, scored by negative MSE ---
# NOTE(review): the recorded run selected alpha=0.01, the smallest candidate;
# consider extending the grid downward.
grid_search = GridSearchCV(
    estimator=modelo_basico,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)
grid_search.fit(X_basico, y_basico)

# --- 5. Keep the estimator that GridSearchCV refit with the best alpha ---
mejor_modelo_basico = grid_search.best_estimator_
print(f"Mejor alpha encontrado: {grid_search.best_params_['alpha']}")

# --- 6. Predict on the held-out validation split and score the predictions ---
y_pred = mejor_modelo_basico.predict(X_basico_val)
mae = mean_absolute_error(y_basico_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_basico_val, y_pred))
r2 = r2_score(y_basico_val, y_pred)

# --- 7. Report the validation metrics ---
print(f"📊 MAE:  {mae:.2f} USD")
print(f"📉 RMSE: {rmse:.2f} USD")
print(f"📈 R²:   {r2:.4f}")


Mejor alpha encontrado: 0.01
📊 MAE:  3756.90 USD
📉 RMSE: 5343.57 USD
📈 R²:   0.7469


In [2]:
# Load the intermediate dataset (the file name indicates outliers were removed).
df_intermedio = pd.read_csv("../intermedio_sin_outliers.csv")

# --- 1. Split the frame into the target (y) and the feature matrix (X) ---
y = df_intermedio["Precio_usd"].values
X = df_intermedio.drop(columns=["Precio_usd"]).values

# --- 2. Hold out 20% of the rows for validation (fixed seed for reproducibility) ---
X_inter, X_inter_val, y_inter, y_inter_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3. Base estimator and the regularization strengths to try ---
modelo_inter = Ridge()
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}

# --- 4. 5-fold cross-validated grid search, scored by negative MSE ---
grid_search = GridSearchCV(
    estimator=modelo_inter,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)
grid_search.fit(X_inter, y_inter)

# --- 5. Keep the estimator that GridSearchCV refit with the best alpha ---
mejor_modelo_inter = grid_search.best_estimator_
print(f"Mejor alpha encontrado: {grid_search.best_params_['alpha']}")

# --- 6. Predict on the held-out validation split and score the predictions ---
y_pred = mejor_modelo_inter.predict(X_inter_val)
mae = mean_absolute_error(y_inter_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_inter_val, y_pred))
r2 = r2_score(y_inter_val, y_pred)

# --- 7. Report the validation metrics ---
print(f"📊 MAE:  {mae:.2f} USD")
print(f"📉 RMSE: {rmse:.2f} USD")
print(f"📈 R²:   {r2:.4f}")


Mejor alpha encontrado: 0.1
📊 MAE:  3112.29 USD
📉 RMSE: 4399.84 USD
📈 R²:   0.8307


In [3]:
# Load the final dataset.
df_final = pd.read_csv("../dataset_final.csv")

# --- 1. Split the frame into the target (y) and the feature matrix (X) ---
y = df_final["Precio_usd"].values
X = df_final.drop(columns=["Precio_usd"]).values

# --- 2. Hold out 20% of the rows for validation (fixed seed for reproducibility) ---
X_final, X_final_val, y_final, y_final_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 3. Base estimator and the regularization strengths to try ---
modelo_final = Ridge()
param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}

# --- 4. 5-fold cross-validated grid search, scored by negative MSE ---
grid_search = GridSearchCV(
    estimator=modelo_final,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)
grid_search.fit(X_final, y_final)

# --- 5. Keep the estimator that GridSearchCV refit with the best alpha ---
mejor_modelo_final = grid_search.best_estimator_
print(f"Mejor alpha encontrado: {grid_search.best_params_['alpha']}")

# --- 6. Predict on the held-out validation split and score the predictions ---
y_pred = mejor_modelo_final.predict(X_final_val)
mae = mean_absolute_error(y_final_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_final_val, y_pred))
r2 = r2_score(y_final_val, y_pred)

# --- 7. Report the validation metrics ---
print(f"📊 MAE:  {mae:.2f} USD")
print(f"📉 RMSE: {rmse:.2f} USD")
print(f"📈 R²:   {r2:.4f}")


Mejor alpha encontrado: 0.1
📊 MAE:  3606.99 USD
📉 RMSE: 4956.91 USD
📈 R²:   0.7851


In [4]:
# Load the test set and discard the 'Combustible_Diésel.1' column when present
# (errors='ignore' makes the drop a no-op if the column is absent).
# NOTE(review): the '.1' suffix looks like a duplicated-header artifact; confirm upstream.
df_test = pd.read_csv("../pf_dataset_test_final.csv").drop(
    columns=['Combustible_Diésel.1'], errors='ignore'
)

In [5]:
# Use the intermediate model, which reported the best validation metrics.
# NOTE(review): each model was validated on rows from a different dataset, so the
# three sets of metrics are not strictly comparable; confirm this choice.
#
# The model was fitted on a bare NumPy array, so predict() matches features by
# position only. Select the training feature columns by name, in training order,
# so a reordered or extra column in the test file cannot silently corrupt the
# predictions, and fail loudly if a training column is missing.
columnas_entrenamiento = df_intermedio.drop(columns=["Precio_usd"]).columns
columnas_faltantes = columnas_entrenamiento.difference(df_test.columns)
if len(columnas_faltantes) > 0:
    raise ValueError(
        f"df_test no contiene las columnas de entrenamiento: {list(columnas_faltantes)}"
    )

# Pass an array (as in fit) to avoid scikit-learn's feature-name mismatch warning.
pred_test = mejor_modelo_inter.predict(df_test[columnas_entrenamiento].values)



In [6]:
# Save the predictions as a single-column CSV, without the row index.
pred_df = pd.DataFrame({"Precio_usd_predicho": pred_test})
pred_df.to_csv("predicciones_linear_reg.csv", index=False)