In [141]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import mlflow
import joblib
import os

In [142]:
# MLflow configuration: tracking server address and the experiment runs are filed under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "toyota_project"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the cell's final expression so the notebook renders the returned experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/833447958012810940', creation_time=1747431455974, experiment_id='833447958012810940', last_update_time=1747431455974, lifecycle_stage='active', name='toyota_project', tags={}>

In [143]:
# Load the dataset directly from its raw GitHub URL.
DATASET_URL = "https://raw.githubusercontent.com/dodobeatle/dataeng-datos/refs/heads/main/ToyotaCorolla.csv"
csv_options = {"encoding": "utf8", "engine": "python"}
df = pd.read_csv(DATASET_URL, **csv_options)

In [144]:
# Separate the target (Price) from the remaining columns, which become the predictors.
target_column = "Price"
y = df[target_column]
X = df.drop(columns=[target_column])

# Only the final expression of a cell is rendered, so the target preview is what shows.
y

0       13500
1       13750
2       13950
3       14950
4       13750
        ...  
1431     7500
1432    10845
1433     8500
1434     7250
1435     6950
Name: Price, Length: 1436, dtype: int64

In [145]:
# Build the design matrix: discard the target and the predictors left out of the model.
# (A disabled alternative kept an explicit whitelist instead:
#  Age_08_04, KM, Fuel_Type, HP, cc, Doors, Gears, Weight.)
excluded_columns = [
    "Price", "Age_08_04", "Model", "Met_Color", "Automatic", "Cylinders", "Radio",
    "Radio_cassette", "BOVAG_Guarantee", "Mfg_Month", "Backseat_Divider", "ABS",
    "Doors", "Guarantee_Period", "Mistlamps",
]
X = df.drop(columns=excluded_columns)

# Merge both airbag indicators into a single count (missing values count as 0),
# then remove the two source columns.
airbag_total = df["Airbag_1"].fillna(0) + df["Airbag_2"].fillna(0)
X = X.assign(Airbag_Count=airbag_total).drop(columns=["Airbag_1", "Airbag_2"])

# One-hot encode (dropping the first level of each category), then coerce every
# column -- and the target -- to numeric; values that cannot be parsed become NaN.
X = pd.get_dummies(X, drop_first=True).apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(y, errors="coerce")

# Join predictors and target so rows containing any NaN are dropped from both together.
combined = pd.concat([X, y], axis=1).dropna()

# Cast to float and prepend the constant (intercept) column expected by OLS.
y = combined["Price"].astype(float)
X = sm.add_constant(combined.drop(columns="Price").astype(float))

In [146]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.4, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [147]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Open an MLflow run; every metric and artifact logged below is attached to it.
with mlflow.start_run(run_name="ols_model_run"):
    # Fit ordinary least squares on the training split.
    model = sm.OLS(y_train, X_train)
    results = model.fit()

    # Predict on the held-out rows.
    y_pred = results.predict(X_test)

    # Metrics. mean_squared_error returns the MSE, so its square root is taken
    # to make the value logged under "rmse" an actual root-mean-squared error.
    rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2_score", r2)

    # Local staging folder for the files that are uploaded as run artifacts.
    os.makedirs("mlartifacts", exist_ok=True)

    # Save the regression summary as a text file.
    summary_path = os.path.join("mlartifacts", "ols_summary.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(results.summary().as_text())
    mlflow.log_artifact(summary_path)

    # Persist the fitted results object with joblib.
    model_path = os.path.join("mlartifacts", "ols_model.pkl")
    joblib.dump(results, model_path)
    mlflow.log_artifact(model_path)

    # Save the feature names, one per line, taken from the matrix the model was fit on.
    features_path = os.path.join("mlartifacts", "features.txt")
    with open(features_path, "w", encoding="utf-8") as f:
        f.write("\n".join(X_train.columns))
    mlflow.log_artifact(features_path)

    # =======================
    # VIF ANALYSIS
    # =======================

    # One variance inflation factor per column of the training design matrix.
    design_values = X_train.values
    vif_data = pd.DataFrame({
        "feature": X_train.columns,
        "VIF": [
            variance_inflation_factor(design_values, i)
            for i in range(design_values.shape[1])
        ],
    })

    vif_path = os.path.join("mlartifacts", "vif_report.csv")
    vif_data.to_csv(vif_path, index=False)
    mlflow.log_artifact(vif_path)

    # =======================
    # PLOTS
    # =======================

    # Plot: actual vs. predicted, with the y = x reference line in dashed red.
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', lw=2)
    plt.xlabel("Actual Price")
    plt.ylabel("Predicted Price")
    plt.title("Actual vs. Predicted Price")
    plt.grid(True)
    real_vs_pred_path = os.path.join("mlartifacts", "actual_vs_predicted.png")
    plt.savefig(real_vs_pred_path)
    mlflow.log_artifact(real_vs_pred_path)
    plt.close()

    # Plot: residuals (actual minus predicted) against the predicted value.
    residuals = y_test - y_pred
    plt.figure(figsize=(8, 6))
    plt.scatter(y_pred, residuals, alpha=0.6, edgecolors='k')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Predicted Price")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.grid(True)
    residuals_path = os.path.join("mlartifacts", "residuals_plot.png")
    plt.savefig(residuals_path)
    mlflow.log_artifact(residuals_path)
    plt.close()

    # =======================
    # CORRELATION MATRIX
    # =======================

    # Pairwise correlations between the training predictors and the training target.
    plt.figure(figsize=(16, 12))
    corr_matrix = pd.concat([X_train, y_train], axis=1).corr()
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        cbar_kws={"shrink": 0.75},
        linewidths=0.5,
        annot_kws={"size": 10}
    )
    plt.title("Correlation Matrix", fontsize=16)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)

    corr_matrix_path = os.path.join("mlartifacts", "correlation_matrix.png")
    plt.tight_layout()
    plt.savefig(corr_matrix_path)
    mlflow.log_artifact(corr_matrix_path)
    plt.close()


print("Modelo OLS registrado con MLflow.")


🏃 View run ols_model_run at: http://localhost:5000/#/experiments/833447958012810940/runs/8cfba474b53b454f92ba10c26908edd9
🧪 View experiment at: http://localhost:5000/#/experiments/833447958012810940
Modelo OLS registrado con MLflow.
