## Importando Bibliotecas e primeiras modificações

#### Adicionando escalonamento

In [0]:
import warnings
# Silences every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from scikit-learn / MLflow,
# which makes API removals surface as hard errors later — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [0]:
%pip install --upgrade mlflow

In [0]:
%restart_python

In [0]:
import mlflow
# NOTE(review): `db_connect` is not referenced in the visible cells — confirm whether
# this import is still needed (e.g. for its import-time side effects).
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils

# Workaround to set the registry URI manually: the private MLflow helper that
# derives the registry URI from the Spark session is replaced by a function that
# always returns "databricks-uc".
# NOTE(review): this patches a private (underscore-prefixed) MLflow function, so it
# may break on an MLflow upgrade — the notebook installs the latest version above.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"

mlflow.login() # This prints an INFO-log: Login successful!
# mlflow.set_model_uri("databricks")

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
from sklearn.impute import KNNImputer
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
import pyspark.sql.functions as F

In [0]:
def get_data(
    table="workspace.ml_datasets.house_prediction_train",
    test_size=0.2,
    random_state=42,
):
    """Load the house-price table and return a train/test split.

    Parameters
    ----------
    table : str
        Name of the Spark table to read. It must contain the columns
        ``SalePrice`` (target) and ``Id`` (dropped from the features).
    test_size : float
        Fraction of rows held out for testing; the rest is used for training.
    random_state : int
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as pandas objects.
    """
    # `spark` is the session object provided by the notebook runtime.
    train = spark.read.table(table).toPandas()
    X = train.drop(columns=["SalePrice", "Id"])
    y = train["SalePrice"]
    # train_size is left unset: it defaults to the complement of test_size.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = get_data()

#### Traduzindo dados categóricos

In [0]:
# Mapping (based on an AI-generated answer) from the dataset's category codes to
# more descriptive names.
# NOTE: this is a single flat dict shared by every column, so a code can only have
# one meaning. "Mod" and "Sev" appear twice below (Land Slope and Functional); in a
# Python dict literal the LAST occurrence wins, so the Land Slope labels are
# silently discarded.
value_mapping = {
    # Road and Alley Surfaces
    "Grvl": "Gravel",
    "Pave": "Paved",
    "NA": "No Access",  # Used for alleys and basements
    # Lot Shape
    "Reg": "Regular",
    "IR1": "Slightly Irregular",
    "IR2": "Moderately Irregular",
    "IR3": "Irregular",
    # Land Contour
    "Lvl": "Near Flat/Level",
    "Bnk": "Banked",
    "HLS": "Hillside",
    "Low": "Depression",
    # Utilities
    "AllPub": "All Public Utilities",
    "NoSewr": "No Sewer (Septic Tank)",
    "NoSeWa": "No Sewer or Water",
    "ELO": "Electricity Only",
    # Lot Config
    "Inside": "Inside Lot",
    "Corner": "Corner Lot",
    "CulDSac": "Cul-de-sac",
    "FR2": "Frontage on 2 Sides",
    "FR3": "Frontage on 3 Sides",
    # Land Slope
    "Gtl": "Gentle Slope",
    "Mod": "Moderate Slope",  # overridden by the "Mod" entry under Functional
    "Sev": "Severe Slope",  # overridden by the "Sev" entry under Functional
    # Condition (Condition1 & Condition2)
    "Artery": "Adjacent to Arterial Street",
    "Feedr": "Adjacent to Feeder Street",
    "Norm": "Normal",
    "RRNn": "Near N-S Railroad",
    "RRAn": "Adjacent to N-S Railroad",
    "RRNe": "Near E-W Railroad",
    "RRAe": "Adjacent to E-W Railroad",
    "PosN": "Near Positive Off-site Feature",
    "PosA": "Adjacent to Positive Off-site Feature",
    # Building Type
    "1Fam": "Single-family Detached",
    "2FmCon": "Two-family Conversion",
    "Duplx": "Duplex",
    "TwnhsE": "Townhouse End Unit",
    "TwnhsI": "Townhouse Inside Unit",
    # House Style
    "1Story": "One Story",
    "1.5Fin": "One and Half Story Finished",
    "1.5Unf": "One and Half Story Unfinished",
    "2Story": "Two Story",
    "2.5Fin": "Two and Half Story Finished",
    "2.5Unf": "Two and Half Story Unfinished",
    "SFoyer": "Split Foyer",
    "SLvl": "Split Level",
    # Quality and Condition Ratings
    "Ex": "Excellent",
    "Gd": "Good",
    "TA": "Typical/Average",
    "Fa": "Fair",
    "Po": "Poor",
    # Basement Specific
    "Av": "Average Exposure",
    "Mn": "Minimum Exposure",
    "No": "No Exposure",
    "GLQ": "Good Living Quarters",
    "ALQ": "Average Living Quarters",
    "BLQ": "Below Average Living Quarters",
    "Rec": "Recreation Room",
    "LwQ": "Low Quality",
    "Unf": "Unfinished",
    # Heating
    "Floor": "Floor Furnace",
    "GasA": "Gas Forced Warm Air",
    "GasW": "Gas Hot Water or Steam",
    "Grav": "Gravity Furnace",
    "OthW": "Other Water Heater",
    "Wall": "Wall Furnace",
    # Central Air
    "Y": "Yes",
    "N": "No",
    # Electrical
    "SBrkr": "Standard Circuit Breakers",
    "FuseA": "Fuse Box >60AMP + Romex",
    "FuseF": "60AMP Fuse Box + Mostly Romex",
    "FuseP": "60AMP + Mostly Knob & Tube",
    "Mix": "Mixed Wiring",
    # Kitchen Quality
    # Already mapped above: Ex, Gd, TA, Fa, Po
    # Functional
    "Typ": "Typical Functionality",
    "Min1": "Minor Deductions 1",
    "Min2": "Minor Deductions 2",
    "Mod": "Moderate Deductions",  # duplicate key: this value is the one kept
    "Maj1": "Major Deductions 1",
    "Maj2": "Major Deductions 2",
    "Sev": "Severely Damaged",  # duplicate key: this value is the one kept
    "Sal": "Salvage Only",
    # Fireplace Quality
    # Already mapped above: Ex, Gd, TA, Fa, Po, NA
    # Garage Type
    "2Types": "More than One Type",
    "Attchd": "Attached",
    "Basment": "Basement",
    "BuiltIn": "Built-In",
}

# Descriptive names for the zoning codes.
# NOTE(review): not referenced by any visible cell — presumably intended for a
# zoning column via MappingValues(map_zoneamento, col=...); confirm against usage.
map_zoneamento = {
    "A": "Agriculture",
    "C": "Commercial",
    "FV": "Floating Village Residential",
    "I": "Industrial",
    "RH": "Residential High Density",
    "RL": "Residential Low Density",
    # Trailing space removed so the label compares equal to its clean form.
    "RP": "Residential Low Density Park",
    "RM": "Residential Medium Density",
}

## Primeiras Transformações

In [0]:
class MappingValues(BaseEstimator, TransformerMixin):
    """Stateless transformer that renames values according to a lookup dict.

    Parameters
    ----------
    dicionario : dict
        Passed as-is to pandas ``replace``.
    col : str, optional
        When given, only this column is rewritten; otherwise the replacement is
        applied to the whole DataFrame.
    """

    def __init__(self, dicionario, col: Optional[str] = None):
        self.dicionario = dicionario
        self.col = col

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the mapped values replaced."""
        result = X.copy()
        if self.col is None:
            # No column restriction: replace across every column.
            return result.replace(self.dicionario)
        result[self.col] = result[self.col].replace(self.dicionario)
        return result

class numerical_only(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numeric columns of a DataFrame."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return the numeric-dtype columns of a copy of ``X``."""
        return X.copy().select_dtypes(include="number")

class NullReplacer(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns with the string ``"None"``.

    Parameters
    ----------
    columns : str or iterable of str
        Column name(s) to fill. Names that are not present in the input frame
        are skipped silently.
    """

    def __init__(self, columns):
        # Stored exactly as given. Normalising here would be bypassed by
        # set_params(columns="x"), after which transform would iterate over the
        # characters of the string instead of over column names.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in the selected columns set to "None"."""
        # A single name is treated as a one-element list.
        columns = [self.columns] if isinstance(self.columns, str) else self.columns

        X_transformed = X.copy()
        for col in columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna("None")

        return X_transformed

# Columns whose missing values NullReplacer fills with the string "None".
# Each name is listed once (the original repeated the three garage columns).
# NOTE(review): "GarageYrBlt" looks like a year column — filling it with the string
# "None" would turn it into a non-numeric column; confirm this is intended.
lista = [
    "MiscFeature", "Alley", "Fence", "MasVnrType", "FireplaceQu",
    "GarageCond", "GarageQual", "GarageFinish", "GarageType", "GarageYrBlt",
    "BsmtExposure", "BsmtFinType2", "BsmtCond", "BsmtFinType1", "BsmtQual",
    "PoolQC",
]


## Apenas dados numéricos

#### Regressão Linear básica

In [0]:
with mlflow.start_run(run_name="Linear_regression_standard_scaler"):

    # Fresh split, so the cell does not depend on variables left by other cells.
    X_train, X_test, y_train, y_test = get_data()

    # LinearRegression hyperparameters, logged so the run is self-describing.
    fit_intercept = True
    copy_X = True
    n_jobs = None
    positive = False
    params = {
        "fit_intercept": fit_intercept,
        "copy_X": copy_X,
        "n_jobs": n_jobs,
        "positive": positive,
    }
    mlflow.log_params(params)

    # Preprocessing: rename category codes, keep numeric columns, fill nulls, scale.
    # NOTE(review): 'replace_null' runs after 'numerical_only', so it can only touch
    # numeric columns named in `lista` — confirm the intended step order.
    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
        ('scaler', StandardScaler())
    ])

    # StandardScaler returns a bare array; wrap it so the original row index is kept.
    X_train_np = pipeline.fit_transform(X_train)
    feature_names = [f'feature_{i}' for i in range(X_train_np.shape[1])]
    X_train_scaled = pd.DataFrame(X_train_np, index=X_train.index, columns=feature_names)
    X_test_scaled = pd.DataFrame(pipeline.transform(X_test), index=X_test.index, columns=feature_names)

    # Train model
    lin_reg = LinearRegression(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=n_jobs, positive=positive)
    lin_reg.fit(X_train_scaled, y_train)

    # Log model. `name=` matches the other cells (`artifact_path=` is deprecated);
    # a few rows are enough for the signature, no need to store the training set.
    mlflow.sklearn.log_model(
        lin_reg,
        name="Linear_regression_standard_scaler",
        input_example=X_train_scaled.head(5),
    )

    # Predict
    predictions = lin_reg.predict(X_test_scaled)

    # Metrics. The `squared=False` argument was removed from recent scikit-learn
    # releases, so the root is taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    # Legacy key: earlier runs stored this same RMSE value under "mse".
    mlflow.log_metric("mse", rmse)
    mlflow.log_metric("r2", r2)

    # Save predictions
    predictions_df = pd.DataFrame({"prediction": predictions, "real": y_test})
    predictions_df.to_csv("predictions.csv", index=False)
    mlflow.log_artifact("predictions.csv")

    # Residual plot (residuals keep the test-set row index)
    residuals = predictions - y_test
    fig, ax = plt.subplots()
    sns.scatterplot(x=residuals.index, y=residuals.values, ax=ax)
    ax.set(xlabel="Observation", ylabel="Residual", title="Residuals")
    fig.savefig("residuals_plot.png")
    plt.close(fig)
    mlflow.log_artifact("residuals_plot.png")

    # Real vs Predicted plot; the dashed line marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set(xlabel="Valor real da casa", ylabel="Preço previsto", title="Real x Previsto")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("pred_vs_actual.png")
    plt.close(fig)
    mlflow.log_artifact("pred_vs_actual.png")

    # Print summary
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)

In [0]:
class DataFramePolynomialFeatures(BaseEstimator, TransformerMixin):
    """``PolynomialFeatures`` wrapper that returns a labelled DataFrame.

    Parameters
    ----------
    degree, include_bias, interaction_only
        Forwarded unchanged to ``sklearn.preprocessing.PolynomialFeatures``.

    Attributes
    ----------
    poly : PolynomialFeatures
        The fitted inner transformer (available after ``fit``).
    feature_names : array of str
        Names of the generated columns (available after ``fit``).
    """

    def __init__(self, degree=2, include_bias=True, interaction_only=False):
        # Only store the hyperparameters. Building the inner transformer here
        # would leave it stale after set_params(degree=...).
        self.degree = degree
        self.include_bias = include_bias
        self.interaction_only = interaction_only

    def fit(self, X, y=None):
        """Fit the polynomial expansion on DataFrame ``X``."""
        # Built at fit time so it always reflects the current hyperparameters.
        self.poly = PolynomialFeatures(
            degree=self.degree,
            include_bias=self.include_bias,
            interaction_only=self.interaction_only
        )
        self.poly.fit(X)
        self.feature_names = self.poly.get_feature_names_out(input_features=X.columns)
        return self

    def transform(self, X):
        """Return the expanded features as a DataFrame that keeps ``X``'s index."""
        X_poly = self.poly.transform(X)
        return pd.DataFrame(X_poly, columns=self.feature_names, index=X.index)


#### Ridge 3rd degree

In [0]:
X_train, X_test, y_train, y_test = get_data()

# NOTE(review): the run/model name mentions "standard_scaler" but the pipeline below
# has no scaling step — confirm whether a scaler was meant to be included.
with mlflow.start_run(run_name="Ridge_numeric_3nd_degree_standard_scaler"):
    # Ridge hyperparameters actually passed to the model.
    fit_intercept = True
    copy_X = True

    params = {"fit_intercept": fit_intercept,
              "copy_X": copy_X}

    mlflow.log_params(params)

    pipeline = Pipeline(
        steps=[
            ("mappingvalues", MappingValues(value_mapping)),
            ("numerical_only", numerical_only()),
            ("replace_null", NullReplacer(lista)),
            ("polynomial",DataFramePolynomialFeatures(degree=3, include_bias=True, interaction_only=True))
        ]
    )

    # The last step returns a DataFrame, so row index and column names are kept.
    X_train_poly = pipeline.fit_transform(X_train)

    lin_reg = Ridge(fit_intercept=fit_intercept, copy_X=copy_X)
    lin_reg.fit(X_train_poly, y_train)

    # The degree-3 expansion is very wide; a few rows suffice as the input example.
    logged_model = mlflow.sklearn.log_model(
        lin_reg, name="Ridge_numeric_3nd_degree_standard_scaler", input_example=X_train_poly.head(5)
    )

    X_test_poly = pipeline.transform(X_test)
    predictions = lin_reg.predict(X_test_poly)

    # `squared=False` was removed from recent scikit-learn releases, so the root is
    # taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    # Legacy key: earlier runs stored this same RMSE value under "mse".
    mlflow.log_metric("mse", rmse)
    mlflow.log_metric("r2", r2)

    # Write this run's predictions before logging; previously the file left on disk
    # by another cell was uploaded instead.
    predictions_df = pd.DataFrame({"prediction": predictions, "real": y_test})
    predictions_df.to_csv("predictions.csv", index=False)
    mlflow.log_artifact("predictions.csv")

    # Residual plot (residuals keep the test-set row index)
    residuals = predictions - y_test
    fig, ax = plt.subplots()
    sns.scatterplot(x=residuals.index, y=residuals.values, ax=ax)
    ax.set(xlabel="Observation", ylabel="Residual", title="Residuals")
    fig.savefig("residuals_plot.png")
    plt.close(fig)
    mlflow.log_artifact("residuals_plot.png")

    # Predicted vs actual; the dashed line marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set(xlabel="Actual", ylabel="Predicted", title="Predicted vs Actual")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("pred_vs_actual.png")
    plt.close(fig)
    mlflow.log_artifact("pred_vs_actual.png")

    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)


#### Lasso

In [0]:


# Get data
X_train, X_test, y_train, y_test = get_data()

with mlflow.start_run(run_name="Lasso_numeric_3nd_degree"):

    # Lasso hyperparameters. Only values actually passed to the model are logged
    # (Lasso has no `n_jobs` parameter).
    fit_intercept = True
    copy_X = True
    positive = False

    params = {
        "fit_intercept": fit_intercept,
        "copy_X": copy_X,
        "positive": positive
    }
    mlflow.log_params(params)

    # Define pipeline
    pipeline = Pipeline(steps=[
        ("mappingvalues", MappingValues(value_mapping)),
        ("numerical_only", numerical_only()),
        ("replace_null", NullReplacer(lista)),
        ("polynomial", DataFramePolynomialFeatures(degree=3, include_bias=True, interaction_only=True))
    ])

    # The last step already returns a DataFrame with the right index. It must NOT
    # be re-wrapped in pd.DataFrame(..., columns=new_names): pandas would reindex
    # to those non-existent columns and produce an all-NaN matrix.
    X_train_poly = pipeline.fit_transform(X_train)

    # Fit Lasso model
    lin_reg = Lasso(fit_intercept=fit_intercept, copy_X=copy_X, positive=positive)
    lin_reg.fit(X_train_poly, y_train)

    # Log model. `name=` matches the other cells (`artifact_path=` is deprecated);
    # the expansion is very wide, so only a few rows are stored as input example.
    mlflow.sklearn.log_model(
        lin_reg,
        name="Lasso_numeric_3nd_degree_numeric_only",
        input_example=X_train_poly.head(5)
    )

    # Transform test set and predict
    X_test_poly = pipeline.transform(X_test)
    predictions = lin_reg.predict(X_test_poly)

    # Metrics. `squared=False` was removed from recent scikit-learn releases, so
    # the root is taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    # Legacy key: earlier runs stored this same RMSE value under "mse".
    mlflow.log_metric("mse", rmse)
    mlflow.log_metric("r2", r2)

    # Save predictions to CSV
    predictions_df = pd.DataFrame({"prediction": predictions, "real": y_test})
    predictions_df.to_csv("predictions.csv", index=False)
    mlflow.log_artifact("predictions.csv")

    # Residual plot (residuals keep the test-set row index)
    residuals = predictions - y_test
    fig, ax = plt.subplots()
    sns.scatterplot(x=residuals.index, y=residuals.values, ax=ax)
    ax.set(xlabel="Observation", ylabel="Residual", title="Residuals")
    fig.savefig("residuals_plot.png")
    plt.close(fig)
    mlflow.log_artifact("residuals_plot.png")

    # Real vs Predicted plot; the dashed line marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set(xlabel="Actual", ylabel="Predicted", title="Predicted vs Actual")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("pred_vs_actual.png")
    plt.close(fig)
    mlflow.log_artifact("pred_vs_actual.png")

    # Print metrics
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)


#### Decision Trees

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Your custom transformers and dataset
# from your_module import MappingValues, numerical_only, NullReplacer
# from your_data_module import get_data, value_mapping, lista

# Get data
X_train, X_test, y_train, y_test = get_data()

with mlflow.start_run(run_name="Decision_tree_numeric_only"):

    # Hyperparameters
    criterion = "squared_error"
    max_depth = 10
    min_samples_split = 15
    min_samples_leaf = 5
    random_state = 42

    # Log parameters
    params = {
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "random_state": random_state
    }
    mlflow.log_params(params)

    # Define pipeline
    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
        ('scaler', StandardScaler())
    ])

    # StandardScaler returns a bare array; wrap it so the original row index is kept.
    X_train_np = pipeline.fit_transform(X_train)
    feature_names = [f'feature_{i}' for i in range(X_train_np.shape[1])]
    X_train_scaled = pd.DataFrame(X_train_np, index=X_train.index, columns=feature_names)
    X_test_scaled = pd.DataFrame(pipeline.transform(X_test), index=X_test.index, columns=feature_names)

    # Train decision tree model
    decision_tree = DecisionTreeRegressor(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )
    decision_tree.fit(X_train_scaled, y_train)

    # Log model. `name=` matches the other cells (`artifact_path=` is deprecated);
    # a few rows are enough for the signature, no need to store the training set.
    mlflow.sklearn.log_model(
        decision_tree,
        name="decision-tree-numeric_only",
        input_example=X_train_scaled.head(5),
    )

    # Predict
    predictions = decision_tree.predict(X_test_scaled)

    # Metrics. `squared=False` was removed from recent scikit-learn releases, so
    # the root is taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    # Legacy key: earlier runs stored this same RMSE value under "mse".
    mlflow.log_metric("mse", rmse)
    mlflow.log_metric("r2", r2)

    # Save predictions
    pd.DataFrame({"prediction": predictions, "real": y_test}).to_csv("predictions.csv", index=False)
    mlflow.log_artifact("predictions.csv")

    # Residual plot (residuals keep the test-set row index)
    residuals = predictions - y_test
    fig, ax = plt.subplots()
    sns.scatterplot(x=residuals.index, y=residuals.values, ax=ax)
    ax.set(xlabel="Observation", ylabel="Residual", title="Residuals")
    fig.savefig("residuals_plot.png")
    plt.close(fig)
    mlflow.log_artifact("residuals_plot.png")

    # Real vs Predicted plot; the dashed line marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set(xlabel="Actual", ylabel="Predicted", title="Predicted vs Actual")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("pred_vs_actual.png")
    plt.close(fig)
    mlflow.log_artifact("pred_vs_actual.png")

    # Print metrics
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)


#### Random Forest

In [0]:
X_train, X_test, y_train, y_test = get_data()

with mlflow.start_run(run_name="random_forest_numeric_only"):

    # Hyperparameters. random_state is fixed (and logged) so the run is reproducible.
    n_estimators = 100
    max_depth = 6
    max_features = 3
    random_state = 42
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "random_state": random_state,
    }
    mlflow.log_params(params)

    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
        ('scaler', StandardScaler())
    ])

    # StandardScaler returns a bare array with no `.index`; wrap it in a DataFrame
    # that carries the original row index instead of reading `.index` off the array.
    X_train_np = pipeline.fit_transform(X_train)
    feature_names = [f'feature_{i}' for i in range(X_train_np.shape[1])]
    X_train_scaled = pd.DataFrame(X_train_np, index=X_train.index, columns=feature_names)
    X_test_scaled = pd.DataFrame(pipeline.transform(X_test), index=X_test.index, columns=feature_names)

    # Create and train model.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=random_state,
    )
    rf.fit(X_train_scaled, y_train)

    # A few rows are enough for the signature, no need to store the training set.
    logged_model = mlflow.sklearn.log_model(
        rf, name="random-forest-numeric_only", input_example=X_train_scaled.head(5)
    )

    predictions = rf.predict(X_test_scaled)

    # Metrics. `squared=False` was removed from recent scikit-learn releases, so
    # the root is taken explicitly.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    # Legacy key: earlier runs stored this same RMSE value under "mse".
    mlflow.log_metric("mse", rmse)
    mlflow.log_metric("r2", r2)

    # Save the table of predicted values next to the real ones, in the same
    # two-column format the other model cells use.
    pd.DataFrame({"prediction": predictions, "real": y_test}).to_csv("predictions.csv", index=False)
    mlflow.log_artifact("predictions.csv")

    # Residual plot (residuals keep the test-set row index)
    residuals = predictions - y_test
    fig, ax = plt.subplots()
    sns.scatterplot(x=residuals.index, y=residuals.values, ax=ax)
    ax.set(xlabel="Observation", ylabel="Residual", title="Residuals")
    fig.savefig("residuals_plot.png")
    plt.close(fig)
    mlflow.log_artifact("residuals_plot.png")

    # Predicted vs actual; the dashed line marks a perfect prediction.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set(xlabel="Actual", ylabel="Predicted", title="Predicted vs Actual")
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("pred_vs_actual.png")
    plt.close(fig)
    mlflow.log_artifact("pred_vs_actual.png")

    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)