## Importando Bibliotecas e primeiras modificações

#### Importando Bibliotecas

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
%pip install --upgrade mlflow

In [0]:
%restart_python

In [0]:
import mlflow
import databricks.connect as db_connect
import mlflow.tracking._model_registry.utils

# Workaround to set the registry URI manually: MLflow's private helper
# `_get_registry_uri_from_spark_session` is replaced by a stub that always
# returns "databricks-uc".
# NOTE(review): this patches an underscore-prefixed (private) API, so it may stop
# working after `%pip install --upgrade mlflow` — confirm it is still required.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"

mlflow.login() # This prints an INFO-log: Login successful!
# mlflow.set_model_uri("databricks")

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
from sklearn.impute import KNNImputer
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
import pyspark.sql.functions as F

In [0]:
def get_data():
    """Load the house-price training table and split it into train/test sets.

    Reads ``workspace.ml_datasets.house_prediction_train`` through the
    notebook's ``spark`` session, converts it to pandas, separates the
    ``SalePrice`` target and drops the ``Id`` column from the features, then
    performs an 80/20 split with ``random_state=42``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    tabela = spark.read.table("workspace.ml_datasets.house_prediction_train").toPandas()
    alvo = tabela["SalePrice"]
    atributos = tabela.drop(columns=["SalePrice", "Id"])
    atributos_treino, atributos_teste, alvo_treino, alvo_teste = train_test_split(
        atributos,
        alvo,
        test_size=0.2,
        train_size=0.8,
        random_state=42,
    )
    return atributos_treino, atributos_teste, alvo_treino, alvo_teste


X_train, X_test, y_train, y_test = get_data()

#### Traduzindo dados categóricos

In [0]:
# Mapping (based on an AI-generated answer) that turns the coded category values
# into more descriptive names.
# NOTE(review): this is a single flat dict, so a code that is listed twice keeps
# only its LAST value. "Mod" and "Sev" appear under both "Land Slope" and
# "Functional"; the effective values are therefore "Moderate Deductions" and
# "Severely Damaged" for every column, including the land-slope column.
# A per-column mapping would be needed to keep both meanings.
value_mapping = {
    # Road and Alley Surfaces
    "Grvl": "Gravel",
    "Pave": "Paved",
    "NA": "No Access",  # Used for alleys and basements
    # Lot Shape
    "Reg": "Regular",
    "IR1": "Slightly Irregular",
    "IR2": "Moderately Irregular",
    "IR3": "Irregular",
    # Land Contour
    "Lvl": "Near Flat/Level",
    "Bnk": "Banked",
    "HLS": "Hillside",
    "Low": "Depression",
    # Utilities
    "AllPub": "All Public Utilities",
    "NoSewr": "No Sewer (Septic Tank)",
    "NoSeWa": "No Sewer or Water",
    "ELO": "Electricity Only",
    # Lot Config
    "Inside": "Inside Lot",
    "Corner": "Corner Lot",
    "CulDSac": "Cul-de-sac",
    "FR2": "Frontage on 2 Sides",
    "FR3": "Frontage on 3 Sides",
    # Land Slope
    # (the "Mod" and "Sev" entries below are overridden by the "Functional" section)
    "Gtl": "Gentle Slope",
    "Mod": "Moderate Slope",
    "Sev": "Severe Slope",
    # Condition (Condition1 & Condition2)
    "Artery": "Adjacent to Arterial Street",
    "Feedr": "Adjacent to Feeder Street",
    "Norm": "Normal",
    "RRNn": "Near N-S Railroad",
    "RRAn": "Adjacent to N-S Railroad",
    "RRNe": "Near E-W Railroad",
    "RRAe": "Adjacent to E-W Railroad",
    "PosN": "Near Positive Off-site Feature",
    "PosA": "Adjacent to Positive Off-site Feature",
    # Building Type
    "1Fam": "Single-family Detached",
    "2FmCon": "Two-family Conversion",
    "Duplx": "Duplex",
    "TwnhsE": "Townhouse End Unit",
    "TwnhsI": "Townhouse Inside Unit",
    # House Style
    "1Story": "One Story",
    "1.5Fin": "One and Half Story Finished",
    "1.5Unf": "One and Half Story Unfinished",
    "2Story": "Two Story",
    "2.5Fin": "Two and Half Story Finished",
    "2.5Unf": "Two and Half Story Unfinished",
    "SFoyer": "Split Foyer",
    "SLvl": "Split Level",
    # Quality and Condition Ratings
    "Ex": "Excellent",
    "Gd": "Good",
    "TA": "Typical/Average",
    "Fa": "Fair",
    "Po": "Poor",
    # Basement Specific
    "Av": "Average Exposure",
    "Mn": "Minimum Exposure",
    "No": "No Exposure",
    "GLQ": "Good Living Quarters",
    "ALQ": "Average Living Quarters",
    "BLQ": "Below Average Living Quarters",
    "Rec": "Recreation Room",
    "LwQ": "Low Quality",
    "Unf": "Unfinished",
    # Heating
    "Floor": "Floor Furnace",
    "GasA": "Gas Forced Warm Air",
    "GasW": "Gas Hot Water or Steam",
    "Grav": "Gravity Furnace",
    "OthW": "Other Water Heater",
    "Wall": "Wall Furnace",
    # Central Air
    "Y": "Yes",
    "N": "No",
    # Electrical
    "SBrkr": "Standard Circuit Breakers",
    "FuseA": "Fuse Box >60AMP + Romex",
    "FuseF": "60AMP Fuse Box + Mostly Romex",
    "FuseP": "60AMP + Mostly Knob & Tube",
    "Mix": "Mixed Wiring",
    # Kitchen Quality
    # Already mapped above: Ex, Gd, TA, Fa, Po
    # Functional
    # ("Mod" and "Sev" repeat keys from "Land Slope"; these later values win)
    "Typ": "Typical Functionality",
    "Min1": "Minor Deductions 1",
    "Min2": "Minor Deductions 2",
    "Mod": "Moderate Deductions",
    "Maj1": "Major Deductions 1",
    "Maj2": "Major Deductions 2",
    "Sev": "Severely Damaged",
    "Sal": "Salvage Only",
    # Fireplace Quality
    # Already mapped above: Ex, Gd, TA, Fa, Po, NA
    # Garage Type
    "2Types": "More than One Type",
    "Attchd": "Attached",
    "Basment": "Basement",
    "BuiltIn": "Built-In",
}

# Descriptive labels for the zoning classification codes.
# Labels carry no leading/trailing whitespace so they compare and group cleanly.
map_zoneamento = {
    "A": "Agriculture",
    "C": "Commercial",
    "FV": "Floating Village Residential",
    "I": "Industrial",
    "RH": "Residential High Density",
    "RL": "Residential Low Density",
    "RP": "Residential Low Density Park",
    "RM": "Residential Medium Density",
}

## Primeiras Transformações

In [0]:
class MappingValues(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces values through a lookup dictionary.

    Args:
        dicionario: Mapping handed to pandas ``replace``.
        col: Optional column name. When given, only that column is replaced;
            otherwise the replacement is applied to the whole frame.
    """

    def __init__(self, dicionario, col: Optional[str] = None):
        self.col = col
        self.dicionario = dicionario

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the mapped values replaced."""
        resultado = X.copy()
        if self.col is None:
            return resultado.replace(self.dicionario)
        resultado[self.col] = resultado[self.col].replace(self.dicionario)
        return resultado

class numerical_only(BaseEstimator,TransformerMixin):
    """Stateless transformer that keeps only the numeric-dtype columns."""

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,X):
        """Return the numeric columns taken from a copy of ``X``."""
        return X.copy().select_dtypes(include="number")

class NullReplacer(BaseEstimator,TransformerMixin):
    """Fill missing values of selected columns with the string ``"None"``.

    Args:
        columns: A column name or a list of column names. A single string is
            wrapped into a one-element list. Names that are absent from the
            frame passed to ``transform`` are ignored.
    """

    def __init__(self,columns):
        self.columns = [columns] if isinstance(columns,str) else columns

    def fit(self,X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with nulls in the configured columns filled."""
        preenchido = X.copy()
        for nome in self.columns:
            if nome not in preenchido.columns:
                continue
            preenchido[nome] = preenchido[nome].fillna("None")
        return preenchido

# Columns whose missing values are replaced by the string "None" (see NullReplacer).
# Each column is listed once; repeated names only caused redundant fill passes.
lista = [
    "MiscFeature", "Alley", "Fence", "MasVnrType", "FireplaceQu",
    "GarageCond", "GarageQual", "GarageFinish", "GarageType", "GarageYrBlt",
    "BsmtExposure", "BsmtFinType2", "BsmtCond", "BsmtFinType1", "BsmtQual",
    "PoolQC",
]


## Apenas dados numéricos

#### Regressão Linear básica

In [0]:
# Baseline experiment: ordinary least squares fitted on the numeric columns only.
with mlflow.start_run(run_name="Linear_regression_numerical_only"):

    X_train, X_test, y_train, y_test = get_data()

    fit_intercept = True
    copy_X = True
    n_jobs = None
    positive = False

    # Keys match the LinearRegression argument names so that runs can be
    # filtered and compared by hyperparameter in the MLflow UI.
    params = {
        "fit_intercept": fit_intercept,
        "copy_X": copy_X,
        "n_jobs": n_jobs,
        "positive": positive,
    }

    mlflow.log_params(params)

    # Preprocessing: relabel categories, keep numeric columns, fill listed nulls.
    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
    ])

    X_train = pipeline.fit_transform(X_train)
    y_train = y_train.loc[X_train.index]

    lin_reg = LinearRegression(fit_intercept=fit_intercept, copy_X=copy_X, n_jobs=n_jobs, positive=positive)
    lin_reg.fit(X_train, y_train)

    # Only a handful of rows are logged as the input example: the example is
    # serialised with the model, so passing the whole training frame would store
    # the full dataset alongside it.
    logged_model = mlflow.sklearn.log_model(
        lin_reg, name="Linear_regression_numerical_only", input_example=X_train.head(5)
    )

    X_test = pipeline.transform(X_test)
    y_test = y_test.loc[X_test.index]

    predictions = lin_reg.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # The predictions file must be written before it can be logged.
    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Valor real da casa")
    plt.ylabel("Preço previsto")
    plt.title("Real x Previsto")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)



In [0]:
class DataFramePolynomialFeatures(BaseEstimator, TransformerMixin):
    """``PolynomialFeatures`` wrapper that returns a labelled DataFrame.

    Args:
        degree: Degree of the polynomial expansion.
        include_bias: Whether to include the constant (bias) column.
        interaction_only: Whether to generate interaction terms only.

    Attributes:
        poly: The underlying ``PolynomialFeatures`` instance. It is rebuilt on
            every ``fit`` from the current hyperparameters.
        feature_names: Output column names, available after ``fit``.
    """

    def __init__(self, degree=2, include_bias=True, interaction_only=False):
        self.degree = degree
        self.include_bias = include_bias
        self.interaction_only = interaction_only
        self.poly = self._build_poly()

    def _build_poly(self):
        """Create a PolynomialFeatures configured from the current parameters."""
        return PolynomialFeatures(
            degree=self.degree,
            include_bias=self.include_bias,
            interaction_only=self.interaction_only,
        )

    def fit(self, X, y=None):
        """Fit the expansion on DataFrame ``X`` and record the output names."""
        # Rebuild here so hyperparameters changed after construction
        # (set_params, grid search) are not silently ignored.
        self.poly = self._build_poly()
        self.poly.fit(X)
        self.feature_names = self.poly.get_feature_names_out(input_features=X.columns)
        return self

    def transform(self, X):
        """Return the expanded features as a DataFrame sharing ``X``'s index."""
        X_poly = self.poly.transform(X)
        return pd.DataFrame(X_poly, columns=self.feature_names, index=X.index)


#### Ridge 3rd degree

In [0]:
X_train, X_test, y_train, y_test = get_data()

# Ridge regression on degree-3 interaction features of the numeric columns.
with mlflow.start_run(run_name="Ridge_numeric_3nd_degree"):
    fit_intercept = True
    copy_X = True

    # Keys match the Ridge argument names.
    params = {"fit_intercept": fit_intercept,
              "copy_X": copy_X}

    mlflow.log_params(params)

    pipeline = Pipeline(
        steps=[
            ("mappingvalues", MappingValues(value_mapping)),
            ("numerical_only", numerical_only()),
            ("replace_null", NullReplacer(lista)),
            ("polynomial", DataFramePolynomialFeatures(degree=3, include_bias=True, interaction_only=True)),
        ]
    )

    X_train = pipeline.fit_transform(X_train)
    y_train = y_train.loc[X_train.index]

    lin_reg = Ridge(fit_intercept=fit_intercept, copy_X=copy_X)
    lin_reg.fit(X_train, y_train)

    # Only a few rows are logged as the input example: after the polynomial
    # expansion the training frame is very wide, and the example is serialised
    # together with the model.
    logged_model = mlflow.sklearn.log_model(
        lin_reg, name="Ridge_numeric_3nd_degree_numeric_only", input_example=X_train.head(5)
    )

    X_test = pipeline.transform(X_test)
    y_test = y_test.loc[X_test.index]

    predictions = lin_reg.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # The predictions file must be written before it can be logged.
    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)


#### Lasso

In [0]:
X_train, X_test, y_train, y_test = get_data()

# Lasso regression on degree-3 interaction features of the numeric columns.
with mlflow.start_run(run_name="Lasso_numeric_3nd_degree"):
    fit_intercept = True
    copy_X = True

    # Keys match the Lasso argument names.
    params = {"fit_intercept": fit_intercept,
              "copy_X": copy_X}

    mlflow.log_params(params)

    pipeline = Pipeline(
        steps=[
            ("mappingvalues", MappingValues(value_mapping)),
            ("numerical_only", numerical_only()),
            ("replace_null", NullReplacer(lista)),
            ("polynomial", DataFramePolynomialFeatures(degree=3, include_bias=True, interaction_only=True)),
        ]
    )

    X_train = pipeline.fit_transform(X_train)
    y_train = y_train.loc[X_train.index]

    lin_reg = Lasso(fit_intercept=fit_intercept, copy_X=copy_X)
    lin_reg.fit(X_train, y_train)

    # Only a few rows are logged as the input example: after the polynomial
    # expansion the training frame is very wide, and the example is serialised
    # together with the model.
    logged_model = mlflow.sklearn.log_model(
        lin_reg, name="Lasso_numeric_3nd_degree_numeric_only", input_example=X_train.head(5)
    )

    X_test = pipeline.transform(X_test)
    y_test = y_test.loc[X_test.index]

    predictions = lin_reg.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # The predictions file must be written before it can be logged.
    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)


#### Decision Trees

In [0]:
X_train, X_test, y_train, y_test = get_data()

# Decision tree regressor fitted on the numeric columns only.
with mlflow.start_run(run_name="Decision_tree_numeric_only"):

    criterion = "squared_error"
    max_depth = 10
    min_samples_split = 15
    min_samples_leaf = 5
    random_state = 42

    params = {
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "random_state": random_state,
    }

    mlflow.log_params(params)

    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
    ])

    X_train = pipeline.fit_transform(X_train)
    y_train = y_train.loc[X_train.index]

    decision_tree = DecisionTreeRegressor(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )
    decision_tree.fit(X_train, y_train)

    # Only a few rows are logged as the input example; the example is serialised
    # with the model, so the whole training frame should not be passed.
    logged_model = mlflow.sklearn.log_model(
        decision_tree, name="decision-forest-numeric_only", input_example=X_train.head(5)
    )

    X_test = pipeline.transform(X_test)
    y_test = y_test.loc[X_test.index]

    predictions = decision_tree.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)

#### Random Forest

In [0]:
X_train, X_test, y_train, y_test = get_data()

# Random forest regressor fitted on the numeric columns only.
with mlflow.start_run(run_name="random_forest_numeric_only"):

    n_estimators = 100
    max_depth = 6
    max_features = 3
    # Fixed seed so that re-running the cell reproduces the logged metrics.
    random_state = 42
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "random_state": random_state,
    }

    mlflow.log_params(params)

    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
    ])

    X_train = pipeline.fit_transform(X_train)
    y_train = y_train.loc[X_train.index]

    # Create and train model.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=random_state,
    )
    rf.fit(X_train, y_train)

    # Only a few rows are logged as the input example; the example is serialised
    # with the model, so the whole training frame should not be passed.
    logged_model = mlflow.sklearn.log_model(
        rf, name="random-forest-numeric_only", input_example=X_train.head(5)
    )

    X_test = pipeline.transform(X_test)
    y_test = y_test.loc[X_test.index]

    predictions = rf.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Save the table of predicted values
    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")
    print(f"""
    relatório de métricas de sucesso:
    r2: {r2}
    rmse: {rmse}
    """)

## Adicionando regularização dos dados numéricos

## Adicionando informações categóricas

In [0]:
# Reload the untransformed split: the experiment cells above overwrite X_train /
# X_test with their preprocessed versions.
X_train,X_test,y_train,y_test = get_data()

#### Selecionando colunas categóricas para serem adicionadas

In [0]:
# Rank the categorical columns by the random-forest importance of their dummy variables.
selected_cols = X_train.select_dtypes(include="object").columns

encoded_df = pd.get_dummies(X_train[selected_cols], drop_first=True)

X_train_v1 = encoded_df
model = RandomForestRegressor(n_estimators=1000, random_state=42, max_depth=6)
model.fit(X_train_v1, y_train)

importances = pd.Series(model.feature_importances_, index=X_train_v1.columns)
top_features = importances.sort_values(ascending=False).head(10)
# Dummy columns are named "<column>_<category>", so the text before the first
# underscore is the source column. The result is a sorted list rather than a
# set: pandas does not accept a set as a column indexer, and a list keeps the
# column order reproducible between runs.
top_original_cols = sorted({col.split('_')[0] for col in top_features.index})

#### Criando Classes

In [0]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler

# Convert to a list before indexing: `top_original_cols` may be a set, which
# pandas rejects as a column indexer.
categoricas = X_train[list(top_original_cols)]
# Quality-style columns that are treated as ordinal.
ordinal_columns = ["BsmtExposure","BsmtQual","ExterQual","KitchenQual","FireplaceQu"]
# Remaining selected columns are treated as nominal.
cat_columns = [x for x in categoricas.columns if x not in ordinal_columns]

class OrdinalColumns(BaseEstimator,TransformerMixin):
    """Ordinal-encode the given columns with an encoder learned in ``fit``.

    The encoder is fitted once on the training data and reused in
    ``transform``, so the same category receives the same code in every data
    set. Categories not seen during ``fit`` are encoded as ``-1``.

    Args:
        lista_ordinais: Names of the columns to encode.
    """
    def __init__(self,lista_ordinais):
        self.lista_ordinais = lista_ordinais

    def fit(self,X,y=None):
        """Learn the category-to-code mapping of every configured column."""
        self.encoder_ = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        self.encoder_.fit(X[list(self.lista_ordinais)])
        return self

    def transform(self,X):
        """Return a copy of ``X`` whose configured columns hold ordinal codes."""
        colunas = list(self.lista_ordinais)
        X_transformed = X.copy()
        X_transformed[colunas] = self.encoder_.transform(X_transformed[colunas])
        return X_transformed

class CatColumns(BaseEstimator,TransformerMixin):
    """One-hot encode the given columns with an encoder learned in ``fit``.

    Each configured column is replaced by one indicator column per category
    seen during ``fit``. Categories unseen at fit time produce all-zero
    indicators.

    Args:
        lista_categoricas: Names of the columns to encode.
    """
    def __init__(self,lista_categoricas):
        self.lista_categoricas = lista_categoricas

    def fit(self,X,y=None):
        """Learn the categories of every configured column."""
        colunas = list(self.lista_categoricas)
        self.encoder_ = OneHotEncoder(handle_unknown="ignore")
        self.encoder_.fit(X[colunas])
        self.feature_names_ = self.encoder_.get_feature_names_out(colunas)
        return self

    def transform(self,X):
        """Return ``X`` with the configured columns replaced by indicator columns."""
        colunas = list(self.lista_categoricas)
        codificado = self.encoder_.transform(X[colunas])
        # The encoder returns a sparse matrix by default; densify it so it can
        # be wrapped in a DataFrame.
        if hasattr(codificado, "toarray"):
            codificado = codificado.toarray()
        indicadores = pd.DataFrame(codificado, columns=self.feature_names_, index=X.index)
        return pd.concat([X.drop(columns=colunas), indicadores], axis=1)

class NumericAndInt(BaseEstimator,TransformerMixin):
    """Keep the numeric columns plus the configured ordinal/categorical ones.

    Reads the notebook-level ``ordinal_columns`` and ``cat_columns`` lists.
    """
    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` restricted to the selected columns."""
        X_transformed = X.copy()
        # Work with plain lists: adding a list to a pandas Index is element-wise
        # addition, not concatenation.
        colunas_numericas = list(X_transformed.select_dtypes(include="number").columns)
        extras = [c for c in ordinal_columns + cat_columns if c in X_transformed.columns]
        # dict.fromkeys removes repeats (e.g. an already-encoded ordinal column
        # that is also numeric) while preserving order.
        lista_final = list(dict.fromkeys(colunas_numericas + extras))
        return X_transformed[lista_final]



In [0]:
# Columns encoded as ordinal values.
ordinal_columns = ["BsmtQual", "ExterQual", "KitchenQual"]
# Every other object-dtype column is treated as nominal.
cat_columns = [
    nome
    for nome in X_train.columns
    if X_train[nome].dtype == "object" and nome not in ordinal_columns
]

class OrdinalColumns(BaseEstimator, TransformerMixin):
    """Ordinal-encode the given columns with an encoder learned in ``fit``.

    Args:
        lista_ordinais: Names of the columns to encode.
        categories: Passed to ``OrdinalEncoder``. The default ``"auto"`` orders
            the categories of each column by sorted value (alphabetical for
            strings); pass one explicit list per column, from worst to best,
            when the codes must follow the real quality order.

    Categories that were not seen during ``fit`` are encoded as ``-1`` instead
    of raising at transform time.
    """

    def __init__(self, lista_ordinais, categories="auto"):
        self.lista_ordinais = lista_ordinais
        self.categories = categories
        self.encoder = self._build_encoder()

    def _build_encoder(self):
        """Create an OrdinalEncoder configured from the current parameters."""
        return OrdinalEncoder(
            categories=self.categories,
            handle_unknown="use_encoded_value",
            unknown_value=-1,
        )

    def fit(self, X, y=None):
        """Learn the category-to-code mapping of every configured column."""
        # Rebuild so parameters changed after construction take effect.
        self.encoder = self._build_encoder()
        self.encoder.fit(X[self.lista_ordinais])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns hold ordinal codes."""
        X_transformed = X.copy()
        X_transformed[self.lista_ordinais] = self.encoder.transform(X[self.lista_ordinais])
        return X_transformed

class CatColumns(BaseEstimator, TransformerMixin):
    """One-hot encode the given columns and return a DataFrame.

    Each configured column is replaced by one indicator column per category
    seen during ``fit``; categories unseen at fit time yield all-zero
    indicators (``handle_unknown='ignore'``).

    Args:
        lista_categoricas: Names of the columns to encode.
    """

    def __init__(self, lista_categoricas):
        self.lista_categoricas = lista_categoricas
        # The dense/sparse keyword is not passed because its name differs between
        # scikit-learn releases (`sparse` was replaced by `sparse_output`); the
        # output is densified in transform() instead.
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Learn the categories and the resulting indicator column names."""
        self.encoder.fit(X[self.lista_categoricas])
        self.feature_names = self.encoder.get_feature_names_out(self.lista_categoricas)
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns replaced by indicator columns."""
        X_transformed = X.copy()
        encoded = self.encoder.transform(X[self.lista_categoricas])
        if hasattr(encoded, "toarray"):
            # Sparse output: convert to a dense array for the DataFrame below.
            encoded = encoded.toarray()
        encoded_df = pd.DataFrame(encoded, columns=self.feature_names, index=X.index)
        X_transformed = X_transformed.drop(columns=self.lista_categoricas)
        X_transformed = pd.concat([X_transformed, encoded_df], axis=1)
        return X_transformed

class NumericAndInt(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the numeric-dtype columns."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return the numeric columns of ``X``."""
        somente_numericas = X.select_dtypes(include="number")
        return somente_numericas

# Random forest on the numeric columns plus the ordinal-encoded quality columns.
with mlflow.start_run(run_name="cat+int"):

    n_estimators = 100
    max_depth = 6
    max_features = 3
    # Fixed seed so that re-running the cell reproduces the logged metrics.
    random_state = 42
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "random_state": random_state,
    }

    mlflow.log_params(params)

    # Nulls are filled BEFORE the ordinal encoding so that a missing quality
    # value becomes its own "None" category. Filling afterwards would write the
    # string "None" into an already-encoded column, which would then no longer
    # be numeric and would be discarded by `numerical_only`.
    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('replace_null', NullReplacer(lista)),
        ('ordinal_columns', OrdinalColumns(ordinal_columns)),
        ("number_only", numerical_only()),
        ("standard_scaler", StandardScaler())
    ])

    # Apply transformation (StandardScaler returns NumPy arrays).
    X_train_transformed = pipeline.fit_transform(X_train)

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=random_state,
    )
    rf.fit(X_train_transformed, y_train)

    # Only a few rows are logged as the input example; the example is serialised
    # with the model, so the whole training matrix should not be passed.
    mlflow.sklearn.log_model(rf, name="random-forest-model", input_example=X_train_transformed[:5])

    X_test_transformed = pipeline.transform(X_test)

    predictions = rf.predict(X_test_transformed)

    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    # The metric is logged under its real name instead of "mse".
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)
    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")
    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=y_test, y=predictions)
    # Dashed diagonal = perfect prediction.
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Predicted vs Actual")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("pred_vs_actual.png")
    mlflow.log_artifact("pred_vs_actual.png")


In [0]:
# Inspect the distinct values of BsmtExposure among the selected categorical columns.
# NOTE(review): raises KeyError when "BsmtExposure" is not one of the columns
# held by `categoricas` — confirm it is always selected.
categoricas["BsmtExposure"].unique()

In [0]:
from sklearn.preprocessing import OrdinalEncoder
# Columns whose name ends in "Qual" or "Cond" (quality / condition ratings).
list_of_ordinal_columns = [
    nome for nome in X_train.columns if nome.endswith(("Qual", "Cond"))
]
teste = X_train[list_of_ordinal_columns]
teste.head()

In [0]:
# Scratch check of the suffix test used in the previous cell: the last four
# characters of "Condition2" are "ion2", so that column matches neither "Qual"
# nor "Cond".
a = "Condition2"
a[-4:]

## Verificação de qualidade de dados

In [0]:
# NOTE(review): the run is named "v1_LinReg_numerical_only" but the cell trains a
# RandomForestRegressor. The name is kept unchanged; confirm which model was intended.

# Hyperparameters are defined here so the cell does not depend on values left
# over from an earlier cell.
n_estimators = 100
max_depth = 6
max_features = 3
random_state = 42

with mlflow.start_run(run_name="v1_LinReg_numerical_only"):

    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "random_state": random_state,
    }

    mlflow.log_params(params)

    pipeline = Pipeline(steps=[
        ('mappingvalues', MappingValues(value_mapping)),
        ('numerical_only', numerical_only()),
        ('replace_null', NullReplacer(lista)),
    ])

    # The transformed frames get their own names: the following cells still
    # inspect the raw X_train (including its categorical columns).
    X_train_num = pipeline.fit_transform(X_train)
    X_test_num = pipeline.transform(X_test)

    # Create and train model on the same representation used for prediction.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=random_state,
    )
    rf.fit(X_train_num, y_train)

    # Only a few rows are logged as the input example.
    logged_model = mlflow.sklearn.log_model(
        rf, name="random-forest-model", input_example=X_train_num.head(5)
    )

    predictions = rf.predict(X_test_num)

    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)

    # Save the table of predicted values
    np.savetxt('predictions.csv', predictions, delimiter=',')
    mlflow.log_artifact("predictions.csv")

    # Residuals (prediction minus observed value), indexed like y_test.
    df = pd.DataFrame(data=predictions - y_test)

    plt.figure()
    sns.scatterplot(data=df)
    plt.xlabel("Observation")
    plt.ylabel("Residual")
    plt.title("Residuals")

    plt.savefig("residuals_plot.png")
    mlflow.log_artifact("residuals_plot.png")

# Kept outside the `with` block so it is the cell's displayed output.
df.head()

In [0]:
# Share of missing values per column.
# len(X_train) is the number of rows; len(X_train.count()) would be the number
# of columns, because count() returns one entry per column.
n = len(X_train)
nulos = X_train.isnull().sum().reset_index()
# Keep three decimals: rounding a 0-1 ratio to zero decimals leaves only 0 or 1.
nulos["relativa"] = (nulos[0] / n).round(3)
nulos.sort_values(by="relativa",ascending=False)

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin    

class NullReplacer(BaseEstimator,TransformerMixin):
    """Replace missing values in selected columns by the string ``"None"``.

    ``columns`` may be a single column name or a list of names; names that do
    not exist in the frame given to ``transform`` are skipped.
    """

    def __init__(self,columns):
        como_lista = [columns] if isinstance(columns,str) else columns
        self.columns = como_lista

    def fit(self,X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a filled copy of ``X``; the input frame is not modified."""
        saida = X.copy()
        for coluna in self.columns:
            existe = coluna in saida.columns
            if existe:
                saida[coluna] = saida[coluna].fillna("None")
        return saida

In [0]:
X_train.isnull().sum().sort_values(ascending=False)

In [0]:
# Columns whose missing values mean "feature absent"; each one is listed once.
lista = [
    "MiscFeature", "Alley", "Fence", "MasVnrType", "FireplaceQu",
    "GarageCond", "GarageQual", "GarageFinish", "GarageType", "GarageYrBlt",
    "BsmtExposure", "BsmtFinType2", "BsmtCond", "BsmtFinType1", "BsmtQual",
    "PoolQC",
]

# A single transform() call fills every listed column, so no loop is needed.
null_replacer = NullReplacer(lista)
X_train = null_replacer.transform(X_train)

X_train.isnull().sum().sort_values(ascending=False)

In [0]:
# Impute the missing LotFrontage values with KNNImputer's default settings.
# NOTE(review): only LotFrontage itself is passed to the imputer, so there are no
# other features to measure neighbour distance on — this presumably reduces to
# filling with the column mean; confirm, or pass additional numeric columns.
knn = KNNImputer()
X_train["LotFrontage"] = knn.fit_transform(X_train[["LotFrontage"]])

In [0]:
X_train.isnull().sum().sort_values(ascending=False)

In [0]:
# Drop every row that still has a missing value. y_train keeps its original rows
# until it is re-aligned with `y_train.loc[X_train.index]` in a later cell.
X_train = X_train.dropna()

In [0]:
X_train.hist(figsize=(12,10))

In [0]:
# Show both shapes together: a bare expression is only displayed when it is the
# last one in the cell.
y_train.shape, X_train.shape

In [0]:
# Keep the target aligned with the rows that survived the cleaning above.
y_train = y_train.loc[X_train.index]


def show_plot(X=None, y=None, n_cols=6):
    """Plot a one-feature linear regression against the target for each numeric column.

    For every numeric column a ``LinearRegression`` is fitted on that single
    feature, and the scatter of the data plus the fitted line are drawn in a
    grid of subplots. The r2 and mse of each fit are shown in the legend.
    Columns that cannot be fitted are skipped and reported.

    Args:
        X: Feature DataFrame. Defaults to the notebook-level ``X_train``.
        y: Target Series sharing ``X``'s index. Defaults to the notebook-level
            ``y_train``.
        n_cols: Number of subplot columns in the grid.
    """
    X = X_train if X is None else X
    y = y_train if y is None else y

    colunas = list(X.select_dtypes(include="number").columns)
    if not colunas:
        print("No numeric columns to plot.")
        return

    # One subplot per numeric COLUMN; the grid must not be sized by row count.
    n_rows = int(np.ceil(len(colunas) / n_cols))

    # squeeze=False always returns a 2-D array, even for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, colunas):
        try:
            modelo = LinearRegression()
            X_value = X[[col]]
            modelo.fit(X_value, y)
            previsao = modelo.predict(X_value)

            r2 = round(r2_score(y, previsao), 2)
            mse = round(mean_squared_error(y, previsao), 0)

            sns.scatterplot(data=X_value, x=col, y=y, ax=ax, label=f"r2: {r2}\nmse: {mse}")
            sns.lineplot(x=X[col], y=previsao, color='red', ax=ax)

            ax.set_title(col)
            ax.legend()
        except Exception as e:
            # Best effort: one column that cannot be fitted should not abort the
            # whole grid, but the reason is reported.
            ax.set_visible(False)
            print(f"Skipped {col} due to error: {e}")

    # Hide the unused cells of the grid.
    for ax in axes[len(colunas):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [0]:
show_plot()

## CONSERTAR RESTANTE DO CODIGO

Realizando a remoção de alguns outliers:

In [0]:
# `train` only exists inside get_data(), so the full table (features + SalePrice)
# is loaded here before filtering. Reloading also makes the cell idempotent.
train = spark.read.table("workspace.ml_datasets.house_prediction_train").toPandas()

# Remove the outliers: very expensive houses and unusually large living/basement areas.
train = train[
    (train["SalePrice"] <= 500000)
    & (train["GrLivArea"] <= 4000)
    & (train["TotalBsmtSF"] <= 3000)
]

In [0]:
# Redraw the per-feature regression grid.
# NOTE(review): show_plot() reads X_train / y_train, not `train`, so the outlier
# filter applied to `train` above does not change this figure — confirm intent.
show_plot()

### Verificando Distribuição de preço de acordo com boxplots. Criando uma função que permita visualizar os dados de acordo com os valores únicos.

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns


def tabela_de_frequencia(df, col, col_target):
    """Build a frequency table of ``col`` with summary statistics of ``col_target``.

    The table has one row per distinct value of ``col`` (missing values are
    counted as their own category), ordered by descending frequency, with:

    - absolute frequency
    - relative frequency (%)
    - first quartile, mean, median, standard deviation, third quartile and
      maximum of ``col_target`` within the category

    Args:
        df (pd.DataFrame): Input DataFrame.
        col (str): Name of the categorical column to analyse.
        col_target (str): Name of the numeric column to summarise per category.

    Returns:
        pd.DataFrame: Summary table with ``col`` as the first column; numeric
        values are rounded to two decimals.
    """

    frequencias = df[col].value_counts(dropna=False)
    frequencias_rel = frequencias / frequencias.sum() * 100

    # Statistics of col_target per category. dropna=False keeps the missing-value
    # category, matching the frequency counts above; otherwise that row would
    # have counts but no statistics.
    estatisticas = df.groupby(col, dropna=False)[col_target].agg(
        q25=lambda x: x.quantile(0.25),
        media="mean",
        mediana="median",
        desvio_padrao="std",
        q75=lambda x: x.quantile(0.75),
        maximo="max"
    )

    tabela = pd.DataFrame(
        {
            col: frequencias.index,
            "Frequência Absoluta": frequencias.values,
            "Frequência Relativa (%)": frequencias_rel.values,
        }
    ).set_index(col)

    tabela = tabela.join(estatisticas)
    tabela = tabela.round(2)

    return tabela.reset_index()


def visualizacao_box_plot(df, col, col_target):
    """Draw a boxplot of ``col_target`` for each category of ``col``.

    Args:
        df: pandas DataFrame holding the data.
        col: Name of the categorical column shown on the x axis.
        col_target: Name of the numeric column shown on the y axis.
    """
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x=col, y=col_target, hue=col, legend=False)
    # The title names the column that is actually plotted on the y axis.
    plt.title(f"Boxplot de {col_target} por {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

#### A classificação da área impacta no preço do empreendimento?

In [0]:
# One SalePrice boxplot per categorical column, laid out in a grid.
colunas = train.select_dtypes(include="object").columns
n_cols = 5
# At least one row so plt.subplots() receives a valid grid even with no columns.
n_rows = max(1, int(np.ceil(len(colunas) / n_cols)))

# squeeze=False always returns a 2-D array of axes.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*10, n_rows*9), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, colunas):
    try:
        sns.boxplot(data=train, x=col, y="SalePrice", hue=col, ax=ax, legend=False)
        ax.set_title(col)
        # Rotate the tick labels without re-assigning them.
        ax.tick_params(axis="x", labelrotation=45)
    except Exception as e:
        # Best effort: hide the panel, but report why the column was skipped.
        ax.set_visible(False)
        print(f"Skipped {col} due to error: {e}")

# Hide the unused cells of the grid.
for ax in axes[len(colunas):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [0]:
tabela_de_frequencia(train, "MSZoning", "SalePrice")

#### Principais pontos da Zona:
1. O dataset é composto majoritariamente por partes residenciais de baixa densidade.
2. O conjunto de baixa densidade apresenta alta dispersão, com o maior desvio padrão entre as categorias. Isso provavelmente ocorre porque essa zona reúne tanto residências em locais pouco habitados (regiões mais rurais) quanto regiões mais ricas, onde o número de casas é menor devido à extensão dos imóveis.
3. Podemos fazer uma verificação do tamanho das casas em cada MSZoning para verificar se existe essa relação, dado que casas mais caras podem ser casas maiores, o que causaria o aumento do preço.

Para verificar isso, podemos identificar a distribuição de tamanho de casas para cada tipo de zona.

In [0]:
visualizacao_box_plot(train, "MSZoning", "LotArea")

Como esperado, as residências em regiões de baixa densidade também concentram os outliers em relação ao tamanho residencial. Quanto à distribuição das residências de baixa densidade, estima-se que 45% da área de Ames seja destinada a residências unifamiliares, que geralmente são classificadas como de baixa densidade residencial.
[Referência: Zoneamento de Ames, Iowa](https://www.zoneomics.com/zoning-maps/iowa/ames?utm_source=chatgpt.com)

Entretanto, considerando o exposto, podemos tentar entender o preço da residência de acordo com o tamanho de sua extensão, vamos verificar isso com um gráfico de correlação entre preço e tamanho da área e verificar como se distribuem de acordo com o Zoneamento