## *Machine Learning*

In [0]:
# Source data for the whole notebook: the house-prices view, addressed by its
# dotted, fully qualified name.
path = "workspace.homepricesprojectdatawarehouse.goldenhouseprices_info_view"
# spark.table(name) is the session-level shortcut for spark.read.table(name).
df = spark.table(path)

In [0]:
from pyspark.sql import functions as F

# Per-column null audit: one output column per input column, holding the number
# of rows in which that column is NULL.
# F.when(...) without an otherwise() yields NULL for rows that do not match, and
# F.count skips NULLs, so each aggregate counts only the null entries.
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]
)
# Spark evaluates lazily: without an action the aggregate above is never run and
# the cell shows nothing. The result is a single row, so printing it is cheap.
null_counts.show(truncate=False)

In [0]:
# Import only the helpers this cell uses. A wildcard import of
# pyspark.sql.functions rebinds Python built-ins such as sum, min, max and round
# to Spark column functions for every cell that runs afterwards.
from pyspark.sql.functions import col, month, when

# Derived columns:
#   * month                     - month number extracted from Date_sold
#   * Average_Price_per_Bedroom - price / Number_Of_Bedrooms; falls back to 0
#     whenever the `> 0` test is not true (zero, negative or NULL bedroom count)
df = (
    df.withColumn("month", month(col("Date_sold")))
    .withColumn(
        "Average_Price_per_Bedroom",
        when(
            col("Number_Of_Bedrooms") > 0,
            col("price") / col("Number_Of_Bedrooms"),
        ).otherwise(0),
    )
)

# Normalise column names: str.capitalize() upper-cases the first character and
# lower-cases the rest (e.g. "Number_Of_Bedrooms" -> "Number_of_bedrooms").
# The scikit-learn cells further down refer to columns by these names.
df = df.select([col(c).alias(c.capitalize()) for c in df.columns])

In [0]:
# Collect the Spark DataFrame into a pandas DataFrame so scikit-learn can use it.
# toPandas() loads every row into the driver's memory, so this assumes the table
# is small enough to fit there.
dfpd = df.toPandas()

In [0]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PowerTransformer,OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Target is the sale price; the feature frame is everything except the target
# and the Date_sold / Month_name columns.
y = dfpd["Price"]
x = dfpd.drop(columns=["Date_sold", "Price", "Month_name"])

# Column groups, one per preprocessing strategy.
# NOTE(review): Average_price_per_bedroom is computed from the price column in
# the Spark feature-engineering cell, so using it to predict Price leaks the
# target into the features - confirm this is intended.
transform_columns = ["Average_price_per_bedroom"]
num_features = ["Average_price_per_bedroom"]
Ordinal_features = ["Number_of_bedrooms", "Month", "Day_name", "Year_date"]
oneh_features = ["Region_postcode", "Property_type"]

# One transformer per group.
numeric_transformer = MinMaxScaler()
Ordinal_categorical_transformer = OrdinalEncoder()
oneh_categorical_transformer = OneHotEncoder(sparse_output=False)
transform_pipe = Pipeline(
    steps=[("transformer", PowerTransformer(method="yeo-johnson"))]
)

# Each entry is applied to its own columns and the outputs are concatenated, so
# Average_price_per_bedroom contributes two output columns: one power-transformed
# and one min-max scaled.
# NOTE(review): the entry labelled "StandardScaler" actually holds a MinMaxScaler.
preprocessor = ColumnTransformer(
    transformers=[
        ("Transformer", transform_pipe, transform_columns),
        ("StandardScaler", numeric_transformer, num_features),
        ("OrdinalEncoder", Ordinal_categorical_transformer, Ordinal_features),
        ("OneHotEncoder", oneh_categorical_transformer, oneh_features),
    ]
)

# NOTE(review): the preprocessor is fitted on all rows here, before any
# train/test or cross-validation split is made further down.
preprocessed_data = preprocessor.fit_transform(x)

In [0]:
import mlflow
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, 
    AdaBoostRegressor, BaggingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [0]:
# Compare several regressors with 10-fold cross-validation and record the
# results in MLflow as a three-level run hierarchy:
#   experiment run -> one child run per model -> one grandchild run per fold.
# Each fold run logs its RMSE; each model run logs the mean RMSE over all folds.
with mlflow.start_run(run_name="Cross-Validation Experiment-House Pricing Project"):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Every estimator that accepts random_state is seeded so that, together with
    # the seeded KFold above, a re-run reproduces the same scores.
    # KNeighborsRegressor takes no seed.
    models = {
        "RandomForestRegressor": RandomForestRegressor(random_state=42),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
        "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
        "KNeighborsRegressor": KNeighborsRegressor(),
        "SGDRegressor": SGDRegressor(random_state=42),
        "BaggingRegressor": BaggingRegressor(
            DecisionTreeRegressor(), n_estimators=500, random_state=42
        ),
    }

    for model_name, model_instance in models.items():

        with mlflow.start_run(nested=True, run_name=model_name):
            mlflow.log_param("model_type", model_name)
            fold_metrics = []

            for fold_idx, (train_index, test_index) in enumerate(kf.split(preprocessed_data)):
                with mlflow.start_run(nested=True, run_name=f"{model_name}_Fold_{fold_idx}"):
                    X_train, X_test = preprocessed_data[train_index], preprocessed_data[test_index]
                    # KFold yields positional indices. y is a pandas Series, and
                    # plain y[...] with integers selects by label, which only
                    # matches positions while the index is a default RangeIndex.
                    # .iloc is positional regardless of the index.
                    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

                    # fit() retrains the estimator from scratch on each fold.
                    model_instance.fit(X_train, y_train)
                    predictions = model_instance.predict(X_test)

                    rmse = np.sqrt(mean_squared_error(y_val, predictions))
                    mlflow.log_metric("RMSE", rmse)
                    fold_metrics.append(rmse)

            # Logged on the model-level run, after all fold runs have closed.
            avg_rmse = np.mean(fold_metrics)
            mlflow.log_metric("avg_RMSE", avg_rmse)


In [0]:
from sklearn.model_selection import train_test_split

# Hold-out evaluation of a GradientBoostingRegressor on a seeded 70/30 split.
# NOTE(review): preprocessed_data comes from a preprocessor that was fitted on
# all rows, so the test rows already influenced the scaling/encoding - confirm
# whether the preprocessor should be fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_data, y, test_size=0.3, random_state=42
)

# The estimator is seeded like the split, so the reported metrics are
# reproducible from one run to the next.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

print("Test set MSE:", np.round(final_mse, 2))
print("Test set RMSE:", np.round(final_rmse, 2))

In [0]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Wide default canvas so two panels fit side by side. This is a global setting
# and also applies to any figure created afterwards.
matplotlib.rcParams["figure.figsize"] = (20, 5)

# Alias carried over unchanged; nothing in this cell reads it.
y_val = y_test

# Explicit axes instead of the implicit pyplot "current subplot" state.
fig, (ax_fit, ax_resid) = plt.subplots(1, 2)

# Left panel: predictions against actual prices, with a fitted regression line
# and its 95% confidence band.
sns.regplot(x=y_test, y=y_pred, ci=95, color="red", ax=ax_fit)
ax_fit.set(xlabel="Price", ylabel="Predicted price", title="Predicted vs. actual price")

# Right panel: seaborn residual plot.
# NOTE(review): residplot regresses y on x and plots the residuals of that
# linear fit (y_pred regressed on y_test); it does not plot y_test - y_pred.
# Confirm this is the diagnostic that is wanted.
sns.residplot(x=y_test, y=y_pred, ax=ax_resid)
ax_resid.set(xlabel="Price", ylabel="Residuals", title="Residual plot");

In [0]:
%sql
-- Clean-up: drop the source view once the notebook no longer needs it.
-- The name is fully qualified so it targets the same object the first cell
-- reads, independent of the session's current catalog/schema.
-- IF EXISTS keeps the cell from failing when the view is already gone.
-- NOTE(review): after this runs, the first cell cannot read the view again
-- until it is recreated upstream.
DROP VIEW IF EXISTS workspace.homepricesprojectdatawarehouse.goldenhouseprices_info_view