### Modellvorbereitung (siehe ```notebooks/main.ipynb```)

In [1]:
import os
import pandas as pd
import numpy as np
np.random.seed(42)

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the sales CSV; the path is resolved relative to the working directory.
FILEPATH = os.path.join("..", "input", "armslengthsales_2022_valid.csv")
df = pd.read_csv(FILEPATH)

# Remove the columns that are not used further, all in a single call.
df = df.drop(
    columns=[
        'PropertyID',
        'taxkey',
        'Address',
        'CondoProject',
        'PropType',
        'Style',
        'Sale_date',
    ]
)

# Cast District and nbhd to object dtype (both are used as categorical
# features further down in this notebook).
df = df.astype({'District': object, 'nbhd': object})

# Hold out 20% of the rows as test data; the fixed random_state makes the
# split reproducible.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

# Columns that go through the numeric preprocessing branch.
num_features = [
    "Stories",
    "Year_Built",
    "FinishedSqft",
    "Units",
    "Fbath",
    "Hbath",
    "Lotsize",
    "Rooms",
    "Bdrms",
]
# Columns that go through the categorical preprocessing branch.
cat_features = [
    "District",
    "nbhd",
    "Extwall",
]

# Target column name, kept as a one-element list.
y = ["Sale_price"]
# All model inputs: numeric columns first, then the categorical ones.
X = num_features + cat_features

# Ask scikit-learn transformers to return pandas DataFrames.
set_config(transform_output="pandas")

# Numeric branch: fill missing values with the column mean, then standardise.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Dense output is requested and categories unseen during
# fitting are ignored instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each feature group to its branch; columns in neither list are dropped.
prep = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    remainder="drop",
)

# Fit the preprocessor on the training features and keep the transformed frame.
prep_transformed = prep.fit_transform(train_data[X])

## Hyperparameter-Optimierung

Die verwendeten Parameter für die Optimierung wurden mithilfe von ChatGPT 3.5 generiert (siehe https://chat.openai.com/share/d4d885a8-7d0e-423e-8e9f-93567ec20465). Hierbei wurden jedoch nur 5 Parameter ausgewählt und leicht angepasst, da die Optimierung sonst viel zu lange laufen würde.

In [2]:
# Search grid for the pipeline step named 'model'; the `model__` prefix
# addresses that step's hyperparameters.
parameters = dict(
    model__n_estimators=[100, 300, 500],
    model__max_depth=[3, 5, 7, 9, 11],
    model__learning_rate=[0.01, 0.05, 0.1, 0.2],
    model__min_samples_split=[2, 5, 10],
    model__subsample=[0.5, 0.7, 0.9],
)

In [3]:
# Full modelling pipeline: the preprocessor followed by a gradient-boosting
# regressor with a fixed random_state.
pipe = Pipeline(steps=[
    ('preprocessor', prep),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Grid search over `parameters` with 3-fold cross-validation, using all
# available cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=3, n_jobs=-1)

# `y` is a one-element list, so train_data[y] is a single-column DataFrame.
# Passing that column vector makes scikit-learn emit a DataConversionWarning,
# so the target is flattened to the expected 1-D shape first.
grid_search.fit(X=train_data[X], y=train_data[y].to_numpy().ravel())

  y = column_or_1d(y, warn=True)


In [4]:
# Show the hyperparameter combination the grid search selected as best.
grid_search.best_params_

{'model__learning_rate': 0.2,
 'model__max_depth': 5,
 'model__min_samples_split': 5,
 'model__n_estimators': 300,
 'model__subsample': 0.9}

In [5]:
# Re-score the selected pipeline with 3-fold cross-validation on the training
# data. train_data[y] is a single-column DataFrame (y is a one-element list),
# so it is flattened to the 1-D target shape scikit-learn expects.
cross_val_score(
    estimator=grid_search.best_estimator_,
    X=train_data[X],
    y=train_data[y].to_numpy().ravel(),
    cv=3,
    n_jobs=-1,
)

array([0.75774323, 0.80218211, 0.63887412])

In [6]:
# Show the mean cross-validated score of the best parameter combination.
grid_search.best_score_

0.7329331551365116

In [7]:
# Predict on the held-out test features with the refitted best pipeline.
y_pred = grid_search.predict(test_data[X])

# Report both error metrics against the true test targets.
for label, metric in (("MAE:", mean_absolute_error), ("MSE:", mean_squared_error)):
    print(label, metric(y_true=test_data[y], y_pred=y_pred))

MAE: 49618.634652311295
MSE: 23158094025.666336


In [8]:
# Tabulate every evaluated parameter combination, best rank first.
cv_results = pd.DataFrame(grid_search.cv_results_)
ranked_results = cv_results.sort_values(by="rank_test_score")

# Temporarily lift pandas' row/column display limits so nothing is truncated.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(ranked_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__min_samples_split,param_model__n_estimators,param_model__subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
446,16.777876,0.667343,0.067162,0.00788,0.2,5,5,300,0.9,"{'model__learning_rate': 0.2, 'model__max_dept...",0.757743,0.802182,0.638874,0.732933,0.06894,1
449,27.707382,1.726754,0.087961,0.014223,0.2,5,5,500,0.9,"{'model__learning_rate': 0.2, 'model__max_dept...",0.757458,0.802116,0.63822,0.732598,0.069181,2
443,5.075222,0.205975,0.061179,0.004473,0.2,5,5,100,0.9,"{'model__learning_rate': 0.2, 'model__max_dept...",0.757247,0.800599,0.636278,0.731375,0.069534,3
351,6.300726,0.823837,0.091271,0.017326,0.1,9,2,100,0.5,"{'model__learning_rate': 0.1, 'model__max_dept...",0.833267,0.76751,0.568271,0.723016,0.112666,4
354,17.156684,1.638544,0.129646,0.018601,0.1,9,2,300,0.5,"{'model__learning_rate': 0.1, 'model__max_dept...",0.836938,0.767717,0.563718,0.722791,0.115977,5
357,34.76496,3.061132,0.120607,0.028648,0.1,9,2,500,0.5,"{'model__learning_rate': 0.1, 'model__max_dept...",0.836921,0.767713,0.563553,0.722729,0.116047,6
314,27.902363,1.218313,0.093599,0.010845,0.1,5,5,500,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.830341,0.816003,0.521114,0.722486,0.142512,7
311,18.485169,0.441925,0.089907,0.012107,0.1,5,5,300,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.830471,0.815615,0.520913,0.722333,0.142555,8
341,36.873937,1.363078,0.119808,0.027649,0.1,7,5,500,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.797409,0.815072,0.551055,0.721179,0.120512,9
338,23.205911,0.839625,0.097371,0.013129,0.1,7,5,300,0.9,"{'model__learning_rate': 0.1, 'model__max_dept...",0.797671,0.814759,0.550404,0.720945,0.120792,10
