### Modellvorbereitung (siehe ```notebooks/main.ipynb```)

In [1]:
import os
import pandas as pd
import numpy as np
np.random.seed(42)

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Input CSV, addressed relative to the notebook's working directory.
FILEPATH = os.path.join("..", "input", "armslengthsales_2022_valid.csv")
df = pd.read_csv(FILEPATH)

# `dropped_features` is not used by the code; it only records which columns are left out of the model.
dropped_features = ["PropertyID", "taxkey", "Address", "CondoProject", "PropType", "Style", "Sale_date"]

num_features = ["Stories", "Year_Built", "FinishedSqft", "Units", "Fbath", "Hbath", "Lotsize", "Rooms", "Bdrms"]
cat_features = ["District", "nbhd", "Extwall"]

# Store both columns as object dtype; they are listed in `cat_features` and get one-hot encoded below.
df = df.astype({"District": object, "nbhd": object})

# Hold out 20 % of the rows; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Column selectors: target as a one-element list, features as numeric followed by categorical.
y = ["Sale_price"]
X = num_features + cat_features

# Have transformers return DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")

# Numeric columns: fill missing values with the column mean, then standardize.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps categories unseen during fit from raising at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each feature group through its transformer; columns not listed are dropped.
prep = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    remainder="drop",
)

# Fit the preprocessor on the training features only and keep the transformed frame.
prep_transformed = prep.fit_transform(train_data[X])

## Hyperparameter-Optimierung

Die verwendeten Parameter für die Optimierung wurden mithilfe von ChatGPT 3.5 generiert (siehe https://chat.openai.com/share/2664e409-ecb3-4b6f-a98b-0f58d83d97c4).

In [4]:
# Candidate regressors as (display name, unfitted estimator, hyperparameter grid) triples.
# Every grid key carries the "model__" prefix, i.e. it addresses the pipeline step named 'model'.
models = [
    (
        'Decision Tree',
        DecisionTreeRegressor(random_state=42),
        dict(
            model__max_depth=[None, 5, 10, 20],
            model__min_samples_split=[2, 5, 10],
            model__min_samples_leaf=[1, 2, 4],
        ),
    ),
    (
        'Random Forest',
        RandomForestRegressor(random_state=42),
        dict(
            model__n_estimators=[50, 100, 200],
            model__max_depth=[None, 5, 10, 20],
            model__min_samples_split=[2, 5, 10],
            model__min_samples_leaf=[1, 2, 4],
        ),
    ),
    (
        'Gradient Boosting',
        GradientBoostingRegressor(random_state=42),
        dict(
            model__n_estimators=[50, 100, 200],
            model__learning_rate=[0.05, 0.1, 0.2],
            model__max_depth=[3, 5, 7],
            model__min_samples_split=[2, 5, 10],
            model__min_samples_leaf=[1, 2, 4],
        ),
    ),
    (
        'K-nearest Neighbors',
        KNeighborsRegressor(),
        dict(
            model__n_neighbors=[3, 5, 7, 9],
            model__weights=['uniform', 'distance'],
            model__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        ),
    ),
]

# Print the mean of the target column for each split.
print("Avg. Price on Training", train_data[y].mean().iloc[0])
print("Avg. Price on Test", test_data[y].mean().iloc[0])

Avg. Price on Training 277506.43614415673
Avg. Price on Test 247715.75804195803


In [5]:
# The target vectors do not depend on the model, so build them once before the loop.
y_train = train_data[y].values.ravel()
y_test = test_data[y].values.ravel()

for name, model, parameters in models:
    # Bundle preprocessing and estimator into one pipeline; the grid keys
    # ("model__...") address the 'model' step of this pipeline.
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])

    # Exhaustive 5-fold grid search on the training split, using all CPU cores.
    grid_search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5, n_jobs=-1)
    grid_search.fit(X=train_data[X], y=y_train)

    # Per-fold scores of the best configuration on the training split.
    cv = cross_val_score(estimator=grid_search.best_estimator_, X=train_data[X], y=y_train, cv=5, n_jobs=-1)

    # Predict once per split and reuse the result for every metric below.
    pred_train = grid_search.predict(train_data[X])
    pred_test = grid_search.predict(test_data[X])

    print("====== ", name, " ======")
    print("Best parameters:", grid_search.best_params_)
    print("Cross validation", cv)
    print(" MAE (Train):", round(mean_absolute_error(y_true=y_train, y_pred=pred_train), 2))
    print("MAPE (Train):", round(mean_absolute_percentage_error(y_true=y_train, y_pred=pred_train) * 100, 2), "%")
    print(" MAE (Test) :", round(mean_absolute_error(y_true=y_test, y_pred=pred_test), 2))
    print("MAPE (Test) :", round(mean_absolute_percentage_error(y_true=y_test, y_pred=pred_test) * 100, 2), "%")
    print("RMSE (Test) :", round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred_test)), 2))
    # best_score_ is the mean cross-validated score of the best configuration
    # on the training split, so it is reported under its own label.
    print("  R2 (CV)   :", round(grid_search.best_score_, 2))
    # No `scoring` was passed, so score() falls back to the refit best
    # estimator's own score method; evaluated here on the held-out split.
    print("  R2 (Test) :", round(grid_search.score(test_data[X], y_test), 2))
    print()

Best parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2}
Cross validation [0.07930417 0.71967379 0.76169902 0.20610311 0.63742624]
R2: 0.48084126630161333
MAE: 63315.077784031455
MSE: 33812075294.57129

Best parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 50}
Cross validation [0.68808391 0.72217775 0.77652767 0.62274344 0.72100665]
R2: 0.7061078823636509
MAE: 53032.778400284034
MSE: 31029408445.659473

Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 200}
Cross validation [0.63614934 0.78655215 0.80430033 0.42438778 0.83861228]
R2: 0.6980003747936694
MAE: 53177.88278485768
MSE: 29089438962.616226

Best parameters: {'model__algorithm': 'ball_tree', 'model__n_neighbors': 9, 'model__weights': 'distance'}
Cross validation [0.73887547 0.61095805 0.72481514 0.31228257 