## Modellvorbereitung (siehe ```notebooks/main.ipynb```)

In [1]:
import os
import pandas as pd
from datetime import datetime
import numpy as np
np.random.seed(42)

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

FILEPATH = os.path.join('..', 'input', 'armslengthsales_2022_valid.csv')
df = pd.read_csv(FILEPATH)

current_year = datetime.now().year
df['Age'] = current_year - df['Year_Built']
df['Bath'] = df['Fbath']+(df['Hbath']/2)
df['Total_Rooms'] = df['Bdrms'] + df['Rooms']

df['District'] = df['District'].astype(object)
df['nbhd'] = df['nbhd'].astype(object)

num_features = ['Stories', 'Age', 'Total_Rooms', 'FinishedSqft', 'Units', 'Bath', 'Lotsize']
cat_features = ['District', 'nbhd', 'Extwall']

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

y = ['Sale_price']
X = [*num_features, *cat_features]

set_config(transform_output='pandas')

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

prep = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ],
)

prep_transformed = prep.fit_transform(X=train_data[X])

## Hyperparameter-Optimierung
Die verwendeteten Parameter für die Optimierung wurden mithilfe von ChatGPT3.5 generiert (siehe https://chat.openai.com/share/2664e409-ecb3-4b6f-a98b-0f58d83d97c4).

In [2]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42), {
        'model__max_depth': [None, 5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('Random Forest', RandomForestRegressor(random_state=42), {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42), {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('K-nearest Neighbors', KNeighborsRegressor(), {
        'model__n_neighbors': [3, 5, 7, 9],
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    })
]

In [4]:
for name, model, parameters in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])

    grid_search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5, n_jobs=-1)
    grid_search.fit(X=train_data[X], y=train_data[y].values.ravel())

    cv = cross_val_score(estimator=grid_search.best_estimator_, X=train_data[X], y=train_data[y], cv=5, n_jobs=-1)

    print(f'{name} - Best parameters -  {grid_search.best_params_}')
    print(f'{name} - Cross validation {cv}')
    print(f'{name} -     In-sample -  mae: {mean_absolute_error(y_true=train_data[y], y_pred=grid_search.predict(train_data[X])):.2f}')
    print(f'{name} -     In-sample - mape: {mean_absolute_percentage_error(y_true=train_data[y], y_pred=grid_search.predict(train_data[X]))*100:.2f}%')
    print(f'{name} - Out-of-sample -  mae: {mean_absolute_error(y_true=test_data[y], y_pred=grid_search.predict(test_data[X])):.2f}')
    print(f'{name} - Out-of-sample - mape: {mean_absolute_percentage_error(y_true=test_data[y], y_pred=grid_search.predict(test_data[X]))*100:.2f}%')
    print(f'{name} - Out-of-sample - rsme: {np.sqrt(mean_squared_error(y_true=test_data[y], y_pred=grid_search.predict(test_data[X]))):.2f}')
    print(f'{name} - Out-of-sample -   r2: {grid_search.best_score_:.2f}')

Decision Tree - Best parameters -  {'model__max_depth': 10}
Decision Tree - Cross validation [-0.32447434  0.42385393  0.78046054 -0.18007649  0.68016006]
Decision Tree -     In-sample -  mae: 59938.58
Decision Tree -     In-sample - mape: 43.23%
Decision Tree - Out-of-sample -  mae: 77905.62
Decision Tree - Out-of-sample - mape: 38.71%
Decision Tree - Out-of-sample - rsme: 248751.59
Decision Tree - Out-of-sample -   r2: 0.28
Random Forest - Best parameters -  {'model__n_estimators': 50}
Random Forest - Cross validation [0.64235963 0.70756342 0.77664287 0.62255755 0.71741365]
Random Forest -     In-sample -  mae: 28401.08
Random Forest -     In-sample - mape: 10.40%
Random Forest - Out-of-sample -  mae: 52892.95
Random Forest - Out-of-sample - mape: 22.59%
Random Forest - Out-of-sample - rsme: 178672.00
Random Forest - Out-of-sample -   r2: 0.69
Gradient Boosting - Best parameters -  {'model__n_estimators': 200}
Gradient Boosting - Cross validation [0.49383099 0.71100716 0.75426652 0.3