In [1]:
import os
import pandas as pd
from datetime import datetime
import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, learning_curve, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

from sklearn.decomposition import PCA

## Datenimport
Quelle der Daten: https://data.milwaukee.gov/dataset/property-sales-data

In [2]:
FILEPATH = os.path.join('..', 'input', 'armslengthsales_2022_valid.csv')

In [3]:
df = pd.read_csv(FILEPATH)

# Hilfsfunktionen

In [84]:
def evaluate_model(model_name, ct, pipe, train_data, test_data, X, y):
    cv = cross_val_score(estimator=pipe[-1], X=ct.fit_transform(train_data[X]), y=train_data[y], cv=10, n_jobs=4)
    in_sample_mae = mean_absolute_error(y_true=train_data[y], y_pred=pipe.predict(train_data[X]))
    in_sample_mape = mean_absolute_percentage_error(y_true=train_data[y], y_pred=pipe.predict(train_data[X]))*100
    out_of_sample_mae = mean_absolute_error(y_true=test_data[y], y_pred=pipe.predict(test_data[X]))
    out_of_sample_mape = mean_absolute_percentage_error(y_true=test_data[y], y_pred=pipe.predict(test_data[X]))*100
    out_of_sample_rsme = np.sqrt(mean_squared_error(y_true=test_data[y], y_pred=pipe.predict(test_data[X])))
    out_of_sample_r2 = pipe.score(X=test_data[X], y=test_data[y])

    return pd.DataFrame({
        'model': [model_name],
        'cv_mean': [cv.mean()],
        'in_sample_mae': [in_sample_mae],
        'in_sample_mape': [in_sample_mape],
        'out_of_sample_mae': [out_of_sample_mae],
        'out_of_sample_mape': [out_of_sample_mape],
        'out_of_sample_rsme': [out_of_sample_rsme],
        'out_of_sample_r2': [out_of_sample_r2],
        'cv': [cv]
    })

def print_evaluation(metrics):
    print(f'{metrics["model"].iloc[0]} - Cross validation {metrics["cv"].iloc[0]}')
    print(f'{metrics["model"].iloc[0]} - Cross validation mean: {metrics["cv_mean"].iloc[0]:.2f}')
    print(f'{metrics["model"].iloc[0]} -      In-sample -  mae: {metrics["in_sample_mae"].iloc[0]:.2f}')
    print(f'{metrics["model"].iloc[0]} -      In-sample - mape: {metrics["in_sample_mape"].iloc[0]:.2f}%')
    print(f'{metrics["model"].iloc[0]} -  Out-of-sample -  mae: {metrics["out_of_sample_mae"].iloc[0]:.2f}')
    print(f'{metrics["model"].iloc[0]} -  Out-of-sample - mape: {metrics["out_of_sample_mape"].iloc[0]:.2f}%')
    print(f'{metrics["model"].iloc[0]} -  Out-of-sample - rsme: {metrics["out_of_sample_rsme"].iloc[0]:.2f}')
    print(f'{metrics["model"].iloc[0]} -  Out-of-sample -   r2: {metrics["out_of_sample_r2"].iloc[0]:.2f}')

def plot_evaluation(name, ct, pipe, train_data, test_data, X, y):
    train_predictions = pipe.predict(train_data[X])
    test_predictions = pipe.predict(test_data[X])

    plt.figure(figsize=(12, 10))

    plt.subplot(3, 1, 1)
    plt.scatter(train_predictions, train_data[y], color='blue', label='Trainingsdaten')
    plt.scatter(test_predictions, test_data[y], color='red', label='Testdaten')
    plt.xlabel('Vorhergesagter Verkaufspreis')
    plt.ylabel('Tatsächlicher Verkaufspreis')
    plt.title(f'{name} - Vorhersagen vs. Tatsächliche Werte')
    plt.ticklabel_format(style='plain', axis='x') # Wird benötigt, damit die Werte nicht als 0,2 x 10^X dargestellt werden sondern als Ganzzahlen
    plt.ticklabel_format(style='plain', axis='y') # Wird benötigt, damit die Werte nicht als 0,2 x 10^X dargestellt werden sondern als Ganzzahlen
    plt.legend()

    cv_results = cross_validate(estimator=pipe[-1], X=ct.fit_transform(train_data[X]), y=train_data[y].values.ravel(), cv=10, n_jobs=4, return_train_score=True)

    train_scores = cv_results['train_score']
    test_scores = cv_results['test_score']
    
    plt.subplot(3, 1, 2)
    plt.plot(range(len(train_scores)), train_scores, label='In-sample Score', marker='o')
    plt.plot(range(len(test_scores)), test_scores, label='Out-of-sample Score', marker='x')
    plt.xticks(rotation=45)
    plt.title(f'{name} - Cross Validation - In-sample vs Out-of-sample Scores')
    plt.legend()

    train_sizes, train_scores, test_scores = learning_curve(pipe, train_data[X], train_data[y].values.ravel(), cv=5, scoring='r2', train_sizes=np.linspace(0.1, 1.0, 10))

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.subplot(3, 1, 3)
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='In-sample score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Out-of-sample score')
    plt.xlabel('Steps')
    plt.ylabel('R2-Score')
    plt.legend(loc='best')
    plt.title(f'{name} - Learning Curve R2')

    plt.tight_layout()
    plt.show()

# Preprocessing

In [5]:
current_year = datetime.now().year
df['Age'] = current_year - df['Year_Built']
df['Bath'] = df['Fbath']+(df['Hbath']/2)
df['Total_Rooms'] = df['Bdrms'] + df['Rooms']

df['District'] = df['District'].astype(object)
df['nbhd'] = df['nbhd'].astype(object)

num_features = ['Stories', 'Age', 'Total_Rooms', 'FinishedSqft', 'Units', 'Bath', 'Lotsize']
cat_features = ['District', 'nbhd', 'Extwall']

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

y = ['Sale_price']
X = [*num_features, *cat_features]

In [6]:
set_config(transform_output='pandas')

# Auffüllen der fehlenden Numerischen Werte mit dem Durchschnitt + Anwendung StandardScaler
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Auffüllen der fehlenden Kategorischen Werte mit den häufigsten Werten + Anwendung OneHotEncoder
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

prep = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ],
)

prep

In [7]:
prep_transformed = prep.fit_transform(X=train_data[X])
prep_transformed

Unnamed: 0,num__Stories,num__Age,num__Total_Rooms,num__FinishedSqft,num__Units,num__Bath,num__Lotsize,cat__District_1,cat__District_2,cat__District_3,...,cat__Extwall_Fiber Cement/Hardiplank,cat__Extwall_Masonary Frame,cat__Extwall_Masonry/Frame,cat__Extwall_Metal Siding,cat__Extwall_Other,cat__Extwall_Precast Masonary,cat__Extwall_Prem Wood,cat__Extwall_Stone,cat__Extwall_Stucco,cat__Extwall_Wood
59,2.040804,-0.309840,-0.624829,-0.148994,-0.069573,-0.115614,-0.250146,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3717,1.123395,0.107680,-1.643368,0.151981,0.333957,-2.092238,-0.104320,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
787,-0.711424,-0.265423,-0.624829,-0.138065,-0.069573,-0.115614,-0.054248,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2540,0.205985,0.311998,-0.115560,-0.121344,-0.069573,0.543260,-0.117382,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6225,0.205985,0.089913,0.054196,-0.048013,-0.002318,0.543260,-0.112956,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,-0.711424,0.143213,-0.455073,-0.136535,-0.069573,-0.774489,-0.077469,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5191,-0.711424,0.463015,-0.794586,-0.168009,-0.069573,-0.774489,-0.141330,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5226,-0.711424,0.320881,0.223953,-0.095006,-0.002318,0.543260,-0.123188,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,-0.711424,0.063263,-0.455073,-0.161452,-0.069573,-0.774489,-0.047499,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelltraining und -vergleich

In [None]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('K-nearest Neighbors', KNeighborsRegressor())
]

for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])

    pipe.fit(X=train_data[X], y=train_data[y].values.ravel())

    metrics = evaluate_model(name, prep, pipe, train_data, test_data, X, y)
    print_evaluation(metrics)

## Hyperparameter-Optimierung
Die verwendeteten Parameter für die Optimierung wurden mithilfe von ChatGPT3.5 generiert (siehe https://chat.openai.com/share/2664e409-ecb3-4b6f-a98b-0f58d83d97c4).

In [None]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42), {
        'model__max_depth': [None, 5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('Random Forest', RandomForestRegressor(random_state=42), {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42), {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }),
    ('K-nearest Neighbors', KNeighborsRegressor(), {
        'model__n_neighbors': [3, 5, 7, 9],
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    })
]

for name, model, parameters in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])

    grid_search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5, n_jobs=-1)
    grid_search.fit(X=train_data[X], y=train_data[y].values.ravel())

    metrics = evaluate_model(name, prep, pipe, train_data, test_data, X, y)
    print_evaluation(metrics)

In [8]:
params_dt = {'random_state': 42, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2}
params_rf = {'random_state': 42, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
params_gb = {'random_state': 42, 'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
params_knn = {'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}

## Evaluation - Decision Tree

In [85]:
model_name = 'Decision Tree'

pipe_dt = Pipeline(steps=[
        ('preprocessor', prep),
        (model_name, DecisionTreeRegressor(**params_dt))
    ])

pipe_dt.fit(train_data[X], train_data[y].values.ravel())

metrics_dt = evaluate_model(model_name, prep, pipe_dt, train_data, test_data, X, y)
print_evaluation( metrics_dt)
#plot_evaluation(model_name, prep, pipe_dt, train_data, test_data, X, y)

Decision Tree - Cross validation [0.76388721 0.60293563 0.71747412 0.72065361 0.71313161 0.70568303
 0.69385802 0.75109362 0.74574307 0.56418264]
Decision Tree - Cross validation mean: 0.70
Decision Tree -      In-sample -  mae: 28925.10
Decision Tree -      In-sample - mape: 17.52%
Decision Tree -  Out-of-sample -  mae: 44857.22
Decision Tree -  Out-of-sample - mape: 28.54%
Decision Tree -  Out-of-sample - rsme: 79837.93
Decision Tree -  Out-of-sample -   r2: 0.68


## Evaluation - Random Forest

In [83]:
model_name = 'Random Forest'

pipe_rf = Pipeline(steps=[
        ('preprocessor', prep),
        (model_name, RandomForestRegressor(**params_rf))
    ])

pipe_rf.fit(train_data[X], train_data[y].values.ravel())

metrics_rf = evaluate_model(model_name, prep, pipe_rf, train_data, test_data, X, y)
print_evaluation(metrics_rf)
#plot_evaluation(model_name, prep, pipe_rf, train_data, test_data, X, y)

Random Forest - Cross validation [0.80783166 0.76967513 0.80422765 0.85311332 0.8187797  0.74543225
 0.83913298 0.84569864 0.81080196 0.68827778]
Random Forest - Cross validation mean: 0.80
Random Forest -      In-sample -  mae: 17268.24
Random Forest -      In-sample - mape: 10.36%
Random Forest -  Out-of-sample -  mae: 37817.68
Random Forest -  Out-of-sample - mape: 23.99%
Random Forest -  Out-of-sample - rsme: 66841.37
Random Forest -  Out-of-sample -   r2: 0.78


## Evaluation - Gradient Boosting

In [44]:
model_name = 'Gradient Boosting'

pipe_gb = Pipeline(steps=[
        ('preprocessor', prep),
        (model_name, GradientBoostingRegressor(**params_gb))
    ])

pipe_gb.fit(train_data[X], train_data[y].values.ravel())

metrics_gb = evaluate_model(model_name, prep, pipe_gb, train_data, test_data, X, y)
print_evaluation(metrics_gb)
#plot_evaluation(model_name, prep, pipe_gb, train_data, test_data, X, y)

Gradient Boosting - Cross validation [ 0.68181683  0.79175383  0.77974182  0.75130214  0.80184054  0.56519383
  0.92303438 -0.09995249  0.71899517  0.82984626]
Gradient Boosting - Cross validation mean: 0.67
Gradient Boosting -      In-sample -  mae: 33911.97
Gradient Boosting -      In-sample - mape: 20.67%
Gradient Boosting -  Out-of-sample -  mae: 52109.98
Gradient Boosting -  Out-of-sample - mape: 22.84%
Gradient Boosting -  Out-of-sample - rsme: 178770.60
Gradient Boosting -  Out-of-sample -   r2: 0.85


## Evaluation - K-nearest Neighbors

In [45]:
model_name = 'Gradient Boosting'

pipe_knn = Pipeline(steps=[
        ('preprocessor', prep),
        (model_name, KNeighborsRegressor(**params_knn))
    ])

pipe_knn.fit(train_data[X], train_data[y].values.ravel())

metrics_knn = evaluate_model(model_name, prep, pipe_knn, train_data, test_data, X, y)
print_evaluation(metrics_knn)
#plot_evaluation(model_name, prep, pipe_knn, train_data, test_data, X, y)

Gradient Boosting - Cross validation [-0.07550206  0.78470894  0.57534876  0.66164734  0.70764409  0.31652389
  0.91164314 -0.20138067  0.72659046  0.61082601]
Gradient Boosting - Cross validation mean: 0.50
Gradient Boosting -      In-sample -  mae: 1183.18
Gradient Boosting -      In-sample - mape: 0.88%
Gradient Boosting -  Out-of-sample -  mae: 63243.10
Gradient Boosting -  Out-of-sample - mape: 25.78%
Gradient Boosting -  Out-of-sample - rsme: 258268.32
Gradient Boosting -  Out-of-sample -   r2: 0.68


## Evaluation - Modellvergleich

In [71]:
df_metrics = pd.concat([metrics_dt, metrics_rf, metrics_gb, metrics_knn])
df_metrics = df_metrics.set_index('model')
df_metrics = df_metrics.round(2)
df_metrics

Unnamed: 0_level_0,cv,in_sample_mae,in_sample_mape,out_of_sample_mae,out_of_sample_mape,out_of_sample_rsme,out_of_sample_r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Decision Tree,0.35,57226.61,23.7,63003.08,27.69,184273.51,0.84
Random Forest,0.68,34153.53,12.37,52688.05,22.74,168197.9,0.86
Gradient Boosting,0.67,33911.97,20.67,52109.98,22.84,178770.6,0.85
Gradient Boosting,0.5,1183.18,0.88,63243.1,25.78,258268.32,0.68


## Entfernen von weiteren Features + Tests anhand vom Random Forest

### Erweitertes Cleansing basierend auf der Verteilung der Features

In [68]:
df = df[df['Stories'] <= 8]
df = df[df['FinishedSqft'] <= 15000]
df = df[df['Units'] <= 15]
df = df[df['Lotsize'] <= 80000]
df = df[df['Bath'] <= 5]
df = df[df['Total_Rooms'] <= 40]
df = df[df['Age'] <= 250]

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
len(df)

6647

### Evaluation

In [86]:
models = [
    ('Decision Tree', DecisionTreeRegressor(**params_dt)),
    ('Random Forest', RandomForestRegressor(**params_rf)),
    ('Gradient Boosting', GradientBoostingRegressor(**params_gb)),
    ('K-nearest Neighbors', KNeighborsRegressor(**params_knn))
]

df_metrics = pd.DataFrame()

for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])

    pipe.fit(X=train_data[X], y=train_data[y].values.ravel())

    metrics = evaluate_model(name, prep, pipe, train_data, test_data, X, y)
    df_metrics = pd.concat([df_metrics, metrics])

In [88]:
df_metrics = df_metrics.set_index('model')
df_metrics = df_metrics.round(2)
df_metrics

Unnamed: 0_level_0,cv_mean,in_sample_mae,in_sample_mape,out_of_sample_mae,out_of_sample_mape,out_of_sample_rsme,out_of_sample_r2,cv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Decision Tree,0.7,28925.1,17.52,44857.22,28.54,79837.93,0.68,"[0.7638872099981315, 0.6029356347350874, 0.717..."
Random Forest,0.8,17268.24,10.36,37817.68,23.99,66841.37,0.78,"[0.8078316557580798, 0.769675127659367, 0.8042..."
Gradient Boosting,0.81,24373.66,15.7,36708.27,23.31,64137.17,0.79,"[0.8523862691086106, 0.7598116352920186, 0.774..."
K-nearest Neighbors,0.77,1327.63,0.94,40169.7,24.98,75199.78,0.72,"[0.8197165207795235, 0.7308779408967256, 0.738..."
