In [2]:
import os
import pandas as pd
import numpy as np
np.random.seed(42)

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Datenimport
Quelle der Daten: https://data.milwaukee.gov/dataset/property-sales-data

In [3]:
FILEPATH = os.path.join("..", "input", "armslengthsales_2022_valid.csv")

In [4]:
df = pd.read_csv(FILEPATH)

## Preprocessing

In [5]:
# dropped features wird nicht verwendet, ist lediglich eine Kennzeichnung der nicht verwendeten Features
dropped_features = ["PropertyID", "taxkey", "Address", "CondoProject", "PropType", "Style", "Sale_date"]

num_features = ["Stories", "Year_Built", "FinishedSqft", "Units", "Fbath", "Hbath", "Lotsize", "Rooms", "Bdrms"]
cat_features = ["District", "nbhd", "Extwall"]

df['District'] = df['District'].astype(object)
df['nbhd'] = df['nbhd'].astype(object)

In [6]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
y = ["Sale_price"]
X = [*num_features, *cat_features]

In [8]:
set_config(transform_output="pandas")

# Auffüllen der fehlenden Numerischen Werte mit dem Durchschnitt + Anwendung StandardScaler
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Auffüllen der fehlenden Kategorischen Werte mit den häufigsten Werten + Anwendung OneHotEncoder
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

prep = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
)

prep

In [9]:
prep_transformed = prep.fit_transform(X=train_data[X])
prep_transformed

Unnamed: 0,num__Stories,num__Year_Built,num__FinishedSqft,num__Units,num__Fbath,num__Hbath,num__Lotsize,num__Rooms,num__Bdrms,cat__District_1,...,cat__Extwall_Fiber Cement/Hardiplank,cat__Extwall_Masonary Frame,cat__Extwall_Masonry/Frame,cat__Extwall_Metal Siding,cat__Extwall_Other,cat__Extwall_Precast Masonary,cat__Extwall_Prem Wood,cat__Extwall_Stone,cat__Extwall_Stucco,cat__Extwall_Wood
59,2.040804,0.309840,-0.148994,-0.069573,-0.604721,1.370772,-0.250146,-0.680141,-0.465163,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3717,1.123395,-0.107680,0.151981,0.333957,-1.986251,-0.587572,-0.104320,-1.919856,-0.961282,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
787,-0.711424,0.265423,-0.138065,-0.069573,-0.604721,1.370772,-0.054248,-0.680141,-0.465163,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2540,0.205985,-0.311998,-0.121344,-0.069573,0.776808,-0.587572,-0.117382,0.063688,-0.465163,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6225,0.205985,-0.089913,-0.048013,-0.002318,0.776808,-0.587572,-0.112956,0.063688,0.030955,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,-0.711424,-0.143213,-0.136535,-0.069573,-0.604721,-0.587572,-0.077469,-0.432198,-0.465163,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5191,-0.711424,-0.463015,-0.168009,-0.069573,-1.986251,3.329115,-0.141330,-0.680141,-0.961282,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5226,-0.711424,-0.320881,-0.095006,-0.002318,0.776808,-0.587572,-0.123188,0.311631,0.030955,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,-0.711424,-0.063263,-0.161452,-0.069573,-0.604721,-0.587572,-0.047499,-0.432198,-0.465163,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelltraining und -vergleich

In [13]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=5)),
    ('Random Forest', RandomForestRegressor(random_state=42, max_depth=5, n_estimators=100)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42, max_depth=5, n_estimators=100)),
    ('K-nearest Neighbors', KNeighborsRegressor(n_neighbors=5))
]

print("Avg. Price on Training", train_data[y].mean().iloc[0])
print("Avg. Price on Test", test_data[y].mean().iloc[0])

for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])
    y_train = train_data[y].values.ravel()
    y_test = test_data[y].values.ravel()
    pipe.fit(X=train_data[X], y=y_train)
    cv = cross_val_score(estimator=pipe[-1], X=prep.fit_transform(train_data[X]), y=y_train, cv=3, n_jobs=4)

    print("====== ", name, " ======")
    print("Cross validation", cv)
    print("R2:", pipe.score(X=test_data[X], y=y_test))
    print(" MAE (Train):", mean_absolute_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print("MAPE (Train):", mean_absolute_percentage_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print(" MAE (Test) :", mean_absolute_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print("MAPE (Test) :", mean_absolute_percentage_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print(" MSE (Test) :", mean_squared_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print()

Avg. Price on Training 277506.43614415673
Avg. Price on Test 247715.75804195803
Cross validation [ 0.72887237  0.79064276 -0.22231756]
R2: 0.7649267275159781
MAE: 96749.0228887905
MAPE: 0.5437299934883674
MSE: 49159958185.98739

Cross validation [0.73881153 0.76572653 0.54419689]
R2: 0.8113374468040578
MAE: 90900.75454661582
MAPE: 0.5296408852097959
MSE: 39454265167.48961

Cross validation [0.58708179 0.67949594 0.27137477]
R2: 0.9040820758098749
MAE: 55490.04983013258
MAPE: 0.2742531583492748
MSE: 20058942016.87164

Cross validation [0.65912918 0.64361976 0.09926983]
R2: 0.6089377647442852
MAE: 66994.39230769231
MAPE: 0.27898179586035404
MSE: 81781322606.96042



## Hyperparameter-Optimierung
Die Hyperparameter-Optimierung erfolgt in einem separaten Notebook ```notebooks/hyperparameter_optimierung.ipynb```.  
Im folgenden Codeabschnitt werden die besten Ergebnisse angewandt und gegenübergestellt.

In [9]:
print("Avg. Price on Training", train_data[y].mean().iloc[0])
print("Avg. Price on Test", test_data[y].mean().iloc[0])

Avg. Price on Training 277506.43614415673
Avg. Price on Test 247715.75804195803


In [15]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=20, min_samples_leaf=4, min_samples_split=2)),
    ('Random Forest', RandomForestRegressor(random_state=42, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42, learning_rate=0.1, max_depth=7, min_samples_leaf=1, min_samples_split=10, n_estimators=200)),
    ('K-nearest Neighbors', KNeighborsRegressor(algorithm='ball_tree', n_neighbors=9, weights='distance'))
]

for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])
    y_train = train_data[y].values.ravel()
    y_test = test_data[y].values.ravel()
    pipe.fit(X=train_data[X], y=y_train)
    cv = cross_val_score(estimator=pipe[-1], X=prep.fit_transform(train_data[X]), y=y_train, cv=5, n_jobs=4)

    print("====== ", name, " ======")
    print("Cross validation", cv)
    print("R2:", pipe.score(X=test_data[X], y=y_test))
    print(" MAE (Train):", mean_absolute_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print("MAPE (Train):", mean_absolute_percentage_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print(" MAE (Test) :", mean_absolute_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print("MAPE (Test) :", mean_absolute_percentage_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print(" MSE (Test) :", mean_squared_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print()

Cross validation [0.07933971 0.64249257 0.76171911 0.20707356 0.58862988]
R2: 0.8383172915058211
 MAE (Train): 56913.02793432916
MAPE (Train): 0.23477103085606352
 MAE (Test) : 63315.077784031455
MAPE (Test) : 0.27699099323970316
 MSE (Test) : 33812075294.57129

Cross validation [0.63398918 0.70092176 0.77988272 0.59523916 0.65993142]
R2: 0.8516234582835007
 MAE (Train): 34411.588006630154
MAPE (Train): 0.12432344954151052
 MAE (Test) : 53032.778400284034
MAPE (Test) : 0.22675176742920486
 MSE (Test) : 31029408445.659473

Cross validation [0.61861551 0.76487753 0.8117025  0.42430909 0.77769052]
R2: 0.8609000116355763
 MAE (Train): 34066.0587288974
MAPE (Train): 0.2090359881109338
 MAE (Test) : 53177.88278485768
MAPE (Test) : 0.23109395242016909
 MSE (Test) : 29089438962.616226

Cross validation [0.73953788 0.61446268 0.72714992 0.34437816 0.6192094 ]
R2: 0.6751361216490472
 MAE (Train): 1183.1825674519866
MAPE (Train): 0.00883654571064326
 MAE (Test) : 66082.68544688755
MAPE (Test) : 0

## Optimierung des besten Modells (Gradient Boosting)

### Erweitertes Cleansing basierend auf der Verteilung der Features

In [None]:
df = df[df['Stories'] < 10]
df = df[df['Year_Built'] > 1800]
df = df[df['Rooms'] < 15]
df = df[df['FinishedSqft'] < 10000]
df = df[df['Bdrms'] < 8]
df = df[df['Lotsize'] < 100000]
df = df[df['Units'] < 15]
df = df[df['Sale_price'] < 1500000]

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
len(df)

In [16]:
models = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42, max_depth=20, min_samples_leaf=4, min_samples_split=2)),
    ('Random Forest', RandomForestRegressor(random_state=42, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42, learning_rate=0.1, max_depth=7, min_samples_leaf=1, min_samples_split=10, n_estimators=200)),
    ('K-nearest Neighbors', KNeighborsRegressor(algorithm='ball_tree', n_neighbors=9, weights='distance'))
]

for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', prep),
        ('model', model)
    ])
    y_train = train_data[y].values.ravel()
    y_test = test_data[y].values.ravel()
    pipe.fit(X=train_data[X], y=y_train)
    cv = cross_val_score(estimator=pipe[-1], X=prep.fit_transform(train_data[X]), y=y_train, cv=5, n_jobs=4)

    print("====== ", name, " ======")
    print("Cross validation", cv)
    print("R2:", pipe.score(X=test_data[X], y=y_test))
    print(" MAE (Train):", mean_absolute_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print("MAPE (Train):", mean_absolute_percentage_error(y_true=y_train, y_pred=pipe.predict(train_data[X])))
    print(" MAE (Test) :", mean_absolute_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print("MAPE (Test) :", mean_absolute_percentage_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print(" MSE (Test) :", mean_squared_error(y_true=y_test, y_pred=pipe.predict(test_data[X])))
    print()

Cross validation [0.07933971 0.64249257 0.76171911 0.20707356 0.58862988]
R2: 0.8383172915058211
 MAE (Train): 56913.02793432916
MAPE (Train): 0.23477103085606352
 MAE (Test) : 63315.077784031455
MAPE (Test) : 0.27699099323970316
 MSE (Test) : 33812075294.57129

Cross validation [0.63398918 0.70092176 0.77988272 0.59523916 0.65993142]
R2: 0.8516234582835007
 MAE (Train): 34411.588006630154
MAPE (Train): 0.12432344954151052
 MAE (Test) : 53032.778400284034
MAPE (Test) : 0.22675176742920486
 MSE (Test) : 31029408445.659473

Cross validation [0.61861551 0.76487753 0.8117025  0.42430909 0.77769052]
R2: 0.8609000116355763
 MAE (Train): 34066.0587288974
MAPE (Train): 0.2090359881109338
 MAE (Test) : 53177.88278485768
MAPE (Test) : 0.23109395242016909
 MSE (Test) : 29089438962.616226

Cross validation [0.73953788 0.61446268 0.72714992 0.34437816 0.6192094 ]
R2: 0.6751361216490472
 MAE (Train): 1183.1825674519866
MAPE (Train): 0.00883654571064326
 MAE (Test) : 66082.68544688755
MAPE (Test) : 0