## Classification House-types
#### Anforderungen:
<p> Entwickle und vergleiche drei sinnvolle Modelle zur Klassifikation von Immobilienobjekten hinsichtlich `type`. </p>
<p> Was sind sinnvolle Metriken zur Messung der Genauigkeit der Vorhersage im vorliegenden Fall? Was ist zu beachten, um eine gute Abschätzung des Fehlers für neue Daten zu bekommen? </p>
<p> Rapportiere diese Metrik(en) mit einer Abschätzung des Fehlers für alle drei Modelle. </p>

In [113]:
# import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import model_selection, svm, ensemble, tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
import random
import xgboost as xgb
from boruta import BorutaPy


In [114]:
# Load the prepared ImmoScout dataset from CSV and preview its last five rows.
# NOTE(review): the file name suggests the features were robust-scaled upstream - confirm.
df = pd.read_csv('../data/model/immoscout_robust.csv')
df.tail(5)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
13308,-0.214762,0.614418,0.482482,0.0,-0.066882,0.825061,0.0,0.0,-0.007174,0.214048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13309,0.031322,0.659631,0.467615,0.0,0.684524,0.537781,108.488669,0.0,0.527702,0.11105,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13310,-0.214762,0.812664,0.476216,0.0,-0.005971,0.720047,0.0,0.0,-0.019672,0.78985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13311,0.291106,0.703465,0.506514,0.0,0.060185,0.243838,0.0,0.0,0.896919,0.279233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13312,0.521797,0.755503,0.512116,0.0,-1.150419,0.255263,0.0,0.0,-0.292978,-0.287039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Collapse the one-hot encoded type_* indicator columns into a single label column.
# Matching on the 'type_' prefix (instead of any name that merely contains 'type')
# keeps the derived 'type' column itself and unrelated columns out of the list,
# so the cell can be re-run safely.
types = [col for col in df.columns if col.startswith('type_')]
# idxmax returns, per row, the name of the indicator column holding the largest value;
# the literal (non-regex) replace then strips the 'type_' prefix from that name.
# NOTE(review): a row with no indicator set would silently receive the first
# type column as its label - confirm that no such rows exist.
df['type'] = df[types].idxmax(axis=1).str.replace('type_', '', regex=False)

In [116]:
# List all column names of the frame, which now includes the derived 'type' column.
df.columns

Index(['ForestDensityM', 'Latitude', 'Longitude', 'NoisePollutionRailwayM',
       'NoisePollutionRoadM', 'PopulationDensityM', 'RiversAndLakesM',
       'RiversAndLakesS', 'distanceToTrainStation',
       'gde_area_agriculture_percentage', 'gde_area_forest_percentage',
       'gde_area_nonproductive_percentage', 'gde_average_house_hold',
       'gde_empty_apartments', 'gde_foreigners_percentage',
       'gde_new_homes_per_1000', 'gde_politics_bdp', 'gde_politics_cvp',
       'gde_politics_evp', 'gde_politics_fdp', 'gde_politics_glp',
       'gde_politics_gps', 'gde_politics_pda', 'gde_politics_rights',
       'gde_politics_sp', 'gde_politics_svp', 'gde_pop_per_km2',
       'gde_population', 'gde_social_help_quota', 'gde_tax',
       'gde_workers_sector1', 'rooms', 'Floor space_m2', 'Plot area_m2',
       'living_space_m2', 'price', 'type_attic-flat', 'type_attic-room',
       'type_castle', 'type_chalet', 'type_detached-house',
       'type_detached-secondary-suite', 'type_duplex-mais

In [117]:
# Add the derived label column to the list of one-hot type columns so that all
# target-related columns can later be removed from the features in a single drop.
types += ['type']
types

['type_attic-flat',
 'type_attic-room',
 'type_castle',
 'type_chalet',
 'type_detached-house',
 'type_detached-secondary-suite',
 'type_duplex-maisonette',
 'type_farmhouse',
 'type_flat',
 'type_furnished-residential-property',
 'type_loft',
 'type_penthouse',
 'type_rustico',
 'type_secondary-suite',
 'type_semi-detached-house',
 'type_single-room',
 'type_stepped-apartment',
 'type_stepped-house',
 'type_studio',
 'type_terrace-house',
 'type_villa',
 'type']

In [118]:
# Target vector: the class label derived from the one-hot type columns.
y = df['type']
# Feature matrix: everything except the one-hot type columns and the label itself.
X = df.drop(columns=types)
# Remove the same target-related columns from the working frame.
df = df.drop(columns=types)

In [119]:
def train_test_split(X, y, train_size=0.6, random_state=42):
    """Split features and labels into a training and a hold-out set.

    Thin wrapper around ``sklearn.model_selection.train_test_split`` with the
    notebook's defaults baked in.

    NOTE: this definition shadows the function of the same name that is
    imported from ``sklearn.model_selection`` at the top of the notebook;
    the sklearn version is reached here through the ``model_selection`` module.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        train_size: Fraction of rows assigned to the training set (default 0.6).
        random_state: Seed that makes the shuffle reproducible (default 42).

    Returns:
        The list ``[X_train, X_test, y_train, y_test]`` produced by sklearn.
    """
    return model_selection.train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

In [120]:
#select random columns for the features
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool) -> list:
    """Draw a random subset of column names.

    Columns whose name starts with ``"type_"`` are never sampled: they are
    either all included (``add_type_columns=True``) or all left out. Every
    other column is kept independently when a fresh ``random.random()`` draw
    is below ``percentage``.

    Args:
        columns: Iterable of column names to choose from.
        percentage: Inclusion probability for each non-type column.
        add_type_columns: Whether to include all ``type_*`` columns.

    Returns:
        The selected names, ``type_*`` columns first, then the sampled
        regular columns, each group in its original order. May be empty.
    """
    chosen_types = []
    chosen_regular = []
    for name in columns:
        if name.startswith("type_"):
            # Type columns consume no random draw; the flag alone decides.
            if add_type_columns:
                chosen_types.append(name)
        elif random.random() < percentage:
            chosen_regular.append(name)
    return chosen_types + chosen_regular

In [121]:
# K-nearest-neighbour
def knn(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 5-nearest-neighbour classifier and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the model name, the feature columns used, their
        count, the test accuracy (under both ``"score"`` and ``"accuracy"``)
        and the fitted model.
    """
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)

    # A classifier's ``score`` is its mean accuracy, so a single prediction
    # pass feeds both result keys instead of predicting the test set twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "K-Nearest-Neighbour",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [122]:
# Support Vector Machine other kernels should be tried
def svm(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit an RBF-kernel support vector classifier and evaluate it on the test split.

    NOTE: this function shadows the ``svm`` module imported from sklearn at
    the top of the notebook; the classifier is therefore referenced through
    the directly imported ``SVC`` name.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the model name, the feature columns used, their
        count, the test accuracy (under both ``"score"`` and ``"accuracy"``)
        and the fitted model.
    """
    model = SVC(kernel='rbf')
    model.fit(X_train, y_train)

    # A classifier's ``score`` is its mean accuracy, so a single prediction
    # pass feeds both result keys instead of predicting the test set twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "SVM",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [123]:
# Random Forest Classifier max depth and other coefficients should be tried
def random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 50-tree Gini random forest and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the model name, the feature columns used, their
        count, the test accuracy (under both ``"score"`` and ``"accuracy"``)
        and the fitted model.
    """
    model = ensemble.RandomForestClassifier(n_estimators=50, criterion='gini', random_state=42)
    model.fit(X_train, y_train)

    # A classifier's ``score`` is its mean accuracy, so a single prediction
    # pass feeds both result keys instead of predicting the test set twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "RandomForest",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [128]:
# XGBoost Classifier
def xg_boost(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 50-round XGBoost classifier and evaluate it on the test split.

    The string labels are encoded to integer class ids for training. The
    encoder is fitted on the training labels only, and predictions are decoded
    with that same encoder before they are compared to the original test
    labels. Refitting the encoder on the test labels would assign different
    ids whenever the two splits do not contain exactly the same classes, which
    would silently corrupt the reported accuracy.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels (class names).
        y_test: Test labels (class names).

    Returns:
        A result record with the model name, the feature columns used, their
        count, the test accuracy (under both ``"score"`` and ``"accuracy"``)
        and the fitted model.
    """
    labelencoder = LabelEncoder()
    encoded_y_train = labelencoder.fit_transform(y_train)

    model = xgb.XGBClassifier(n_estimators=50, random_state=42)
    model.fit(X_train, encoded_y_train)

    # Decode the predicted class ids back to label names using the encoder
    # fitted on the training labels. Test labels never seen during training
    # simply count as misclassified instead of raising or shifting the ids.
    predicted_ids = model.predict(X_test).astype(int)
    y_predict = labelencoder.inverse_transform(predicted_ids)
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "XGBoost",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [125]:
# Split features and labels into training and hold-out sets using the
# notebook's own train_test_split wrapper (60 % training rows, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [129]:
# Experiment configuration
NUM_ITERATIONS = 2  # outer repetitions of the random feature-subset experiment
MIN_PERCENTAGE_COLUMNS = 0.2  # lower bound for the per-column inclusion probability
MAX_PERCENTAGE_COLUMNS = 1  # upper bound for the per-column inclusion probability
TRAINING_FUNCTIONS = [knn, svm, random_forest, xg_boost]

# Each repetition is run once with and once without type columns (factor 2),
# and every training function is fitted once per run.
total_fits = 2 * NUM_ITERATIONS * len(TRAINING_FUNCTIONS)
print("Total number of iterations:", total_fits)

Total number of iterations: 16


In [130]:
# Train every model on several randomly drawn feature subsets.
# Seed the stdlib RNG so the drawn subsets (and therefore the result table) are
# reproducible, in line with the fixed random_state=42 used by the split and models.
random.seed(42)

results = []
for _ in range(NUM_ITERATIONS):
    for add_type_columns in (True, False):
        # Per-column inclusion probability, drawn uniformly between the two bounds.
        column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
        # NOTE: the type_* columns were dropped from X, so add_type_columns does
        # not change the drawn column set here; it is only recorded in the result.
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        # All models of one run share the same feature subset, so their scores are comparable.
        for func in TRAINING_FUNCTIONS:
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

# Show all runs, best test score first.
pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,accuracy,model,add_type_columns
14,RandomForest,"[Latitude, NoisePollutionRailwayM, RiversAndLa...",20,0.653962,0.653962,"(DecisionTreeClassifier(max_features='sqrt', r...",False
6,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",33,0.647015,0.647015,"(DecisionTreeClassifier(max_features='sqrt', r...",False
2,RandomForest,"[Latitude, Longitude, PopulationDensityM, Rive...",25,0.633684,0.633684,"(DecisionTreeClassifier(max_features='sqrt', r...",True
7,XGBoost,"[ForestDensityM, Latitude, Longitude, NoisePol...",33,0.606834,0.606834,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
15,XGBoost,"[Latitude, NoisePollutionRailwayM, RiversAndLa...",20,0.606083,0.606083,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
3,XGBoost,"[Latitude, Longitude, PopulationDensityM, Rive...",25,0.604394,0.604394,"XGBClassifier(base_score=0.5, booster='gbtree'...",True
10,RandomForest,"[NoisePollutionRailwayM, PopulationDensityM, R...",9,0.604394,0.604394,"(DecisionTreeClassifier(max_features='sqrt', r...",True
11,XGBoost,"[NoisePollutionRailwayM, PopulationDensityM, R...",9,0.58543,0.58543,"XGBClassifier(base_score=0.5, booster='gbtree'...",True
8,K-Nearest-Neighbour,"[NoisePollutionRailwayM, PopulationDensityM, R...",9,0.55445,0.55445,KNeighborsClassifier(),True
9,SVM,"[NoisePollutionRailwayM, PopulationDensityM, R...",9,0.48742,0.48742,SVC(),True


## Offene Punkte:
Random Forest dominiert.

Try different kernels, forest depths, etc.


### Feature selection with the best performing Models
BorutaPy tries different combinations of features and ranks the features.

In [None]:
# Base estimator handed to Boruta: a 50-tree Gini random forest with a fixed seed.
model = ensemble.RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    random_state=42,
)

In [None]:
# Configure the Boruta feature selector around the random-forest base estimator;
# at most 10 selection iterations are run, with progress printed per iteration.
selector = BorutaPy(
    model,
    n_estimators='auto',
    verbose=2,
    random_state=42,
    max_iter=10,
)
# Fit on the raw arrays behind the training frame and label series.
# NOTE(review): presumably BorutaPy expects NumPy arrays rather than pandas objects - confirm.
selector.fit(X_train.values, y_train.values)

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	2 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	3 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	4 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	5 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	6 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	7 / 10
Confirmed: 	0
Tentative: 	36
Rejected: 	0
Iteration: 	8 / 10
Confirmed: 	7
Tentative: 	4
Rejected: 	25
Iteration: 	9 / 10
Confirmed: 	7
Tentative: 	4
Rejected: 	25


BorutaPy finished running.

Iteration: 	10 / 10
Confirmed: 	7
Tentative: 	3
Rejected: 	25


In [None]:
# Apply the fitted Boruta selector to the training features and display the
# resulting array.
x_filtered = selector.transform(X_train.values)
x_filtered

array([[ 3.53359078e-01,  1.58401791e+00,  5.00000000e+00, ...,
         2.74382803e+00, -2.77000369e-01,  6.45000000e+05],
       [-4.34758757e-01, -4.26343488e-01,  5.00000000e+00, ...,
        -9.45237499e-02, -2.65920354e-01,  5.27000000e+05],
       [-3.58902362e-01,  1.11604566e+00,  5.00000000e+00, ...,
         8.76129445e-01, -4.43200590e-01,  4.99000000e+05],
       ...,
       [ 4.60649035e-01,  7.79287604e-01,  5.00000000e+00, ...,
         3.13355510e-03, -7.31280974e-01,  3.19000000e+05],
       [ 8.15955612e-01, -5.64962737e-01,  5.00000000e+00, ...,
        -1.55565325e-01, -1.99440266e-01,  1.05500000e+06],
       [-5.32674297e-01,  5.33352176e-01,  5.00000000e+00, ...,
        -2.34614748e-01, -9.19641225e-01,  4.90000000e+05]])

In [None]:
# Pair every feature name with its Boruta rank and support flag, ordered by
# ascending rank; the stable sort keeps equal ranks in column order.
ranking = sorted(
    zip(X_train.columns, selector.ranking_, selector.support_),
    key=lambda entry: entry[1],
)
ranking

[('Latitude', 1, True),
 ('PopulationDensityM', 1, True),
 ('rooms', 1, True),
 ('Floor space_m2', 1, True),
 ('Plot area_m2', 1, True),
 ('living_space_m2', 1, True),
 ('price', 1, True),
 ('Longitude', 2, False),
 ('distanceToTrainStation', 2, False),
 ('gde_politics_svp', 2, False),
 ('gde_pop_per_km2', 3, False),
 ('ForestDensityM', 4, False),
 ('gde_politics_glp', 4, False),
 ('gde_politics_bdp', 6, False),
 ('gde_politics_rights', 7, False),
 ('gde_politics_pda', 8, False),
 ('NoisePollutionRoadM', 9, False),
 ('gde_politics_evp', 10, False),
 ('gde_average_house_hold', 11, False),
 ('gde_tax', 11, False),
 ('gde_area_forest_percentage', 13, False),
 ('gde_population', 13, False),
 ('gde_area_nonproductive_percentage', 15, False),
 ('gde_politics_cvp', 16, False),
 ('gde_area_agriculture_percentage', 17, False),
 ('gde_foreigners_percentage', 18, False),
 ('gde_politics_sp', 18, False),
 ('gde_politics_fdp', 20, False),
 ('gde_politics_gps', 21, False),
 ('gde_empty_apartments', 

### Boosting Decision Tree