## Classification House-types
#### Anforderungen:
<p> Entwickle und vergleiche drei sinnvolle Modelle zur Klassifikation von Immobilien Objekten hinsichtlich `type`. </p>
<p> Was sind sinnvolle Metriken zur Messung der Genauigkeit der Vorhersage im vorliegenden Fall? Was ist zu beachten um eine gute Abschätzung des Fehlers für neue Daten zu bekommen? </p>
<p> Rapportiere diese Metrik(en) mit einer Abschätzung des Fehlers für alle drei Modelle </p>

In [25]:
# import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection, svm, ensemble, tree
from sklearn.metrics import classification_report, confusion_matrix
import random


In [None]:
# import dataset
df = pd.read_csv('../data/model/immoscout_robust.csv')
df.tail(5)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
13308,-0.214762,0.614418,0.482482,0.0,-0.066882,0.825061,0.0,0.0,-0.007174,0.214048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13309,0.031322,0.659631,0.467615,0.0,0.684524,0.537781,108.488669,0.0,0.527702,0.11105,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13310,-0.214762,0.812664,0.476216,0.0,-0.005971,0.720047,0.0,0.0,-0.019672,0.78985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13311,0.291106,0.703465,0.506514,0.0,0.060185,0.243838,0.0,0.0,0.896919,0.279233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13312,0.521797,0.755503,0.512116,0.0,-1.150419,0.255263,0.0,0.0,-0.292978,-0.287039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# add a new column with different types
types = [col for col in df.columns if 'type' in col]
df['type'] = df[types].idxmax(axis=1)
df['type'] = df['type'].str.replace('type_', '')

In [None]:
# split dataset into features and target variables and drop type-columns for the features
types.append('type')
X = df.drop(types, axis=1)
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
def train_test_split(X, y):
    return model_selection.train_test_split(X, y, train_size=0.6, random_state=42)

In [None]:
#select random columns for the features
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool) -> list:
    type_columns = [c for c in columns if c.startswith("type_") and add_type_columns]
    regular_columns = [c for c in columns if (not c.startswith("type_")) and random.random() < percentage]
    return [*type_columns, *regular_columns]

In [None]:
# K-nearest-neighbour
def knn(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    model = KNeighborsClassifier(n_neighbors= 5)
    model.fit(X_train, y_train)

    return {
        "type": "K-Nearest-Neighbour",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test),
        'conf_matrix': confusion_matrix(y_test, model.predict(X_test)),
        "model": model
    }

In [None]:
# Support Vector Machine other kernels should be tried
def svm(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    model = SVC(kernel= 'rbf')
    model.fit(X_train, y_train)

    return {
        "type": "SVM",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test),
        'conf_matrix': confusion_matrix(y_test, model.predict(X_test)),
        "model": model
    }

In [None]:
# Random Forest Classifier max depth and other coefficients should be tried
def random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    model = ensemble.RandomForestClassifier(n_estimators=50, criterion = 'gini', random_state=42)
    model.fit(X_train, y_train)

    return {
        "type": "RandomForest",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test),
        'conf_matrix': confusion_matrix(y_test, model.predict(X_test)),
        "model": model
    }

In [None]:
# Constants
NUM_ITERATIONS = 5
MIN_PERCENTAGE_COLUMNS = 0.2
MAX_PERCENTAGE_COLUMNS = 1
TRAINING_FUNCTIONS = [knn, random_forest]

print("Total number of iterations:", NUM_ITERATIONS * len(TRAINING_FUNCTIONS) * 2)

Total number of iterations: 40


In [None]:
results = []
for _ in range(NUM_ITERATIONS):
    for add_type_columns in [True, False]:
        column_percentage = random.random() * (MAX_PERCENTAGE_COLUMNS - MIN_PERCENTAGE_COLUMNS) + MIN_PERCENTAGE_COLUMNS
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        for func in TRAINING_FUNCTIONS:
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,model,add_type_columns
9,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",24,0.668669,"(DecisionTreeClassifier(max_features='sqrt', r...",True
13,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",36,0.665065,"(DecisionTreeClassifier(max_features='sqrt', r...",True
23,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",36,0.665065,"(DecisionTreeClassifier(max_features='sqrt', r...",False
27,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",34,0.662061,"(DecisionTreeClassifier(max_features='sqrt', r...",False
25,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",34,0.659958,"(DecisionTreeClassifier(max_features='sqrt', r...",True
1,RandomForest,"[Latitude, gde_area_forest_percentage, gde_ave...",8,0.649144,"(DecisionTreeClassifier(max_features='sqrt', r...",True
5,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",19,0.641634,"(DecisionTreeClassifier(max_features='sqrt', r...",True
3,RandomForest,"[Latitude, Longitude, NoisePollutionRailwayM, ...",28,0.638029,"(DecisionTreeClassifier(max_features='sqrt', r...",False
37,RandomForest,"[ForestDensityM, Longitude, gde_area_nonproduc...",7,0.629919,"(DecisionTreeClassifier(max_features='sqrt', r...",True
17,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",30,0.626915,"(DecisionTreeClassifier(max_features='sqrt', r...",True


## Offene Punkte:
Random Forest besser als KNN
SVM funktion error -> Marvin
kontrolle ob type berücksichtigt werden