## Classification House-types
#### Anforderungen:
<p> Entwickle und vergleiche drei sinnvolle Modelle zur Klassifikation von Immobilienobjekten hinsichtlich `type`. </p>
<p> Was sind sinnvolle Metriken zur Messung der Genauigkeit der Vorhersage im vorliegenden Fall? Was ist zu beachten, um eine gute Abschätzung des Fehlers für neue Daten zu bekommen? </p>
<p> Rapportiere diese Metrik(en) mit einer Abschätzung des Fehlers für alle drei Modelle. </p>

In [1]:
# import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# Read the preprocessed Immoscout data set from disk and preview its last five rows
df = pd.read_csv("../data/model/immoscout_robust.csv")
df.tail()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
13373,-0.21301,0.614418,0.482482,0.0,-0.065714,0.826321,0.0,0.0,-0.010277,0.214048,...,0,0,0,0,0,0,0,0,0,0
13374,0.030807,0.659631,0.467615,0.0,0.685309,0.539041,164.44881,0.0,0.524307,0.11105,...,0,0,0,1,0,0,0,0,0,0
13375,-0.21301,0.812664,0.476216,0.0,-0.004834,0.721307,0.0,0.0,-0.022768,0.78985,...,0,0,0,0,0,0,0,0,1,0
13376,0.288199,0.703465,0.506514,0.0,0.061288,0.245098,0.0,0.0,0.893323,0.279233,...,0,0,0,0,0,0,0,0,0,0
13377,0.516765,0.755503,0.512116,0.0,-1.148699,0.256523,0.0,0.0,-0.295925,-0.287039,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Add a single categorical `type` column derived from the one-hot `type_*` columns.
# Match on the 'type_' prefix rather than the substring 'type': a substring test
# would also pick up the derived `type` column when this cell is re-run (and any
# unrelated column that merely contains "type"), which would feed a string column
# into idxmax.
types = [col for col in df.columns if col.startswith('type_')]
# idxmax(axis=1) yields, per row, the label of the first column holding the row maximum.
# NOTE(review): a row in which no type flag is set would be labelled with the first
# `type_*` column - confirm that every row has exactly one flag set.
# Slicing off the prefix turns e.g. 'type_villa' into 'villa'.
df['type'] = df[types].idxmax(axis=1).str[len('type_'):]
df.head(5)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa,type
0,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,0,penthouse
1,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,1,0,terrace-house
2,0.21574,0.66892,0.106747,0.0,0.715909,0.80415,1893.128856,0.0,-0.071577,-0.709981,...,0,0,0,0,0,0,0,0,0,penthouse
3,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,0,detached-house
4,1.035876,0.676133,0.112097,0.0,-0.74281,-0.302241,1422.882194,0.091805,0.202417,-0.025255,...,0,0,0,0,0,0,0,0,0,flat


In [4]:
# Split the dataset into features (X) and the target variable (y).
# Dropping only the one-hot `types` columns is not enough: the derived label
# column `type` would stay inside X, leaking the target into the features and
# handing the classifier a string column.
# 'Unnamed: 0' is excluded as well - NOTE(review): it appears to be the row index
# that was written to the CSV (its values mirror the row number in the previews);
# confirm it carries no real information.
non_feature_cols = set(types) | {'type', 'Unnamed: 0'}
X = df.drop(columns=[col for col in df.columns if col in non_feature_cols])
y = df['type']

# select n random features
def random_columns(n: int, df, y: list, random_state=None):
    """Select ``n`` random feature columns and drop rows with missing values.

    Parameters
    ----------
    n : int
        Number of columns to draw (without replacement) from ``df``.
    df : pandas.DataFrame
        Feature table to sample the columns from. It is not modified.
    y : pandas.Series or list-like
        Target values, one per row of ``df``. A Series must share the index
        of ``df``; any other list-like is matched to the rows by position.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the column choice can be
        reproduced. ``None`` (the default) draws a fresh random selection.

    Returns
    -------
    X : pandas.DataFrame
        The ``n`` sampled columns, restricted to rows without missing values.
    y : pandas.Series
        The target values of exactly those rows, as a 1-D Series.

    Raises
    ------
    ValueError
        Raised by pandas when ``n`` exceeds the number of columns in ``df``.
    """
    # axis=1 samples columns instead of rows
    X = df.sample(n, axis=1, random_state=random_state)
    if not isinstance(y, pd.Series):
        # Give plain lists/arrays the same row labels as X so both can be filtered together
        y = pd.Series(y, index=X.index)
    # One boolean per ROW: keep it only if every sampled feature and the target are present
    mask = X.notna().all(axis=1) & y.notna()
    return X.loc[mask], y.loc[mask]




In [None]:
# Draw three random feature columns together with the matching target values
random_columns(n=3, df=X, y=y)

In [48]:
# Split into train and test data (80 % / 20 %).
# A fixed random_state makes the split - and every metric reported on it -
# reproducible across kernel restarts.
# NOTE(review): consider stratify=y to keep class proportions equal in both parts;
# it is left out here because it raises when a class has fewer than two samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


In [47]:
def knn(X_train: list, X_test: list, y_train: list, y_test: list, neighbors: int):
    """Fit a k-nearest-neighbours classifier and print its test-set evaluation.

    The classifier is trained on ``X_train`` / ``y_train`` with ``neighbors``
    neighbours and used to predict ``X_test``. The confusion matrix and then
    the classification report of the predictions against ``y_test`` are
    printed. Nothing is returned.
    """
    # fit() returns the estimator itself, so construction and training can be chained
    model = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Report: confusion matrix first, then the classification report
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, predictions))

In [None]:
# knn() has five required parameters; calling it without arguments raises a TypeError.
# Pass the train/test split; 5 neighbours matches scikit-learn's default n_neighbors.
knn(X_train, X_test, y_train, y_test, neighbors=5)