## Classification House-types
#### Anforderungen:
<p> Entwickle und vergleiche drei sinnvolle Modelle zur Klassifikation von Immobilien Objekten hinsichtlich `type`. </p>
<p> Was sind sinnvolle Metriken zur Messung der Genauigkeit der Vorhersage im vorliegenden Fall? Was ist zu beachten um eine gute Abschätzung des Fehlers für neue Daten zu bekommen? </p>
<p> Rapportiere diese Metrik(en) mit einer Abschätzung des Fehlers für alle drei Modelle </p>

In [63]:
# import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import model_selection, svm, ensemble, tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
import random
import xgboost as xgb
from boruta import BorutaPy


In [64]:
# Load the prepared ImmoScout dataset and preview its last five rows.
# NOTE(review): the file name suggests robust scaling was applied upstream --
# TODO confirm against the preprocessing notebook.
df = pd.read_csv('../data/model/immoscout_robust.csv')
df.tail()

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
13308,-0.214762,0.614418,0.482482,0.0,-0.066882,0.825061,0.0,0.0,-0.007174,0.214048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13309,0.031322,0.659631,0.467615,0.0,0.684524,0.537781,108.488669,0.0,0.527702,0.11105,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13310,-0.214762,0.812664,0.476216,0.0,-0.005971,0.720047,0.0,0.0,-0.019672,0.78985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13311,0.291106,0.703465,0.506514,0.0,0.060185,0.243838,0.0,0.0,0.896919,0.279233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13312,0.521797,0.755503,0.512116,0.0,-1.150419,0.255263,0.0,0.0,-0.292978,-0.287039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Rebuild a single categorical target column `type` from the one-hot encoded
# `type_*` indicator columns.
# Select by the `type_` prefix rather than by the substring 'type': this agrees
# with get_random_column_names and keeps the cell re-runnable (on a second run
# the derived string column `type` must not be passed to idxmax).
TYPE_PREFIX = 'type_'
types = [col for col in df.columns if col.startswith(TYPE_PREFIX)]
# idxmax yields, per row, the name of the column holding the row maximum.
# NOTE(review): a row whose indicators are all 0 would silently receive the
# first type column as its label -- verify that no such rows exist.
df['type'] = df[types].idxmax(axis=1)
# Strip only the leading prefix; a plain str.replace would also remove a
# 'type_' occurring inside a label.
df['type'] = df['type'].str[len(TYPE_PREFIX):]

In [66]:
# Display all column labels of df (evaluated as the cell's output).
df.columns

Index(['ForestDensityM', 'Latitude', 'Longitude', 'NoisePollutionRailwayM',
       'NoisePollutionRoadM', 'PopulationDensityM', 'RiversAndLakesM',
       'RiversAndLakesS', 'distanceToTrainStation',
       'gde_area_agriculture_percentage', 'gde_area_forest_percentage',
       'gde_area_nonproductive_percentage', 'gde_average_house_hold',
       'gde_empty_apartments', 'gde_foreigners_percentage',
       'gde_new_homes_per_1000', 'gde_politics_bdp', 'gde_politics_cvp',
       'gde_politics_evp', 'gde_politics_fdp', 'gde_politics_glp',
       'gde_politics_gps', 'gde_politics_pda', 'gde_politics_rights',
       'gde_politics_sp', 'gde_politics_svp', 'gde_pop_per_km2',
       'gde_population', 'gde_social_help_quota', 'gde_tax',
       'gde_workers_sector1', 'rooms', 'Floor space_m2', 'Plot area_m2',
       'living_space_m2', 'price', 'type_attic-flat', 'type_attic-room',
       'type_castle', 'type_chalet', 'type_detached-house',
       'type_detached-secondary-suite', 'type_duplex-mais

In [67]:
# Add the derived target column `type` to the list of one-hot type columns so
# that the complete list can be removed from the feature matrix afterwards.
# Guarded so that re-running this cell does not append duplicates.
if 'type' not in types:
    types.append('type')
types

['type_attic-flat',
 'type_attic-room',
 'type_castle',
 'type_chalet',
 'type_detached-house',
 'type_detached-secondary-suite',
 'type_duplex-maisonette',
 'type_farmhouse',
 'type_flat',
 'type_furnished-residential-property',
 'type_loft',
 'type_penthouse',
 'type_rustico',
 'type_secondary-suite',
 'type_semi-detached-house',
 'type_single-room',
 'type_stepped-apartment',
 'type_stepped-house',
 'type_studio',
 'type_terrace-house',
 'type_villa',
 'type']

In [68]:
# Target vector: the categorical house type.
y = df['type']
# Feature matrix: every column except the one-hot indicators and the target.
X = df.drop(columns=types)
# Strip the same columns from the working frame as well.
df = df.drop(columns=types)

In [69]:
def train_test_split(X, y, train_size=0.6, random_state=42):
    """Split features and target into a train and a test partition.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    NOTE: this definition shadows the ``train_test_split`` imported from
    ``sklearn.model_selection`` at the top of the file; after this cell runs,
    the bare name refers to this wrapper.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        train_size: Fraction of samples assigned to the training partition.
            Defaults to 0.6, the value that was previously hard-coded.
        random_state: Seed for the shuffling, kept at 42 by default so
            existing calls return the same split as before.

    Returns:
        ``[X_train, X_test, y_train, y_test]`` as returned by scikit-learn.
    """
    return model_selection.train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

In [70]:
# Select a random subset of feature columns
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool) -> list:
    """Draw a random selection of column names.

    Columns whose name starts with ``"type_"`` are never sampled: they are
    either all included (``add_type_columns=True``) or all left out. Every
    other column is kept independently when a fresh ``random.random()`` draw
    is below ``percentage``.

    Args:
        columns: Candidate column names.
        percentage: Per-column keep probability for non-``type_`` columns.
        add_type_columns: Whether to include all ``type_*`` columns.

    Returns:
        The selected ``type_*`` columns followed by the sampled regular
        columns, each group in input order.
    """
    prefix = "type_"

    chosen_types = []
    for name in columns:
        if name.startswith(prefix) and add_type_columns:
            chosen_types.append(name)

    chosen_regular = []
    for name in columns:
        if name.startswith(prefix):
            continue
        # One random draw per regular column, in input order.
        if random.random() < percentage:
            chosen_regular.append(name)

    return chosen_types + chosen_regular

In [71]:
# K-nearest-neighbour
def knn(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 5-nearest-neighbour classifier and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the keys ``type`` (model name), ``columns`` and
        ``num_columns`` (features used), ``score`` and ``accuracy`` (both the
        test accuracy) and ``model`` (the fitted estimator).
    """
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # A classifier's .score() is the mean accuracy of .predict(); reuse the
    # predictions instead of paying for a second full prediction pass.
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "K-Nearest-Neighbour",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [72]:
# Support Vector Machine (TODO: other kernels should be tried)
def svm(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit an RBF-kernel support vector classifier and evaluate it on the test split.

    NOTE: defining this function rebinds the name ``svm`` and thereby hides
    the ``sklearn.svm`` module imported at the top of the file.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the keys ``type`` (model name), ``columns`` and
        ``num_columns`` (features used), ``score`` and ``accuracy`` (both the
        test accuracy) and ``model`` (the fitted estimator).
    """
    model = SVC(kernel='rbf')
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # A classifier's .score() is the mean accuracy of .predict(); reuse the
    # predictions instead of paying for a second full prediction pass.
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "SVM",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [73]:
# Random Forest Classifier (TODO: max depth and other hyper-parameters should be tried)
def random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 100-tree Gini random forest and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A result record with the keys ``type`` (model name), ``columns`` and
        ``num_columns`` (features used), ``score`` and ``accuracy`` (both the
        test accuracy) and ``model`` (the fitted estimator).
    """
    model = ensemble.RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # A classifier's .score() is the mean accuracy of .predict(); reuse the
    # predictions instead of paying for a second full prediction pass.
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "RandomForest",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [74]:
# XGBoost Classifier
def xg_boost(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a 100-round XGBoost classifier and evaluate it on the test split.

    The string labels are integer-encoded for training. The encoder is fitted
    on the training labels only; predictions are mapped back to the original
    labels before scoring. Fitting a second encoding on ``y_test`` (as was
    done before) assigns different integer codes whenever the two splits do
    not contain exactly the same set of classes, which makes the reported
    accuracy meaningless.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels. Labels that never occur in ``y_train`` can never
            be predicted and therefore simply count as errors.

    Returns:
        A result record with the keys ``type`` (model name), ``columns`` and
        ``num_columns`` (features used), ``score`` and ``accuracy`` (both the
        test accuracy) and ``model`` (the fitted estimator, which predicts
        integer class codes).
    """
    labelencoder = LabelEncoder()
    y_train_encoded = labelencoder.fit_transform(y_train)
    model = xgb.XGBClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train_encoded)
    # Decode the predicted class codes with the *training* mapping and compare
    # in label space, so train and test can never be encoded inconsistently.
    y_predict = labelencoder.inverse_transform(model.predict(X_test))
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "XGBoost",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [75]:
# Split the data into train and test partitions.
# NOTE: this calls the wrapper defined in this notebook, which shadows the
# train_test_split imported from sklearn at the top of the file.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [76]:
# Settings for the random feature-subset experiment
NUM_ITERATIONS = 2              # outer repetitions of the experiment
MIN_PERCENTAGE_COLUMNS = 0.2    # lower bound of the per-column keep probability
MAX_PERCENTAGE_COLUMNS = 1      # upper bound of the per-column keep probability
TRAINING_FUNCTIONS = [knn, svm, random_forest, xg_boost]

# Each repetition runs every training function twice: once per
# add_type_columns setting (True / False).
total_runs = 2 * NUM_ITERATIONS * len(TRAINING_FUNCTIONS)
print("Total number of iterations:", total_runs)

Total number of iterations: 16


In [77]:
# Train every model on several randomly drawn feature subsets.
# Seed the stdlib RNG so the drawn subsets (and with them the result table)
# are reproducible; 42 matches the random_state used for the split and models.
random.seed(42)
results = []
for _ in range(NUM_ITERATIONS):
    for add_type_columns in [True, False]:
        # NOTE(review): X was built by dropping every type column, so X_train
        # holds no `type_*` columns and add_type_columns=True adds nothing;
        # the two settings only differ by their random draw.
        column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        if not column_names:
            # An empty draw would hand the estimators a frame without any
            # feature column; skip it instead of failing inside fit().
            continue
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        # Every model sees the same feature subset within one draw.
        for func in TRAINING_FUNCTIONS:
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

# Best-scoring runs first.
pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,accuracy,model,add_type_columns
6,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.639129,0.639129,"(DecisionTreeClassifier(max_features='sqrt', r...",False
2,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.634623,0.634623,"(DecisionTreeClassifier(max_features='sqrt', r...",True
3,XGBoost,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.603267,0.603267,"XGBClassifier(base_score=0.5, booster='gbtree'...",True
7,XGBoost,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.59801,0.59801,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
14,RandomForest,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.587683,0.587683,"(DecisionTreeClassifier(max_features='sqrt', r...",False
15,XGBoost,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.571911,0.571911,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
4,K-Nearest-Neighbour,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.552197,0.552197,KNeighborsClassifier(),False
12,K-Nearest-Neighbour,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.546752,0.546752,KNeighborsClassifier(),False
0,K-Nearest-Neighbour,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.53098,0.53098,KNeighborsClassifier(),True
10,RandomForest,"[ForestDensityM, NoisePollutionRailwayM, gde_a...",9,0.486106,0.486106,"(DecisionTreeClassifier(max_features='sqrt', r...",True


## Offene Punkte:
Random Forest dominiert

Try different SVM kernels, forest depths, etc.


### Feature selection with the best performing Models
BorutaPy tries different combinations of features and ranks the features.

In [78]:
# Disabled cell: the Boruta fit is wrapped in a string literal, so it is not
# executed; the notebook only echoes the string as the cell output.
# NOTE(review): no module-level `model` is assigned before this cell in the
# visible notebook -- an estimator has to be defined before re-enabling it.
'''selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=42, max_iter=200)
selector.fit(X_train.values, y_train.values)'''

"selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=42, max_iter=200)\nselector.fit(X_train.values, y_train.values)"

In [79]:
# Disabled cell (string literal, not executed): would reduce X_train to the
# columns kept by the fitted selector.
'''x_filtered = selector.transform(X_train.values)
x_filtered'''

'x_filtered = selector.transform(X_train.values)\nx_filtered'

In [80]:
# Disabled cell (string literal, not executed): would pair each column name
# with the selector's ranking_ and support_ entries and sort by ranking.
'''ranking = list(zip(X_train.columns, selector.ranking_, selector.support_))
ranking.sort(key=lambda x: x[1])
ranking'''

'ranking = list(zip(X_train.columns, selector.ranking_, selector.support_))\nranking.sort(key=lambda x: x[1])\nranking'

In [81]:
# Disabled cell (string literal, not executed): would turn the ranking list
# into a DataFrame with the columns 'Feature name', 'Ranking' and 'Support'.
'''#ranking as a dataframe
ranking_df = pd.DataFrame(ranking, columns=['Feature name', 'Ranking', 'Support'])
ranking_df'''

"#ranking as a dataframe\nranking_df = pd.DataFrame(ranking, columns=['Feature name', 'Ranking', 'Support'])\nranking_df"

### Model testing

In [82]:
# Load a feature ranking table from CSV (presumably the saved result of the
# disabled BorutaPy cells above -- TODO confirm).
ranking_df = pd.read_csv('ranking_features_borutapy.csv')

In [83]:
# Keep only the feature names whose 'Support' flag equals True.
is_supported = ranking_df['Support'].eq(True)
features = ranking_df.loc[is_supported, 'Feature name'].tolist()
features

['Latitude',
 'Longitude',
 'PopulationDensityM',
 'distanceToTrainStation',
 'gde_politics_svp',
 'gde_pop_per_km2',
 'rooms',
 'Floor space_m2',
 'Plot area_m2',
 'living_space_m2',
 'price']

In [84]:
# Restrict the feature matrix to the selected features and split again,
# this time with 80 % of the samples in the training partition.
X = X.loc[:, features]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [85]:
# Compare the split criteria of the random forest on the test partition.
models = ['gini', 'entropy', 'log_loss']   # criterion names, not estimators
score = []                                 # test accuracy per criterion

for criterion in models:
    # fit() returns the estimator itself, so construction and fit can be chained.
    model = ensemble.RandomForestClassifier(
        n_estimators=100,
        criterion=criterion,
        random_state=42,
    ).fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score.append(accuracy_score(y_test, y_predict))
models, score

(['gini', 'entropy', 'log_loss'],
 [0.6740518212542246, 0.6714232069095005, 0.6714232069095005])

#### Gini performs best

In [87]:
# Sweep the number of trees; record the test accuracy and tree-depth statistics.
estimators = [10, 20, 50, 100, 200, 500]
score = []
depth = []  # flat list: max, min and mean tree depth appended per forest size

for n_trees in estimators:
    # fit() returns the estimator itself, so construction and fit can be chained.
    model = ensemble.RandomForestClassifier(
        n_estimators=n_trees,
        criterion='gini',
        random_state=42,
    ).fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score.append(accuracy_score(y_test, y_predict))

    tree_depths = [fitted_tree.get_depth() for fitted_tree in model.estimators_]
    depth.extend([max(tree_depths), min(tree_depths), np.mean(tree_depths)])

estimators, score, depth

([10, 20, 50, 100, 200, 500],
 [0.6571535861809988,
  0.6646639128802103,
  0.6699211415696583,
  0.6740518212542246,
  0.6702966579046189,
  0.6725497559143823],
 [43,
  28,
  34.3,
  43,
  27,
  32.95,
  43,
  27,
  32.46,
  43,
  27,
  32.18,
  43,
  26,
  32.325,
  44,
  26,
  32.406])

##### We get the best result at around 100 trees in the random forest