## Classification House-types
#### Anforderungen:
<p> Entwickle und vergleiche drei sinnvolle Modelle zur Klassifikation von Immobilienobjekten hinsichtlich `type`. </p>
<p> Was sind sinnvolle Metriken zur Messung der Genauigkeit der Vorhersage im vorliegenden Fall? Was ist zu beachten, um eine gute Abschätzung des Fehlers für neue Daten zu bekommen? </p>
<p> Rapportiere diese Metrik(en) mit einer Abschätzung des Fehlers für alle drei Modelle. </p>

In [84]:
# import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import model_selection, svm, ensemble, tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
import random
import xgboost as xgb
#from boruta import BorutaPy


In [85]:
# Load the immoscout CSV and preview the first five rows
# (the file name suggests min-max scaled features -- TODO confirm)
df = pd.read_csv("../data/model/immoscout_min_max_v2.csv")
df.head(5)

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_workers_sector3_4,gde_workers_sector3_5,gde_workers_total_2,gde_workers_total_3,gde_workers_total_4,gde_workers_total_5,rooms_2,rooms_3,rooms_4,rooms_5
0,0.009716,0.001532,0.285714,0.754914,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,2.204691e-13,1.510722e-16,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,0.020408,0.002915,0.000416,5.9e-05
1,0.015504,0.000587,0.228571,0.754914,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,2.204691e-13,1.510722e-16,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,0.020408,0.002915,0.000416,5.9e-05
2,0.008992,0.005032,0.214286,0.426764,0.09593,0.001911,0.799258,0.468164,0.0,0.0,...,2.280957e-05,1.576327e-06,0.005103043,0.0003645388,2.604104e-05,1.860257e-06,0.020408,0.002915,0.000416,5.9e-05
3,0.015297,0.001009,0.207143,0.754914,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,2.204691e-13,1.510722e-16,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,0.020408,0.002915,0.000416,5.9e-05
4,0.014057,0.001619,0.142857,0.610095,0.279429,0.145835,0.803051,0.470341,0.0,0.0,...,2.043945e-11,4.345968e-14,8.15219e-06,2.327617e-08,6.64582e-11,1.897517e-13,0.020408,0.002915,0.000416,5.9e-05


In [86]:
# Collapse the dummy columns whose name contains 'type' into one label column `type`.
# The derived `type` column itself is excluded, so re-running this cell does not
# feed its string labels back into idxmax.
types = [col for col in df.columns if 'type' in col and col != 'type']
# Per row: take the name of the dummy column holding the largest value, then strip
# the 'type_unified_' prefix (literal match, no regex needed).
df['type'] = df[types].idxmax(axis=1).str.replace('type_unified_', '', regex=False)

In [88]:
# Square-root transform price_cleaned, min-max scale the result, then store its
# powers 1..5 as extra columns (price_cleaned_1 is identical to price_cleaned).
# Note: the column is overwritten in place, so this cell must run exactly once.
price_sqrt = df['price_cleaned'] ** 0.5
price_low, price_high = price_sqrt.min(), price_sqrt.max()
df['price_cleaned'] = (price_sqrt - price_low) / (price_high - price_low)
for power in range(1, 6):
    df[f'price_cleaned_{power}'] = df['price_cleaned'] ** power

In [92]:
# Target: the label column `type`; features: every other column of df.
# NOTE(review): only `type` is dropped, so the columns it was derived from
# (names containing 'type') are still part of X -- confirm they are excluded
# before fitting, otherwise the target leaks into the features.
y = df['type']
X = df.drop(columns='type')

In [97]:
# Hold out 20 % of the rows as test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [71]:
# K-nearest-neighbour
def knn(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series,
        n_neighbors: int = 5) -> dict:
    """Fit a k-nearest-neighbour classifier and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training labels.
        y_test: Test labels.
        n_neighbors: Number of neighbours passed to ``KNeighborsClassifier``
            (default 5, the value that was previously hard-coded).

    Returns:
        dict with the keys ``type``, ``columns``, ``num_columns``, ``score``,
        ``accuracy`` and ``model`` (the fitted classifier).
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)

    # A classifier's .score() is the accuracy of its predictions, so predict only
    # once and reuse the value for both keys instead of predicting twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "K-Nearest-Neighbour",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [72]:
# Support Vector Machine -- the kernel is a parameter so other kernels can be tried
def svm(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series,
        kernel: str = 'rbf') -> dict:
    """Fit a support vector classifier and evaluate it on the test split.

    Note: this function's name shadows the ``sklearn.svm`` module imported at the
    top of the notebook; the body only relies on the directly imported ``SVC``.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training labels.
        y_test: Test labels.
        kernel: Kernel passed to ``SVC`` (default ``'rbf'``, the value that was
            previously hard-coded).

    Returns:
        dict with the keys ``type``, ``columns``, ``num_columns``, ``score``,
        ``accuracy`` and ``model`` (the fitted classifier).
    """
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)

    # A classifier's .score() is the accuracy of its predictions, so predict only
    # once and reuse the value for both keys instead of predicting twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "SVM",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [73]:
# Random Forest Classifier -- tree count, criterion and max depth are parameters
def random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series,
                  n_estimators: int = 100, criterion: str = 'gini', max_depth=None,
                  random_state: int = 42) -> dict:
    """Fit a random forest classifier and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training labels.
        y_test: Test labels.
        n_estimators: Number of trees (default 100, as previously hard-coded).
        criterion: Split criterion (default ``'gini'``, as previously hard-coded).
        max_depth: Maximum tree depth; ``None`` (the default) leaves the depth
            unrestricted, which matches the previous behaviour.
        random_state: Seed for the forest (default 42, as previously hard-coded).

    Returns:
        dict with the keys ``type``, ``columns``, ``num_columns``, ``score``,
        ``accuracy`` and ``model`` (the fitted classifier).
    """
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    # A classifier's .score() is the accuracy of its predictions, so predict only
    # once and reuse the value for both keys instead of predicting twice.
    accuracy = accuracy_score(y_test, model.predict(X_test))

    return {
        "type": "RandomForest",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

In [74]:
# XGBoost Classifier
def xg_boost(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series,
             n_estimators: int = 100, random_state: int = 42) -> dict:
    """Fit an XGBoost classifier and evaluate it on the test split.

    The labels are integer-encoded for XGBoost. The encoder is fitted on the
    training labels only; previously it was re-fitted on ``y_test``, which assigns
    different integer codes whenever the two splits do not contain exactly the
    same set of classes and thereby corrupts the reported score.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training labels.
        y_test: Test labels.
        n_estimators: Number of boosting rounds (default 100, as previously
            hard-coded).
        random_state: Seed for the booster (default 42, as previously hard-coded).

    Returns:
        dict with the keys ``type``, ``columns``, ``num_columns``, ``score``,
        ``accuracy`` and ``model``. The fitted model predicts the integer codes
        produced by the label encoder, not the original labels.
    """
    labelencoder = LabelEncoder()
    y_train_encoded = labelencoder.fit_transform(y_train)

    model = xgb.XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train_encoded)

    # Decode the predictions back to the original labels and score against the raw
    # test labels. A test label that never occurred in training then simply counts
    # as a misclassification instead of shifting the encoding.
    y_predict = labelencoder.inverse_transform(model.predict(X_test))
    accuracy = accuracy_score(y_test, y_predict)

    return {
        "type": "XGBoost",
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": accuracy,
        'accuracy': accuracy,
        "model": model
    }

-----------------------------------------------------------------------------------------------------

In [76]:
# Constants
NUM_ITERATIONS = 2            # number of random column draws per add_type_columns setting
MIN_PERCENTAGE_COLUMNS = 0.2  # lower bound of the randomly drawn column fraction
MAX_PERCENTAGE_COLUMNS = 1    # upper bound of the randomly drawn column fraction
TRAINING_FUNCTIONS = [knn, svm, random_forest, xg_boost]

# Factor 2: the training loop runs every draw with add_type_columns True and False
total_iterations = NUM_ITERATIONS * len(TRAINING_FUNCTIONS) * 2
print("Total number of iterations:", total_iterations)

Total number of iterations: 16


In [77]:
# Train different Models
# Seed the stdlib RNG so the sampled column fractions are the same on every run.
random.seed(42)

results = []
for _ in range(NUM_ITERATIONS):
    for add_type_columns in [True, False]:
        # Uniform draw from [MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS]
        column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
        # NOTE(review): get_random_column_names is not defined in any visible cell;
        # it must exist in the kernel before this cell runs. If it draws from its
        # own random source, that source needs seeding as well -- TODO confirm.
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        # Every model is trained on the same column subset so the rows are comparable
        for func in TRAINING_FUNCTIONS:
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

# One row per trained model, best test score first
pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,accuracy,model,add_type_columns
6,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.639129,0.639129,"(DecisionTreeClassifier(max_features='sqrt', r...",False
2,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.634623,0.634623,"(DecisionTreeClassifier(max_features='sqrt', r...",True
3,XGBoost,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.603267,0.603267,"XGBClassifier(base_score=0.5, booster='gbtree'...",True
7,XGBoost,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.59801,0.59801,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
14,RandomForest,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.587683,0.587683,"(DecisionTreeClassifier(max_features='sqrt', r...",False
15,XGBoost,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.571911,0.571911,"XGBClassifier(base_score=0.5, booster='gbtree'...",False
4,K-Nearest-Neighbour,"[ForestDensityM, Latitude, Longitude, NoisePol...",25,0.552197,0.552197,KNeighborsClassifier(),False
12,K-Nearest-Neighbour,"[Longitude, NoisePollutionRoadM, PopulationDen...",19,0.546752,0.546752,KNeighborsClassifier(),False
0,K-Nearest-Neighbour,"[ForestDensityM, Latitude, Longitude, NoisePol...",28,0.53098,0.53098,KNeighborsClassifier(),True
10,RandomForest,"[ForestDensityM, NoisePollutionRailwayM, gde_a...",9,0.486106,0.486106,"(DecisionTreeClassifier(max_features='sqrt', r...",True


## Offene Punkte:
Random Forest dominiert.

Try different kernels, forest depths, etc.


### Feature selection with the best performing Models
BorutaPy tries different combinations of features and ranks the features.

In [78]:
# Disabled BorutaPy fit: the code is wrapped in a string literal, so nothing is
# executed and the cell only echoes the text (the BorutaPy import at the top of the
# notebook is commented out as well).
'''selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=42, max_iter=200)
selector.fit(X_train.values, y_train.values)'''

"selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=42, max_iter=200)\nselector.fit(X_train.values, y_train.values)"

In [79]:
# Disabled: would apply the fitted selector to X_train. Kept as a string literal,
# so nothing is executed and the cell only echoes the text.
'''x_filtered = selector.transform(X_train.values)
x_filtered'''

'x_filtered = selector.transform(X_train.values)\nx_filtered'

In [80]:
# Disabled: would pair each X_train column with the selector's ranking_ and support_
# values and sort the pairs by rank. Kept as a string literal, so nothing is executed.
'''ranking = list(zip(X_train.columns, selector.ranking_, selector.support_))
ranking.sort(key=lambda x: x[1])
ranking'''

'ranking = list(zip(X_train.columns, selector.ranking_, selector.support_))\nranking.sort(key=lambda x: x[1])\nranking'

In [81]:
# Disabled: would turn the ranking list into a DataFrame with the columns
# 'Feature name', 'Ranking' and 'Support'. Kept as a string literal, so nothing is executed.
'''#ranking as a dataframe
ranking_df = pd.DataFrame(ranking, columns=['Feature name', 'Ranking', 'Support'])
ranking_df'''

"#ranking as a dataframe\nranking_df = pd.DataFrame(ranking, columns=['Feature name', 'Ranking', 'Support'])\nranking_df"

### Model testing

In [82]:
# Load the feature ranking from disk
# (the file name suggests it is the saved BorutaPy output -- TODO confirm)
ranking_df = pd.read_csv("ranking_features_borutapy.csv")

In [83]:
# Names of all features whose 'Support' flag is True in the ranking table
is_supported = ranking_df['Support'] == True
features = ranking_df.loc[is_supported, 'Feature name'].tolist()
features

['Latitude',
 'Longitude',
 'PopulationDensityM',
 'distanceToTrainStation',
 'gde_politics_svp',
 'gde_pop_per_km2',
 'rooms',
 'Floor space_m2',
 'Plot area_m2',
 'living_space_m2',
 'price']

In [84]:
# Restrict the feature matrix to the selected features and redo the 80/20 split.
# Note: this overwrites X and the earlier X_train / X_test / y_train / y_test.
X = X.loc[:, features]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [85]:
# Compare the split criteria of the random forest by test-set accuracy.
# NOTE(review): the criterion is picked on the same test split that is used for
# reporting, so the reported accuracy is likely optimistic -- consider
# cross-validation on the training data.
models = ['gini', 'entropy', 'log_loss']
score = []

for criterion in models:
    model = ensemble.RandomForestClassifier(n_estimators=100, criterion=criterion, random_state=42)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score.append(accuracy_score(y_test, y_predict))

models, score

(['gini', 'entropy', 'log_loss'],
 [0.6740518212542246, 0.6714232069095005, 0.6714232069095005])

#### Gini performs best

In [99]:
# Sweep the number of trees and record test accuracy plus tree-depth statistics
estimators = [10, 20, 50, 100, 200, 500]
score = []
# Flat list: for every entry of `estimators`, three values are appended in the
# order max depth, min depth, mean depth.
depth = []

for n_trees in estimators:
    # n_jobs=-1 builds the trees on all available cores; the sweep fits forests of
    # up to 500 trees and is slow when run on a single core.
    model = ensemble.RandomForestClassifier(
        n_estimators=n_trees, criterion='gini', random_state=42, n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score.append(accuracy_score(y_test, y_predict))

    iteration_depth = [estimator.get_depth() for estimator in model.estimators_]
    depth.extend([max(iteration_depth), min(iteration_depth), np.mean(iteration_depth)])

estimators, score, depth

KeyboardInterrupt: 

##### We get the best result at around 100 trees in the random forest