### Immobilienrechner - Modellentwicklung

[Immobilienrechner - Main](./immo_main.ipynb)<br>
[Immobilienrechner - Webscraper](./immo_scrap.ipynb)<br>
[Immobilienrechner - Explorative Datenanalyse](./immo_eda.ipynb)<br>
[Immobilienrechner - Bereitstellung des besten Preisvorhersagemodells per Webinterface](./flask/immo_flask.ipynb)

In [37]:
import numpy as np
import pandas as pd
import os
import sys
from random import randint
from time import sleep
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import joblib
%matplotlib inline

#### Datensätze laden

In [2]:
df_man = pd.read_csv('df_man.csv')
df_quant = pd.read_csv('df_quant.csv')
print(df_man.shape, df_quant.shape)

(786, 7) (869, 7)


#### Feature Sets erzeugen inklusive one hot encoding

In [3]:
df_man = df_man.join(pd.get_dummies(df_man.stadtteil))
df_quant = df_quant.join(pd.get_dummies(df_quant.stadtteil))

features_all_man = df_man.drop(['plz','stadtteil','a_name','link','preis'], axis=1)
features_groesse_man = df_man.groesse.to_frame()
features_steil_man = df_man.drop(['plz','stadtteil','a_name','link','groesse','zimmer','preis'], axis=1)
target_man = df_man.preis.to_frame()

features_all_q = df_quant.drop(['plz','stadtteil','a_name','link','preis'], axis=1) 
features_groesse_q = df_quant.groesse.to_frame()
features_steil_q = df_quant.drop(['plz','stadtteil','a_name','link','groesse','zimmer','preis'], axis=1)
target_q = df_quant.preis.to_frame()

#### Test- und Trainingssets erzeugen

In [5]:
X_train_all_man, X_test_all_man, y_train_all_man, y_test_all_man = \
                                                train_test_split(
                                                    features_all_man, 
                                                    target_man,   
                                                    test_size=0.2,
                                                    random_state=42)
X_train_groesse_man, X_test_groesse_man, y_train_groesse_man, y_test_groesse_man = \
                                                train_test_split(
                                                    features_groesse_man, 
                                                    target_man,   
                                                    test_size=0.2,
                                                    random_state=42)
X_train_steil_man, X_test_steil_man, y_train_steil_man, y_test_steil_man = \
                                                train_test_split(
                                                    features_steil_man, 
                                                    target_man,   
                                                    test_size=0.2,
                                                    random_state=42)

X_train_all_q, X_test_all_q, y_train_all_q, y_test_all_q = \
                                                train_test_split(
                                                    features_all_q, 
                                                    target_q,   
                                                    test_size=0.2,
                                                    random_state=42)
X_train_groesse_q, X_test_groesse_q, y_train_groesse_q, y_test_groesse_q = \
                                                train_test_split(
                                                    features_groesse_q, 
                                                    target_q,   
                                                    test_size=0.2,
                                                    random_state=42)
X_train_steil_q, X_test_steil_q, y_train_steil_q, y_test_steil_q = \
                                                train_test_split(
                                                    features_steil_q, 
                                                    target_q,   
                                                    test_size=0.2,
                                                    random_state=42)

#### Modellerzeugung Lineare Regression

In [6]:
reg_all_man = LinearRegression()
reg_groesse_man = LinearRegression()
reg_steil_man = LinearRegression()

reg_all_man.fit(X_train_all_man, y_train_all_man)
reg_groesse_man.fit(X_train_groesse_man, y_train_groesse_man)
reg_steil_man.fit(X_train_steil_man, y_train_steil_man)

reg_all_q = LinearRegression()
reg_groesse_q = LinearRegression()
reg_steil_q = LinearRegression()

reg_all_q.fit(X_train_all_q, y_train_all_q)
reg_groesse_q.fit(X_train_groesse_q, y_train_groesse_q)
reg_steil_q.fit(X_train_steil_q, y_train_steil_q)

LinearRegression()

In [7]:
reg_r2_man = {}

reg_r2_man['all_train'] = r2_score(y_train_all_man, reg_all_man.predict(X_train_all_man))
reg_r2_man['groesse_train'] = r2_score(y_train_groesse_man, reg_groesse_man.predict(X_train_groesse_man))
reg_r2_man['steil_train'] = r2_score(y_train_steil_man, reg_steil_man.predict(X_train_steil_man))

reg_r2_man['all_test'] = r2_score(y_test_all_man, reg_all_man.predict(X_test_all_man))
reg_r2_man['groesse_test'] = r2_score(y_test_groesse_man, reg_groesse_man.predict(X_test_groesse_man))
reg_r2_man['steil_test'] = r2_score(y_test_steil_man, reg_steil_man.predict(X_test_steil_man))

print('R2 score fuer Regression (manuell):\n',reg_r2_man)

R2 score fuer Regression (manuell):
 {'all_train': 0.5425116549708868, 'groesse_train': 0.3402521275944036, 'steil_train': 0.22341808014079545, 'all_test': 0.5702036057356301, 'groesse_test': 0.4805108997989619, 'steil_test': -1.9295787697966859e+25}


In [8]:
reg_r2_q = {}

reg_r2_q['all_train'] = r2_score(y_train_all_q, reg_all_q.predict(X_train_all_q))
reg_r2_q['groesse_train'] = r2_score(y_train_groesse_q, reg_groesse_q.predict(X_train_groesse_q))
reg_r2_q['steil_train'] = r2_score(y_train_steil_q, reg_steil_q.predict(X_train_steil_q))

reg_r2_q['all_test'] = r2_score(y_test_all_q, reg_all_q.predict(X_test_all_q))
reg_r2_q['groesse_test'] = r2_score(y_test_groesse_q, reg_groesse_q.predict(X_test_groesse_q))
reg_r2_q['steil_test'] = r2_score(y_test_steil_q, reg_steil_q.predict(X_test_steil_q))

print('R2 score fuer Regression (quantile):\n',reg_r2_q)

R2 score fuer Regression (quantile):
 {'all_train': 0.5485412942914323, 'groesse_train': 0.3885998860636658, 'steil_train': 0.2019804550217028, 'all_test': 0.6571238975017322, 'groesse_test': 0.4169173251759454, 'steil_test': -2.164430921351472e+25}


In [9]:
print('LinearRegression Vorhersagen:')

print('Manuell: Predict fuer Modell Groesse fuer 50qm:',reg_groesse_man.predict([[50]])[0][0])
print('Manuell: Predict fuer Modell Groesse fuer 75qm:',reg_groesse_man.predict([[75]])[0][0])
print('Manuell: Predict fuer Modell Groesse fuer 100qm:',reg_groesse_man.predict([[100]])[0][0])

print('Q: Predict fuer Modell Groesse fuer 50qm:',reg_groesse_q.predict([[50]])[0][0])
print('Q: Predict fuer Modell Groesse fuer 75qm:',reg_groesse_q.predict([[75]])[0][0])
print('Q: Predict fuer Modell Groesse fuer 100qm:',reg_groesse_q.predict([[100]])[0][0])

dummy = [0 for i in range(X_train_all_man.shape[1]-2)]
steil_buch = dummy.copy()
steil_buch[X_test_steil_man.columns.to_list().index('Buch')] = 1
steil_char = dummy.copy()
steil_char[X_test_steil_man.columns.to_list().index('Charlottenburg')] = 1
steil_xberg = dummy.copy()
steil_xberg[X_test_steil_man.columns.to_list().index('Kreuzberg')] = 1

print('Manuell: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden):', 
      reg_steil_man.predict([steil_buch])[0][0])
print('Manuell: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden):', 
      reg_steil_man.predict([steil_xberg])[0][0])
print('Manuell: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden):', 
      reg_steil_man.predict([steil_char])[0][0])

print('Q: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden):', 
      reg_steil_q.predict([steil_buch])[0][0])
print('Q: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden):', 
      reg_steil_q.predict([steil_xberg])[0][0])
print('Q: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden):', 
      reg_steil_q.predict([steil_char])[0][0])

print('Manuell: Predict fuer Modell all_features mit 50qm, 2 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_man.predict([ [50, 2] + steil_xberg])[0][0])
print('Manuell: Predict fuer Modell all_features mit 75qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_man.predict([ [75, 3] + steil_xberg])[0][0])
print('Manuell: Predict fuer Modell all_features mit 100qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_man.predict([ [100, 3] + steil_xberg])[0][0])

print('Manuell: Predict fuer Modell all_features mit 50qm, 2 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_man.predict([ [50, 2] + steil_char])[0][0])
print('Manuell: Predict fuer Modell all_features mit 75qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_man.predict([ [75, 3] + steil_char])[0][0])
print('Manuell: Predict fuer Modell all_features mit 100qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_man.predict([ [100, 3] + steil_char])[0][0])

print('Q: Predict fuer Modell all_features mit 50qm, 2 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_q.predict([ [50, 2] + steil_xberg])[0][0])
print('Q: Predict fuer Modell all_features mit 75qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_q.predict([ [75, 3] + steil_xberg])[0][0])
print('Q: Predict fuer Modell all_features mit 100qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      reg_all_q.predict([ [100, 3] + steil_xberg])[0][0])

print('Q: Predict fuer Modell all_features mit 50qm, 2 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_q.predict([ [50, 2] + steil_char])[0][0])
print('Q: Predict fuer Modell all_features mit 75qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_q.predict([ [75, 3] + steil_char])[0][0])
print('Q: Predict fuer Modell all_features mit 100qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      reg_all_q.predict([ [100, 3] + steil_char])[0][0])

LinearRegression Vorhersagen:
Manuell: Predict fuer Modell Groesse fuer 50qm: 296396.21071312984
Manuell: Predict fuer Modell Groesse fuer 75qm: 446239.38879518135
Manuell: Predict fuer Modell Groesse fuer 100qm: 596082.5668772329
Q: Predict fuer Modell Groesse fuer 50qm: 290202.15746575507
Q: Predict fuer Modell Groesse fuer 75qm: 457885.5519555693
Q: Predict fuer Modell Groesse fuer 100qm: 625568.9464453836
Manuell: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden): 215040.0
Manuell: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden): 522240.0
Manuell: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden): 462848.0
Q: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden): 235520.0
Q: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden): 495616.0
Q: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden): 458752.0
Manuell: Predict fuer Modell all_features mit 50qm, 2 Zi für Kreuzb

#### Modellerzeugung Gradient Boosting Regressor

In [10]:
gbr_all_man = GradientBoostingRegressor()
gbr_groesse_man = GradientBoostingRegressor()
gbr_steil_man = GradientBoostingRegressor()

res = gbr_all_man.fit(X_test_all_man, y_test_all_man)
gbr_groesse_man.fit(X_test_groesse_man, y_test_groesse_man)
gbr_steil_man.fit(X_test_steil_man, y_test_steil_man)

gbr_all_q = GradientBoostingRegressor()
gbr_groesse_q = GradientBoostingRegressor()
gbr_steil_q = GradientBoostingRegressor()

gbr_all_q.fit(X_test_all_q, y_test_all_q)
gbr_groesse_q.fit(X_test_groesse_q, y_test_groesse_q)
gbr_steil_q.fit(X_test_steil_q, y_test_steil_q)

GradientBoostingRegressor()

In [11]:
gbr_r2_man = {}

gbr_r2_man['all_train'] = r2_score(y_train_all_man, gbr_all_man.predict(X_train_all_man))
gbr_r2_man['groesse_train'] = r2_score(y_train_groesse_man, gbr_groesse_man.predict(X_train_groesse_man))
gbr_r2_man['steil_train'] = r2_score(y_train_steil_man, gbr_steil_man.predict(X_train_steil_man))

gbr_r2_man['all_test'] = r2_score(y_test_all_man, gbr_all_man.predict(X_test_all_man))
gbr_r2_man['groesse_test'] = r2_score(y_test_groesse_man, gbr_groesse_man.predict(X_test_groesse_man))
gbr_r2_man['steil_test'] = r2_score(y_test_steil_man, gbr_steil_man.predict(X_test_steil_man))

print('R2 score fuer GradientBR (manuell):\n',gbr_r2_man)

R2 score fuer GradientBR (manuell):
 {'all_train': 0.31136501376543524, 'groesse_train': 0.029113631923103123, 'steil_train': 0.10946104100195819, 'all_test': 0.895424475351126, 'groesse_test': 0.8388528620164564, 'steil_test': 0.2530981952056309}


In [12]:
gbr_r2_q = {}

gbr_r2_q['all_train'] = r2_score(y_train_all_q, gbr_all_q.predict(X_train_all_q))
gbr_r2_q['groesse_train'] = r2_score(y_train_groesse_q, gbr_groesse_q.predict(X_train_groesse_q))
gbr_r2_q['steil_train'] = r2_score(y_train_steil_q, gbr_steil_q.predict(X_train_steil_q))

gbr_r2_q['all_test'] = r2_score(y_test_all_q, gbr_all_q.predict(X_test_all_q))
gbr_r2_q['groesse_test'] = r2_score(y_test_groesse_q, gbr_groesse_q.predict(X_test_groesse_q))
gbr_r2_q['steil_test'] = r2_score(y_test_steil_q, gbr_steil_q.predict(X_test_steil_q))

print('R2 score fuer GradientBR (quantile):\n',gbr_r2_q)

R2 score fuer GradientBR (quantile):
 {'all_train': 0.45297675386897485, 'groesse_train': 0.1660272684027494, 'steil_train': 0.09328836835349441, 'all_test': 0.8856866011534149, 'groesse_test': 0.8061210748520713, 'steil_test': 0.38265102347050184}


In [14]:
print('GradientBoostingRegressor Vorhersagen:')

print('Manuell: Predict fuer Modell Groesse fuer 50qm:',gbr_groesse_man.predict([[50]])[0])
print('Manuell: Predict fuer Modell Groesse fuer 75qm:',gbr_groesse_man.predict([[75]])[0])
print('Manuell: Predict fuer Modell Groesse fuer 100qm:',gbr_groesse_man.predict([[100]])[0])

print('Q: Predict fuer Modell Groesse fuer 50qm:',gbr_groesse_q.predict([[50]])[0])
print('Q: Predict fuer Modell Groesse fuer 75qm:',gbr_groesse_q.predict([[75]])[0])
print('Q: Predict fuer Modell Groesse fuer 100qm:',gbr_groesse_q.predict([[100]])[0])

dummy = [0 for i in range(X_test_all_man.shape[1]-2)]
steil_buch = dummy.copy()
steil_buch[X_test_steil_man.columns.to_list().index('Buch')] = 1
steil_char = dummy.copy()
steil_char[X_test_steil_man.columns.to_list().index('Charlottenburg')] = 1
steil_xberg = dummy.copy()
steil_xberg[X_test_steil_man.columns.to_list().index('Kreuzberg')] = 1

print('Manuell: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden):', 
      gbr_steil_man.predict([steil_buch])[0])
print('Manuell: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden):', 
      gbr_steil_man.predict([steil_xberg])[0])
print('Manuell: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden):', 
      gbr_steil_man.predict([steil_char])[0])

print('Q: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden):', 
      gbr_steil_q.predict([steil_buch])[0])
print('Q: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden):', 
      gbr_steil_q.predict([steil_xberg])[0])
print('Q: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden):', 
      gbr_steil_q.predict([steil_char])[0])

print('Manuell: Predict fuer Modell all_features mit 50qm, 2 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_man.predict([ [50, 2] + steil_xberg])[0])
print('Manuell: Predict fuer Modell all_features mit 75qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_man.predict([ [75, 3] + steil_xberg])[0])
print('Manuell: Predict fuer Modell all_features mit 100qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_man.predict([ [100, 3] + steil_xberg])[0])

print('Manuell: Predict fuer Modell all_features mit 50qm, 2 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_man.predict([ [50, 2] + steil_char])[0])
print('Manuell: Predict fuer Modell all_features mit 75qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_man.predict([ [75, 3] + steil_char])[0])
print('Manuell: Predict fuer Modell all_features mit 100qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_man.predict([ [100, 3] + steil_char])[0])

print('Q: Predict fuer Modell all_features mit 50qm, 2 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_q.predict([ [50, 2] + steil_xberg])[0])
print('Q: Predict fuer Modell all_features mit 75qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_q.predict([ [75, 3] + steil_xberg])[0])
print('Q: Predict fuer Modell all_features mit 100qm, 3 Zi für Kreuzberg (wenige Daten vorhanden):', 
      gbr_all_q.predict([ [100, 3] + steil_xberg])[0])

print('Q: Predict fuer Modell all_features mit 50qm, 2 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_q.predict([ [50, 2] + steil_char])[0])
print('Q: Predict fuer Modell all_features mit 75qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_q.predict([ [75, 3] + steil_char])[0])
print('Q: Predict fuer Modell all_features mit 100qm, 3 Zi für Charlottenburg (einige Daten vorhanden):', 
      gbr_all_q.predict([ [100, 3] + steil_char])[0])

GradientBoostingRegressor Vorhersagen:
Manuell: Predict fuer Modell Groesse fuer 50qm: 305390.0249608745
Manuell: Predict fuer Modell Groesse fuer 75qm: 389446.74445063144
Manuell: Predict fuer Modell Groesse fuer 100qm: 887239.3552406145
Q: Predict fuer Modell Groesse fuer 50qm: 305281.11920124025
Q: Predict fuer Modell Groesse fuer 75qm: 477853.551553679
Q: Predict fuer Modell Groesse fuer 100qm: 508587.934015886
Manuell: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden): 454230.64692565193
Manuell: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden): 534512.1977706642
Manuell: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden): 454230.64692565193
Q: Predict fuer Modell Stadtteil für Buch (ganz wenige Daten vorhanden): 426704.1941785186
Q: Predict fuer Modell Stadtteil für Kreuzberg (wenige Daten vorhanden): 655071.3180880793
Q: Predict fuer Modell Stadtteil für Charlottenburg (einige Daten vorhanden): 479761.09537457896
Manu

#### Neues Feature Quadratmeterpreis einführen um mehrere Modelle für den selben Datensatz zu entwickeln

In [15]:
df_man['qmpreis'] = df_man.preis / df_man.groesse
df_man.head()

Unnamed: 0,plz,stadtteil,preis,groesse,zimmer,a_name,link,Biesdorf,Blankenfelde,Buch,...,Tegel,Tempelhof,Tiergarten,Treptow,Wedding,Weissensee,Westend,Wilmersdorf,Zehlendorf,qmpreis
0,10707,Wilmersdorf,440000,60.3,2.0,zentrale-lage-in-charlottenburg-bezugsfrei-sue...,https://www.ebay-kleinanzeigen.de/s-anzeige/ze...,0,0,0,...,0,0,0,0,0,0,0,1,0,7296.849088
1,12305,Tempelhof,485000,100.18,3.0,sonnige-grosszuegige-gartenwohnung-in-bester-l...,https://www.ebay-kleinanzeigen.de/s-anzeige/so...,0,0,0,...,0,1,0,0,0,0,0,0,0,4841.285686
2,13359,Gesundbrunnen,238000,68.0,3.0,eigentumswohnung-in-13359-berlin-gottschalkstr-,https://www.ebay-kleinanzeigen.de/s-anzeige/ei...,0,0,0,...,0,0,0,0,0,0,0,0,0,3500.0
3,12051,Neukölln,412000,67.0,3.0,provisionsfrei-bezugsfrei-3-zwhg-nahe-tempelho...,https://www.ebay-kleinanzeigen.de/s-anzeige/pr...,0,0,0,...,0,0,0,0,0,0,0,0,0,6149.253731
4,10318,Karlshorst,375255,88.5,3.0,nur-wenige-3-4-zi-wohnungen-uebrig-top-kapital...,https://www.ebay-kleinanzeigen.de/s-anzeige/nu...,0,0,0,...,0,0,0,0,0,0,0,0,0,4240.169492


In [18]:
df_nf = df_man.copy(deep=True)

df_nf['art'] = 'leer'
for c in range(len(df_nf)):
    if df_nf.iloc[c, df_nf.columns.get_loc('qmpreis')] < df_nf.qmpreis.mean() - 0.5 * df_nf.qmpreis.std():
        df_nf.iloc[c, df_nf.columns.get_loc('art')] = 'low'
    elif df_nf.iloc[c].qmpreis > df_nf.qmpreis.mean() + 0.5 * df_nf.qmpreis.std():
        df_nf.iloc[c, df_nf.columns.get_loc('art')] = 'high'
    else:
        df_nf.iloc[c, df_nf.columns.get_loc('art')] = 'mid'
df_nf = df_nf.join(pd.get_dummies(df_nf.art))

df_nf.head()

Unnamed: 0,plz,stadtteil,preis,groesse,zimmer,a_name,link,Biesdorf,Blankenfelde,Buch,...,Wedding,Weissensee,Westend,Wilmersdorf,Zehlendorf,qmpreis,art,high,low,mid
0,10707,Wilmersdorf,440000,60.3,2.0,zentrale-lage-in-charlottenburg-bezugsfrei-sue...,https://www.ebay-kleinanzeigen.de/s-anzeige/ze...,0,0,0,...,0,0,0,1,0,7296.849088,high,1,0,0
1,12305,Tempelhof,485000,100.18,3.0,sonnige-grosszuegige-gartenwohnung-in-bester-l...,https://www.ebay-kleinanzeigen.de/s-anzeige/so...,0,0,0,...,0,0,0,0,0,4841.285686,low,0,1,0
2,13359,Gesundbrunnen,238000,68.0,3.0,eigentumswohnung-in-13359-berlin-gottschalkstr-,https://www.ebay-kleinanzeigen.de/s-anzeige/ei...,0,0,0,...,0,0,0,0,0,3500.0,low,0,1,0
3,12051,Neukölln,412000,67.0,3.0,provisionsfrei-bezugsfrei-3-zwhg-nahe-tempelho...,https://www.ebay-kleinanzeigen.de/s-anzeige/pr...,0,0,0,...,0,0,0,0,0,6149.253731,mid,0,0,1
4,10318,Karlshorst,375255,88.5,3.0,nur-wenige-3-4-zi-wohnungen-uebrig-top-kapital...,https://www.ebay-kleinanzeigen.de/s-anzeige/nu...,0,0,0,...,0,0,0,0,0,4240.169492,low,0,1,0


#### Neue Dataframes erzeugen mit Klassifizierung low, mid, high 

In [21]:
df_high = df_nf[df_nf.high == 1]
df_mid = df_nf[df_nf.mid == 1]
df_low = df_nf[df_nf.low == 1]
print(f'Anzahl Datensätze: low:{len(df_low)} mid:{len(df_mid)} high:{len(df_high)}')

Anzahl Datensätze: low:305 mid:267 high:214


In [20]:
features_l = df_low.drop(['plz','stadtteil','a_name','link','preis','art','qmpreis','low','mid','high'], axis=1)
target_l = df_low.preis.to_frame()
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
                                                    features_l, 
                                                    target_l,   
                                                    test_size=0.2,
                                                    random_state=42)
features_m = df_mid.drop(['plz','stadtteil','a_name','link','preis','art','qmpreis','low','mid','high'], axis=1)
target_m = df_mid.preis.to_frame()
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
                                                    features_m, 
                                                    target_m,   
                                                    test_size=0.2,
                                                    random_state=42)
features_h = df_high.drop(['plz','stadtteil','a_name','link','preis','art','qmpreis','low','mid','high'], axis=1)
target_h = df_high.preis.to_frame()
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
                                                    features_h, 
                                                    target_h,   
                                                    test_size=0.2,
                                                    random_state=42)
print(f'Anzahl Test-Datensätze: low:{len(X_train_l)} mid:{len(X_train_m)} high:{len(X_train_h)}')

Anzahl Test-Datensätze: low:244 mid:213 high:171


In [22]:
reg_l = LinearRegression()
reg_m = LinearRegression()
reg_h = LinearRegression()

reg_l.fit(X_train_h, y_train_h)
reg_m.fit(X_train_h, y_train_h)
reg_h.fit(X_train_h, y_train_h)

LinearRegression()

In [23]:
reg_r2 = {}

reg_r2['reg_train_low'] = r2_score(y_train_l, reg_l.predict(X_train_l))
reg_r2['reg_train_mid'] = r2_score(y_train_m, reg_m.predict(X_train_m))
reg_r2['reg_train_high'] = r2_score(y_train_h, reg_h.predict(X_train_h))

reg_r2['reg_test_low'] = r2_score(y_test_l, reg_l.predict(X_test_l))
reg_r2['reg_test_mid'] = r2_score(y_test_m, reg_m.predict(X_test_m))
reg_r2['reg_test_high'] = r2_score(y_test_h, reg_h.predict(X_test_h))

reg_r2

{'reg_train_low': -5.6461303523862374e+20,
 'reg_train_mid': -6.517158352741493e+19,
 'reg_train_high': 0.7283476856555764,
 'reg_test_low': -5.736000241870621e+20,
 'reg_test_mid': -5.18014902230448e+19,
 'reg_test_high': 0.7003480362382049}

In [24]:
reg_mae = {}

reg_mae['reg_mae_low'] = mean_absolute_error(reg_l.predict(X_test_l), y_test_l)
reg_mae['reg_mae_mid'] = mean_absolute_error(reg_m.predict(X_test_m), y_test_m)
reg_mae['reg_mae_high'] = mean_absolute_error(reg_h.predict(X_test_h), y_test_h)

reg_mae

{'reg_mae_low': 711974524738922.4,
 'reg_mae_mid': 235428035699501.12,
 'reg_mae_high': 81577.51162790698}

In [25]:
gbr_l = GradientBoostingRegressor()
gbr_m = GradientBoostingRegressor()
gbr_h = GradientBoostingRegressor()

gbr_l.fit(X_test_l, y_test_l)
gbr_m.fit(X_test_m, y_test_m)
gbr_h.fit(X_test_h, y_test_h)

GradientBoostingRegressor()

In [26]:
gbr_r2 = {}

gbr_r2['gbr_train_low'] = r2_score(y_train_l, gbr_l.predict(X_train_l))
gbr_r2['gbr_train_mid'] = r2_score(y_train_m, gbr_m.predict(X_train_m))
gbr_r2['gbr_train_high'] = r2_score(y_train_h, gbr_h.predict(X_train_h))

gbr_r2['gbr_test_low'] = r2_score(y_test_l, gbr_l.predict(X_test_l))
gbr_r2['gbr_test_mid'] = r2_score(y_test_m, gbr_m.predict(X_test_m))
gbr_r2['gbr_test_high'] = r2_score(y_test_h, gbr_h.predict(X_test_h))

gbr_r2

{'gbr_train_low': 0.7257421316917949,
 'gbr_train_mid': 0.8506100627184564,
 'gbr_train_high': 0.6306504998956466,
 'gbr_test_low': 0.9879542156236346,
 'gbr_test_mid': 0.993524860383692,
 'gbr_test_high': 0.9909989184198951}

In [27]:
gbr_mae = {}

gbr_mae['gbr_mae_low'] = mean_absolute_error(y_test_l, gbr_l.predict(X_test_l))
gbr_mae['gbr_mae_mid'] = mean_absolute_error(y_test_m, gbr_m.predict(X_test_m))
gbr_mae['gbr_mae_high'] = mean_absolute_error(y_test_h, gbr_h.predict(X_test_h))


gbr_mae

{'gbr_mae_low': 6338.271428719549,
 'gbr_mae_mid': 8205.64259311156,
 'gbr_mae_high': 13580.9563914444}

In [31]:
lasso_l = Lasso(alpha=1.0,
    fit_intercept=True,
    normalize=True,
    precompute=False,
    copy_X=True,
    max_iter=1000,
    tol=0.0001,
    warm_start=False,
    positive=False,
    random_state=42,
    selection='cyclic').fit(X_train_l, y_train_l)

lasso_m = Lasso(alpha=1.0,
    fit_intercept=True,
    normalize=True,
    precompute=False,
    copy_X=True,
    max_iter=1000,
    tol=0.0001,
    warm_start=False,
    positive=False,
    random_state=42,
    selection='cyclic').fit(X_train_m, y_train_m)

lasso_h = Lasso(alpha=1.0,
    fit_intercept=True,
    normalize=True,
    precompute=False,
    copy_X=True,
    max_iter=1000,
    tol=0.0001,
    warm_start=False,
    positive=False,
    random_state=42,
    selection='cyclic').fit(X_train_h, y_train_h)

In [32]:
lasso_r2 = {}

lasso_r2['lasso_train_low'] = r2_score(y_train_l, lasso_l.predict(X_train_l))
lasso_r2['lasso_train_mid'] = r2_score(y_train_m, lasso_m.predict(X_train_m))
lasso_r2['lasso_train_high'] = r2_score(y_train_h, lasso_h.predict(X_train_h))

lasso_r2['lasso_test_low'] = r2_score(y_test_l, lasso_l.predict(X_test_l))
lasso_r2['lasso_test_mid'] = r2_score(y_test_m, lasso_m.predict(X_test_m))
lasso_r2['lasso_test_high'] = r2_score(y_test_h, lasso_h.predict(X_test_h))

lasso_r2

{'lasso_train_low': 0.7845480248279124,
 'lasso_train_mid': 0.9076036158144396,
 'lasso_train_high': 0.7283473566226484,
 'lasso_test_low': 0.8071803275212428,
 'lasso_test_mid': 0.8616025451347769,
 'lasso_test_high': 0.7004690433915217}

In [33]:
lasso_mae = {}

lasso_mae['lasso_mae_low'] = mean_absolute_error(y_test_l, lasso_l.predict(X_test_l))
lasso_mae['lasso_mae_mid'] = mean_absolute_error(y_test_m, lasso_m.predict(X_test_m))
lasso_mae['lasso_mae_high'] = mean_absolute_error(y_test_h, lasso_h.predict(X_test_h))


lasso_mae

{'lasso_mae_low': 25903.108649689762,
 'lasso_mae_mid': 40240.18439000245,
 'lasso_mae_high': 81569.48058428025}

In [34]:
rid_l = Ridge(
    alpha=1.0,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto',
    random_state=None,
).fit(X_train_l, y_train_l)

rid_m = Ridge(
    alpha=1.0,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto',
    random_state=None,
).fit(X_train_m, y_train_m)

rid_h = Ridge(
    alpha=1.0,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto',
    random_state=None,
).fit(X_train_h, y_train_h)

In [35]:
rid_r2 = {}

rid_r2['rid_train_low'] = r2_score(y_train_l, rid_l.predict(X_train_l))
rid_r2['rid_train_mid'] = r2_score(y_train_m, rid_m.predict(X_train_m))
rid_r2['rid_train_high'] = r2_score(y_train_h, rid_h.predict(X_train_h))

rid_r2['rid_test_low'] = r2_score(y_test_l, rid_l.predict(X_test_l))
rid_r2['rid_test_mid'] = r2_score(y_test_m, rid_m.predict(X_test_m))
rid_r2['rid_test_high'] = r2_score(y_test_h, rid_h.predict(X_test_h))

rid_r2

{'rid_train_low': 0.7808188551954821,
 'rid_train_mid': 0.9055766550138027,
 'rid_train_high': 0.7257673580168358,
 'rid_test_low': 0.8081337894930463,
 'rid_test_mid': 0.8765884312388006,
 'rid_test_high': 0.7059848535852241}

In [36]:
rid_mae = {}

rid_mae['rid_mae_low'] = mean_absolute_error(y_test_l, rid_l.predict(X_test_l))
rid_mae['rid_mae_mid'] = mean_absolute_error(y_test_m, rid_m.predict(X_test_m))
rid_mae['rid_mae_high'] = mean_absolute_error(y_test_h, rid_h.predict(X_test_h))


rid_mae

{'rid_mae_low': 26039.876544393246,
 'rid_mae_mid': 38764.52212009642,
 'rid_mae_high': 81097.05284647923}

#### Modellauswertung

In [38]:
# modellname, trainscore, testscore, mae
df_fazit = pd.DataFrame(data={
    'Modell' : ['LinearRegression Low', 'LinearRegression Mid', 'LinearRegression High',
                'GradientBoostRegressor Low', 'GradientBoostRegressor Mid', 'GradientBoostRegressor High',  
                'Lasso Low', 'Lasso Mid', 'Lasso High',
                'Ridge Low', 'Ridge Mid', 'Ridge High'],
    'Train Score' : [reg_r2['reg_train_low'], reg_r2['reg_train_mid'], reg_r2['reg_train_high'],
                     gbr_r2['gbr_train_low'], gbr_r2['gbr_train_mid'], gbr_r2['gbr_train_high'],
                     lasso_r2['lasso_train_low'], lasso_r2['lasso_train_mid'], lasso_r2['lasso_train_high'],
                     rid_r2['rid_train_low'], rid_r2['rid_train_mid'], rid_r2['rid_train_high']
                    ],
    'Test Score' : [ reg_r2['reg_test_low'], reg_r2['reg_test_mid'], reg_r2['reg_test_high'],
                     gbr_r2['gbr_test_low'], gbr_r2['gbr_test_mid'], gbr_r2['gbr_test_high'],
                     lasso_r2['lasso_test_low'], lasso_r2['lasso_test_mid'], lasso_r2['lasso_test_high'],
                     rid_r2['rid_test_low'], rid_r2['rid_test_mid'], rid_r2['rid_test_high']
                   ],
    'Mean Absolut Error' : [reg_mae['reg_mae_low'], reg_mae['reg_mae_mid'], reg_mae['reg_mae_high'],
                            gbr_mae['gbr_mae_low'], gbr_mae['gbr_mae_mid'], gbr_mae['gbr_mae_high'],
                            lasso_mae['lasso_mae_low'], lasso_mae['lasso_mae_mid'], lasso_mae['lasso_mae_high'],
                            rid_mae['rid_mae_low'], rid_mae['rid_mae_mid'], rid_mae['rid_mae_high']]
})
df_fazit

Unnamed: 0,Modell,Train Score,Test Score,Mean Absolut Error
0,LinearRegression Low,-5.64613e+20,-5.736e+20,711974500000000.0
1,LinearRegression Mid,-6.517158e+19,-5.180149e+19,235428000000000.0
2,LinearRegression High,0.7283477,0.700348,81577.51
3,GradientBoostRegressor Low,0.7257421,0.9879542,6338.271
4,GradientBoostRegressor Mid,0.8506101,0.9935249,8205.643
5,GradientBoostRegressor High,0.6306505,0.9909989,13580.96
6,Lasso Low,0.784548,0.8071803,25903.11
7,Lasso Mid,0.9076036,0.8616025,40240.18
8,Lasso High,0.7283474,0.700469,81569.48
9,Ridge Low,0.7808189,0.8081338,26039.88


#### Modelle speichern

In [39]:
shortDate = datetime.today().strftime('%Y-%m-%d')

joblib.dump(gbr, 'gbr_model' + shortDate)
joblib.dump(gbr_l, 'gbr_model_low_' + shortDate)
joblib.dump(gbr_m, 'gbr_model_med_' + shortDate)
joblib.dump(gbr_h, 'gbr_model_high_' + shortDate)

joblib.dump(lasso_l, 'lasso_model_low_' + shortDate)
joblib.dump(lasso_m, 'lasso_model_med_' + shortDate)
joblib.dump(lasso_h, 'lasso_model_high_' + shortDate)

joblib.dump(rid_l, 'rid_model_low_' + shortDate)
joblib.dump(rid_m, 'rid_model_med_' + shortDate)
joblib.dump(rid_h, 'rid_model_high_' + shortDate)

['rid_model_high_2022-07-15']