# Germany Rental Prediction - Cleaning, EDA and Prediction

## Purpose of this notebook

Create the rental-price prediction model and deploy it to Heroku.

# Basic data handling and inspection

Import all important libraries in this kernel

In [1]:
import datetime
import json
import pickle
import time
import warnings
from datetime import date

import lightgbm as lgb
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)

Load the dataset into the kernel

In [18]:
# Read the prepared listings from 'berlin_predict.csv' in the working directory.
predict_df = pd.read_csv('berlin_predict.csv')
# Preview the first five rows as a quick sanity check of the load.
predict_df.head()

Unnamed: 0,heatingType,totalRent,livingSpace,condition,typeOfFlat,noRooms,regio2,additioncost
0,central_heating,840.0,86.0,well_kept,ground_floor,4.0,Dortmund,245.0
1,floor_heating,1300.0,83.8,first_time_use,apartment,3.0,Dresden,335.0
2,self_contained_central_heating,903.0,84.97,refurbished,apartment,3.0,Other,138.0
3,self_contained_central_heating,380.0,62.0,fully_renovated,apartment,2.0,Mittelsachsen_Kreis,70.0
4,oil_heating,690.0,53.0,well_kept,roof_storey,2.0,Other,110.0


In [15]:
# Preview the loaded listings. `predict_df` is the only frame this notebook
# defines, so the preview must use it to survive Restart & Run All.
predict_df.head()

Unnamed: 0,heatingType,totalRent,livingSpace,condition,typeOfFlat,noRooms,regio2,additioncost
0,central_heating,840.0,86.0,well_kept,ground_floor,4.0,Dortmund,245.0
1,floor_heating,1300.0,83.8,first_time_use,apartment,3.0,Dresden,335.0
2,self_contained_central_heating,903.0,84.97,refurbished,apartment,3.0,Other,138.0
3,self_contained_central_heating,380.0,62.0,fully_renovated,apartment,2.0,Mittelsachsen_Kreis,70.0
4,oil_heating,690.0,53.0,well_kept,roof_storey,2.0,Other,110.0


# Machine Learning

The code below creates dummy (one-hot) columns for every categorical column, i.e. every column whose dtype is object or bool (True/False).

In [19]:
# Names of all categorical columns: those stored as object or bool.
columns = [
    name
    for name in predict_df.columns
    if predict_df[name].dtype in ('object', 'bool')
]

# One-hot encode them. With an empty prefix and separator each dummy column is
# named after the bare category label.
# NOTE(review): a label that occurs in more than one source column therefore
# yields identically named dummy columns -- the preview lists 'Other' twice.
dummies_feature = pd.get_dummies(predict_df[columns], prefix='', prefix_sep='')
dummies_feature.head()

Unnamed: 0,central_heating,combined_heat_and_power_plant,district_heating,floor_heating,gas_heating,heat_pump,night_storage_heater,oil_heating,self_contained_central_heating,wood_pellet_heating,Other,first_time_use,first_time_use_after_refurbishment,fully_renovated,mint_condition,modernized,refurbished,well_kept,apartment,ground_floor,half_basement,loft,maisonette,other,penthouse,raised_ground_floor,roof_storey,terraced_flat,Berlin,Chemnitz,Dortmund,Dresden,Duisburg,Düsseldorf,Essen,Frankfurt_am_Main,Gelsenkirchen,Halle_Saale,Hamburg,Köln,Leipzig,Leipzig_Kreis,Magdeburg,Mittelsachsen_Kreis,München,Other.1,Recklinghausen_Kreis,Zwickau,Zwickau_Kreis
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


Drop the categorical columns that were just encoded and combine the remaining columns with their dummy columns.

In [20]:
# Swap the original categorical columns for their one-hot encoded counterparts:
# the remaining (numeric) columns stay on the left, the dummies go on the right.
predict_df = pd.concat(
    [predict_df.drop(columns=columns), dummies_feature],
    axis=1,
)
predict_df.head()

Unnamed: 0,totalRent,livingSpace,noRooms,additioncost,central_heating,combined_heat_and_power_plant,district_heating,floor_heating,gas_heating,heat_pump,night_storage_heater,oil_heating,self_contained_central_heating,wood_pellet_heating,Other,first_time_use,first_time_use_after_refurbishment,fully_renovated,mint_condition,modernized,refurbished,well_kept,apartment,ground_floor,half_basement,loft,maisonette,other,penthouse,raised_ground_floor,roof_storey,terraced_flat,Berlin,Chemnitz,Dortmund,Dresden,Duisburg,Düsseldorf,Essen,Frankfurt_am_Main,Gelsenkirchen,Halle_Saale,Hamburg,Köln,Leipzig,Leipzig_Kreis,Magdeburg,Mittelsachsen_Kreis,München,Other.1,Recklinghausen_Kreis,Zwickau,Zwickau_Kreis
0,840.0,86.0,4.0,245.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1300.0,83.8,3.0,335.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,903.0,84.97,3.0,138.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,380.0,62.0,2.0,70.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,690.0,53.0,2.0,110.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [21]:
# Multiply every cell by 1: numeric values are unchanged, while any True/False
# cells become 1/0.
predict_df = predict_df.mul(1)

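I want to normalize the columns of int or float type (all except the target `totalRent`). The cell below is commented out, so this step is currently not applied.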

In [64]:
# for cols in predict_df.columns:
#     if predict_df[cols].dtype == 'int64' or predict_df[cols].dtype == 'float64':
#         if cols != 'totalRent':
#             predict_df[cols] = ((predict_df[cols] - predict_df[cols].mean())/(predict_df[cols].std()))

# predict_df            

Move the 'totalRent' column to the first position to make it easier to split the data.

In [22]:
# pop() removes the target column from the frame and hands it back, so it can
# be re-inserted at position 0 right away.
move = predict_df.pop('totalRent')
predict_df.insert(0, 'totalRent', move)
predict_df.head()

Unnamed: 0,totalRent,livingSpace,noRooms,additioncost,central_heating,combined_heat_and_power_plant,district_heating,floor_heating,gas_heating,heat_pump,night_storage_heater,oil_heating,self_contained_central_heating,wood_pellet_heating,Other,first_time_use,first_time_use_after_refurbishment,fully_renovated,mint_condition,modernized,refurbished,well_kept,apartment,ground_floor,half_basement,loft,maisonette,other,penthouse,raised_ground_floor,roof_storey,terraced_flat,Berlin,Chemnitz,Dortmund,Dresden,Duisburg,Düsseldorf,Essen,Frankfurt_am_Main,Gelsenkirchen,Halle_Saale,Hamburg,Köln,Leipzig,Leipzig_Kreis,Magdeburg,Mittelsachsen_Kreis,München,Other.1,Recklinghausen_Kreis,Zwickau,Zwickau_Kreis
0,840.0,86.0,4.0,245.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1300.0,83.8,3.0,335.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,903.0,84.97,3.0,138.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,380.0,62.0,2.0,70.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,690.0,53.0,2.0,110.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


### Splitting the data into train and test

In [23]:
# Column 0 holds the target (totalRent); every other column is a feature.
y = predict_df.iloc[:, 0]
X = predict_df.iloc[:, 1:]

# The split is done on plain NumPy arrays, so the arrays carry no column labels.
X_val, y_val = X.to_numpy(), y.to_numpy()

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.25,
    random_state=123,
)

## Light Gradient Boost
I've found that this model is similar to XGBoost and runs much faster, so I'm using this library as much as I can to improve my skills.

In [24]:
# Wrap the training split in LightGBM's Dataset container.
d_train = lgb.Dataset(x_train, label=y_train)

# Parameters for this model. The number of boosting rounds is deliberately NOT
# listed here: an iteration-count alias such as `n_estimators` inside `params`
# overrides the `num_boost_round` argument of `lgb.train`, which makes the
# value that is actually used easy to misread.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': -1,          # no depth limit
    'learning_rate': 0.01,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.4,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'seed': 46,
}

# 10000 rounds is the count that was effectively in force before (the
# `n_estimators` entry took precedence over the positional 100); it is now
# stated exactly once.
clf = lgb.train(params, d_train, num_boost_round=10000)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 119250, number of used features: 52
[LightGBM] [Info] Start training from score 713.479124


Check whether the predictions look as expected.

In [25]:
# Predict the target for the held-out test features.
y_pred = clf.predict(x_test)
# Display the prediction array for a quick visual check.
y_pred

array([575.12818746, 557.5903899 , 604.37419539, ..., 408.82477424,
       592.72104894, 571.78245478])

Create the RMSE function to check the score.

In [26]:
def compute_rmse(model, X, y_true, name):
    """Print the root mean squared error of ``model`` on one data split.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : features passed to ``model.predict``
    y_true : true target values matching the rows of ``X``
    name : label for the split, used only in the printed message

    Returns
    -------
    None -- the score is printed, not returned.
    """
    predictions = model.predict(X)
    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(y_true, predictions) ** 0.5
    print(f'Root Mean Squared Error for {name}: {rmse}')

In [27]:
# Report the error on both splits; comparing the two numbers shows how well the
# model generalises beyond the training data.
for features, target, label in (
    (x_train, y_train, 'Training Set'),
    (x_test, y_test, 'Test Set'),
):
    compute_rmse(clf, features, target, label)

Root Mean Squared Error for Training Set: 130.61192426059708
Root Mean Squared Error for Test Set: 139.25853425522854


The LightGBM model works pretty well, so we could use it in a real-life situation to help others estimate how much they should pay for the apartment they are looking for.

In [28]:
# Show the first rows of the encoded frame again for reference.
predict_df.head()

Unnamed: 0,totalRent,livingSpace,noRooms,additioncost,central_heating,combined_heat_and_power_plant,district_heating,floor_heating,gas_heating,heat_pump,night_storage_heater,oil_heating,self_contained_central_heating,wood_pellet_heating,Other,first_time_use,first_time_use_after_refurbishment,fully_renovated,mint_condition,modernized,refurbished,well_kept,apartment,ground_floor,half_basement,loft,maisonette,other,penthouse,raised_ground_floor,roof_storey,terraced_flat,Berlin,Chemnitz,Dortmund,Dresden,Duisburg,Düsseldorf,Essen,Frankfurt_am_Main,Gelsenkirchen,Halle_Saale,Hamburg,Köln,Leipzig,Leipzig_Kreis,Magdeburg,Mittelsachsen_Kreis,München,Other.1,Recklinghausen_Kreis,Zwickau,Zwickau_Kreis
0,840.0,86.0,4.0,245.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1300.0,83.8,3.0,335.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,903.0,84.97,3.0,138.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,380.0,62.0,2.0,70.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,690.0,53.0,2.0,110.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


### Create the predict function.
Below, I use the model we created to calculate the rental price from the input variables.

In [45]:
def predict_price(livingSpace,noRooms,additionCost,heating_type,condition,typeOfFlat,regio2):
    """Predict the total rent of one flat with the trained model ``clf``.

    A single feature row is built in the column order of the training frame
    ``X``: the three numeric inputs fill positions 0-2 and the one-hot column
    of each categorical input is set to 1.

    Parameters
    ----------
    livingSpace, noRooms, additionCost : numbers
        Written to feature positions 0, 1 and 2 respectively.
    heating_type, condition, typeOfFlat, regio2 : str
        Category labels; each must be the name of a column of ``X``.

    Returns
    -------
    The first (only) element of ``clf.predict`` for the constructed row.

    Raises
    ------
    IndexError
        If a category label is not a column of ``X``.
    """
    feature_names = np.asarray(X.columns)

    def _flag_position(label, field, take_last=False):
        """Return the column position of ``label`` or raise a readable error."""
        hits = np.flatnonzero(feature_names == label)
        if hits.size == 0:
            raise IndexError(
                f'unknown {field} {label!r}: no such column in the training features'
            )
        return hits[-1] if take_last else hits[0]

    heatingIndex = _flag_position(heating_type, 'heating_type')
    conIndex = _flag_position(condition, 'condition')
    flatTypeIndex = _flag_position(typeOfFlat, 'typeOfFlat')
    # The dummy columns carry no prefix and the region block comes last in X,
    # so a label shared with an earlier block (e.g. 'Other', which is both a
    # condition and a region) has to resolve to its LAST occurrence here.
    # Taking the first match would set the condition flag instead.
    regionIndex = _flag_position(regio2, 'regio2', take_last=True)

    x = np.zeros(len(feature_names))
    # Positions 0-2 are the numeric features, in the column order of X.
    x[0] = livingSpace
    x[1] = noRooms
    x[2] = additionCost

    for position in (heatingIndex, conIndex, flatTypeIndex, regionIndex):
        x[position] = 1

    return clf.predict([x])[0]
 

In [49]:
# Smoke test: predict the rent for one hand-written example flat
# (livingSpace=55, noRooms=4, additionCost=200, located in Berlin).
sampleprice = predict_price(55,4,200,'central_heating','mint_condition','apartment','Berlin')
print(f'Example price :{sampleprice}')

Example price :984.5185251539239


Let's try some rows to make sure our model is working properly.

In [38]:
# Display the first rows so that their feature values can be used as inputs
# for a manual prediction check.
predict_df.head()

Unnamed: 0,totalRent,livingSpace,noRooms,additioncost,central_heating,combined_heat_and_power_plant,district_heating,floor_heating,gas_heating,heat_pump,night_storage_heater,oil_heating,self_contained_central_heating,wood_pellet_heating,Other,first_time_use,first_time_use_after_refurbishment,fully_renovated,mint_condition,modernized,refurbished,well_kept,apartment,ground_floor,half_basement,loft,maisonette,other,penthouse,raised_ground_floor,roof_storey,terraced_flat,Berlin,Chemnitz,Dortmund,Dresden,Duisburg,Düsseldorf,Essen,Frankfurt_am_Main,Gelsenkirchen,Halle_Saale,Hamburg,Köln,Leipzig,Leipzig_Kreis,Magdeburg,Mittelsachsen_Kreis,München,Other.1,Recklinghausen_Kreis,Zwickau,Zwickau_Kreis
0,840.0,86.0,4.0,245.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1300.0,83.8,3.0,335.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,903.0,84.97,3.0,138.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,380.0,62.0,2.0,70.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,690.0,53.0,2.0,110.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [51]:
# Re-predict the first row of predict_df (whose actual totalRent is 840) and
# print the prediction next to the actual value.
result1 = predict_price(86,4,245,'central_heating','well_kept','ground_floor','Dortmund')
print(f'Price for the example price is 840 but our model predicted at: {result1}')

Price for the example price is 840 but our model predicted at: 800.8695516478672


## Saving the model for further use.

In [169]:
# Serialise the trained model so it can be loaded again outside this notebook.
# (`pickle` is imported in the import cell at the top of the notebook.)
# NOTE: unpickling executes arbitrary code -- only load this file from a
# trusted source.
with open('german_home_prices_model.pickle', 'wb') as f:
    pickle.dump(clf, f)

In [170]:
# Export the feature names, lower-cased, in the column order of X.
# (`json` is imported in the import cell at the top of the notebook.)
# NOTE(review): the names are not unique -- X has more than one 'Other' column
# plus an 'other' column, and lower-casing makes them identical. Consumers
# should rely on the position in this list rather than on a name lookup.
columns = {
    'data_columns': [col.lower() for col in X.columns]
}
with open("columns.json", "w") as f:
    json.dump(columns, f)

# Summary

This is the end of the kernel. If you liked it or learned something from it, please upvote! It means a lot for my future opportunities. Moreover, feel free to comment on my mistakes, because that will surely help me improve.

Thanks for viewing!