Rental model
============

Will experiment with creating a rental model here

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

In [2]:
# get 
!wget https://raw.githubusercontent.com/magnuspaal/germany-rental-ml/master/data/rental/rental_location_data_nan.csv -O rental_location_data_nan.csv

--2020-12-14 15:36:34--  https://raw.githubusercontent.com/magnuspaal/germany-rental-ml/master/data/rental/rental_location_data_nan.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46950277 (45M) [text/plain]
Saving to: ‘rental_location_data_nan.csv’


2020-12-14 15:36:35 (68.3 MB/s) - ‘rental_location_data_nan.csv’ saved [46950277/46950277]



In [3]:
rental_location_data = pd.read_csv("rental_location_data_nan.csv")

#drop index column
rental_location_data = rental_location_data.drop(rental_location_data.columns[0], axis=1)
assert rental_location_data.shape[1] == 44

In [4]:
from sklearn.impute import SimpleImputer

nominal = ['regio1', 'heatingType', 'telekomTvOffer', 'newlyConst', 'balcony', 'firingTypes', 'hasKitchen', 'cellar', 'condition', 
           'interiorQual', 'geo_plz', 'petsAllowed', 'lift', 'typeOfFlat', 'garden', 'regio2', 'regio3', 'energyEfficiencyClass', 'scoutId']

numeric = ['serviceCharge', 'picturecount', 'pricetrend', 'telekomUploadSpeed', 'totalRent', 'yearConstructed', 'noParkSpaces', 'yearConstructedRange', 'baseRent',
           'livingSpace', 'baseRentRange', 'noRooms', 'thermalChar', 'floor', 'numberOfFloors', 'noRoomsRange', 'livingSpaceRange', 'heatingCosts', 'lastRefurbish', 
           'electricityBasePrice', 'electricityKwhPrice']

columns = nominal + numeric

rental_location_data[['cellar', 'lift', 'hasKitchen', 'newlyConst', 'balcony', 'garden']] = rental_location_data[['cellar', 'lift', 'hasKitchen', 'newlyConst', 'balcony', 'garden']].astype('int32')

## Remove all rows with totalRent NAN
rental_location_data = rental_location_data[rental_location_data['totalRent'] < 2500]

## Use Simple Imputer to replace all missing nominal values with the most frequent value.
for col in nominal:
  imp_mfreq = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=rental_location_data[col].mode()[0])
  rental_location_data[[col]] = imp_mfreq.fit_transform(rental_location_data[[col]])
    
## Use Simple Imputer to replace all missing numeric values with the mean value.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
rental_location_data[numeric] = imp_mean.fit_transform(rental_location_data[numeric])

# Convert columns to better data types.
rental_location_data[['yearConstructed', 'noRooms', 'floor', 'lastRefurbish', 'noParkSpaces', 'numberOfFloors']] = rental_location_data[['yearConstructed', 'noRooms', 'floor', 'lastRefurbish', 'noParkSpaces', 'numberOfFloors']].astype('int32')

In [5]:
quantiles = rental_location_data['totalRent'].quantile([0.25, 0.50, 0.75])

# Distribute price classes based on quantiles
def price_class_quantile(row):
  total_rent = row['totalRent']
  for idx, quantile in enumerate(quantiles):
    # if rent is smaller or equal to quantile, it's in that quantile
    if (total_rent <= quantile):
      # 0,1,2... -> class of 1,2,3..
      return idx + 1
  
  #if we're here it's bigger than the biggest quantile. 
  # return largest quantile (which is last index + 1, so just the length) + 1 
  return len(quantiles) + 1

rental_location_data['priceClass'] = rental_location_data.apply(lambda row: price_class_quantile(row), axis=1)

In [179]:
X = rental_location_data[[
                          'heatingType', 'firingTypes', 'energyEfficiencyClass', ## Energy and heating
                          'hasKitchen', 'cellar', 'garden', 'balcony', 'lift', 'petsAllowed', ## Quality of life
                          'livingSpace', 'condition', 'interiorQual', 'noRooms', 'typeOfFlat', ## characteristics
                          'telekomUploadSpeed', ## Internet
                          'yearConstructed', 'newlyConst', 'lastRefurbish',  ## Construction
 #                         'city', 
                          'lat',
                          'lon',
                          'zip',
                          'regio1',
                          'regio2',## Location
                          ]]

X = pd.get_dummies(X)
y = rental_location_data[['totalRent']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [180]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

standard = StandardScaler()
standard_y = StandardScaler()

standard.fit(X_train)
standard_y.fit(y_train)

def get_transformed_x(x, scaler=standard):
  if (scaler == None):
    return x
  return scaler.transform(x)

def get_transformed_y(y, scaler=standard_y):
  if (scaler == None):
    return y
  return scaler.transform(y)

def get_scores(y_t, y_p, model=None, X=None):
  print("Squared MSE:", np.sqrt(mean_squared_error(y_t, y_p)))
  print("y mean:", y.mean()[0])
  print("y standard deviation:", y.std()[0])
  if(model != None): 
    print("model score:", model.score(get_transformed_x(X), get_transformed_y(y_t)))
  print("mean absolute error:", mean_absolute_error(y_t, y_p))
  print("r2 score:", r2_score(y_t, y_p))

# Regression (totalRent, baseRent?)

## Random Forest Regressor

In [181]:
%%time

from sklearn.ensemble import RandomForestRegressor

model_randomforest = RandomForestRegressor(random_state=0)
model_randomforest.fit(get_transformed_x(X_train), get_transformed_y(y_train))

  """


CPU times: user 6min 52s, sys: 132 ms, total: 6min 53s
Wall time: 6min 54s


In [182]:
y_pred = standard_y.inverse_transform(model_randomforest.predict(X_test))
get_scores(y_test, y_pred, model_randomforest, X_test)

Squared MSE: 919.3444128090356
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.8956955531803882
mean absolute error: 848.0657158400313
r2 score: -3.4739015034301657


## GradientBoostingRegressor

In [183]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

model_gbr = GradientBoostingRegressor(learning_rate=0.001, n_estimators=200)
model_gbr.fit(get_transformed_x(X_train), get_transformed_y(y_train))

  y = column_or_1d(y, warn=True)


CPU times: user 3min 32s, sys: 56.4 ms, total: 3min 32s
Wall time: 3min 32s


In [184]:
y_pred = standard_y.inverse_transform(model_gbr.predict(get_transformed_x(X_test)))
get_scores(y_test, y_pred, model_gbr, X_test)

Squared MSE: 389.8728726190394
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.19540751401148138
mean absolute error: 300.6146235366138
r2 score: 0.19540751401148126


## SGDRegressor

In [185]:
%%time
from sklearn.linear_model import SGDRegressor

model_sgd = SGDRegressor()
model_sgd.fit(get_transformed_x(X_train), get_transformed_y(y_train))

  y = column_or_1d(y, warn=True)


CPU times: user 1.81 s, sys: 1.86 ms, total: 1.81 s
Wall time: 1.82 s


In [186]:
y_pred = standard_y.inverse_transform(model_sgd.predict(get_transformed_x(X_test)))
get_scores(y_test, y_pred, model_sgd, X_test)

Squared MSE: 184717503835015.97
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: -1.8061176582817513e+23
mean absolute error: 13106596169003.918
r2 score: -1.806117658281752e+23


## XGBoost

### XGBRFRegressor

In [187]:
import xgboost as xgb

In [188]:
%%time

model_xgb = xgb.XGBRFRegressor(random_state=42, learning_rate=0.25, max_depth=4, min_child_weight=5, gamma=0.0, n_estimators=500)
model_xgb.fit(get_transformed_x(X_train), get_transformed_y(y_train))

CPU times: user 9min 24s, sys: 130 ms, total: 9min 24s
Wall time: 9min 26s


In [189]:
y_pred = standard_y.inverse_transform(model_xgb.predict(get_transformed_x(X_test)))
get_scores(y_test, y_pred, model_xgb, X_test)

Squared MSE: 402.7613857111737
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.1413313580528296
mean absolute error: 345.6610268432887
r2 score: 0.14133138537983303


In [190]:
y_pred_train = model_xgb.predict(get_transformed_x(X_train))
get_scores(y_train, standard_y.inverse_transform(y_pred_train))

Squared MSE: 404.3479668381874
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
mean absolute error: 346.7278647978438
r2 score: 0.1439116356406851


### XGBRegressor

In [191]:
model_xgbr = xgb.XGBRegressor(learning_rate=0.3, max_depth=4, min_child_weight=5, gamma=0.0, n_estimators=500)
model_xgbr.fit(get_transformed_x(X_train), get_transformed_y(y_train))



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.0,
             importance_type='gain', learning_rate=0.3, max_delta_step=0,
             max_depth=4, min_child_weight=5, missing=None, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [192]:
y_pred_train = model_xgbr.predict(get_transformed_x(X_train))
get_scores(y_train, standard_y.inverse_transform(y_pred_train), model_xgbr, X_train)

Squared MSE: 120.09972729653921
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.9244747604005437
mean absolute error: 79.9182538229951
r2 score: 0.9244747607958428


In [193]:
y_pred_val =  standard_y.inverse_transform(model_xgbr.predict(get_transformed_x(X_val)))
get_scores(y_val, y_pred_val, model_xgbr, X_val)

Squared MSE: 135.7082501531147
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.9050229929575538
mean absolute error: 88.85047181439403
r2 score: 0.9050229932431725


In [194]:
y_pred_test =  standard_y.inverse_transform(model_xgbr.predict(get_transformed_x(X_test)))
get_scores(y_test, y_pred_test, model_xgbr, X_test)

Squared MSE: 133.28147004767405
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
model score: 0.905969339538154
mean absolute error: 87.40204325990547
r2 score: 0.9059693399128315


In [195]:
pd.set_option('display.max_rows', 20)
df = pd.DataFrame({'index': X_test.index, 'totalRent': y_pred_test}, columns=['index', 'totalRent'])  
df.set_index('index', inplace=True)   
df.head(20)

Unnamed: 0_level_0,totalRent
index,Unnamed: 1_level_1
5310,1148.771851
146809,621.941772
105875,407.02243
113893,1173.446167
178533,1200.54895
33269,1379.738281
162728,600.859009
156965,275.312683
99771,562.184143
43127,551.829712


In [196]:
y_test.head(20)

Unnamed: 0,totalRent
5310,1037.0
146809,670.0
105875,393.0
113893,1368.28
178533,1015.0
33269,1550.0
162728,577.0
156965,280.0
99771,595.46
43127,547.0


## Neural networks

In [197]:
X_nn = rental_location_data[[
                          'hasKitchen', 
                          'balcony', 
                          'lift', 
                          'noRooms',
                          'yearConstructed', 
                          'newlyConst', 
                          'zip', 
                          'heatingType', 
                          'condition',
                          'interiorQual', 
                          'regio1',
                          'regio2'
                          ]]

X_nn = pd.get_dummies(X_nn)
##Predict rent
y_nn = rental_location_data[['totalRent']]

X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_nn, y_nn, test_size=0.3, random_state=42)

X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(X_train_nn, y_train_nn, test_size=0.25, random_state=42)

standard_nn = StandardScaler()
standard_nn_y = StandardScaler()
standard_nn.fit(X_train_nn)
standard_nn_y.fit(y_train_nn)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [198]:
from keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam, SGD

model_nn = Sequential()

model_nn.add(Dense(X_train_nn.shape[1], input_dim=X_train_nn.shape[1], kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(256, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(128, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(64, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(32, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(16, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(8, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(4, kernel_initializer='normal', activation='relu'))
model_nn.add(Dense(1, kernel_initializer='normal'))

model_nn.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001))

In [199]:
%%time

hist = model_nn.fit(get_transformed_x(X_train_nn, scaler=standard_nn), get_transformed_y(y_train_nn, scaler=standard_nn_y), validation_data=(get_transformed_x(X_val_nn, scaler=standard_nn), get_transformed_y(y_val_nn, scaler=standard_nn_y)), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 50.5 s, sys: 2.76 s, total: 53.3 s
Wall time: 33.3 s


In [200]:
y_pred_nn = standard_nn_y.inverse_transform(model_nn.predict(get_transformed_x(X_test_nn,scaler=standard_nn)))
get_scores(y_test_nn, y_pred_nn)

Squared MSE: 210.97317928807408
y mean: 801.7546597120286
y standard deviation: 436.6622295774987
mean absolute error: 145.87198413539895
r2 score: 0.7643950386283729


In [201]:
y_pred_nn

array([[1180.2953 ],
       [ 821.43774],
       [ 432.74887],
       ...,
       [1534.9017 ],
       [ 752.97534],
       [1649.9119 ]], dtype=float32)

In [202]:
y_test_nn

Unnamed: 0,totalRent
5310,1037.00
146809,670.00
105875,393.00
113893,1368.28
178533,1015.00
...,...
71898,848.00
95122,895.00
5424,1160.00
113793,750.00


# Classification (Price class)

In [278]:
##Predict price class
y_class = rental_location_data[['priceClass']]

X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.3, random_state=42)

standard_class = StandardScaler()

standard_class.fit(X_class_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

## Random Forest Classifier

In [240]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [268]:
param_grid = {
                 'n_estimators': [100],
                 'max_depth': [25, 30],
                 'bootstrap': [False],
                 'criterion': ['entropy']
             }

grid_clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=4, verbose=False)
grid_clf.fit(get_transformed_x(X_class_train[0:10000], scaler=standard_class), y_class_train['priceClass'][0:10000])

grid_clf.best_params_

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 30,
 'n_estimators': 100}

In [284]:
%%time
model_class_rfc = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=30, n_estimators=200)

model_class_rfc.fit(get_transformed_x(X_class_train, scaler=standard_class), y_class_train['priceClass'])

CPU times: user 3min 32s, sys: 1.93 s, total: 3min 34s
Wall time: 3min 34s


In [285]:
y_pred = model_class_rfc.predict(get_transformed_x(X_class_test))
accuracy_score(y_class_test, y_pred)

0.7767459292883022

In [286]:
y_pred

array([4, 2, 1, ..., 4, 3, 4])

## SGD

In [209]:
from sklearn.linear_model import SGDClassifier

model_class_sgd = SGDClassifier()

In [210]:
model_class_sgd.fit(get_transformed_x(X_class_train, scaler=standard_class), y_class_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [211]:
accuracy_score(y_class_test, model_class_sgd.predict(get_transformed_x(X_class_test, scaler=standard_class)))

0.6527714154832799

#XGBoost classifier

In [213]:
import xgboost as xgb

In [214]:
%%time

model_class_xgb = xgb.XGBClassifier(random_state=42)
model_class_xgb.fit(get_transformed_x(X_class_train, scaler=standard_class), y_class_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


CPU times: user 8min 14s, sys: 694 ms, total: 8min 15s
Wall time: 8min 16s


In [215]:
accuracy_score(y_class_test, model_class_xgb.predict(get_transformed_x(X_class_test, scaler=standard_class)))

0.7209636446924582

## Neural Network

In [216]:
##Predict price class
y_class = rental_location_data[['priceClass']]

X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X_nn, y_class, test_size=0.3, random_state=42)

X_class_train, X_class_val, y_class_train, y_class_val = train_test_split(X_class_train, y_class_train, test_size=0.25, random_state=42)

standard_class = StandardScaler()

standard_class.fit(X_class_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [217]:
y_class_train_v = pd.get_dummies(y_class_train.astype(object))
y_class_val_v = pd.get_dummies(y_class_val.astype(object))
y_class_test_v = pd.get_dummies(y_class_test.astype(object))

In [218]:
from keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam

model_nn_class = Sequential()

model_nn_class.add(Dense(X_class_train.shape[1], input_dim=X_class_train.shape[1], kernel_initializer='normal', activation='relu'))
model_nn_class.add(Dense(64, kernel_initializer='normal', activation='relu'))
model_nn_class.add(Dense(32, kernel_initializer='normal', activation='relu'))
model_nn_class.add(Dense(16, kernel_initializer='normal', activation='relu'))
model_nn_class.add(Dense(8, kernel_initializer='normal', activation='relu'))
model_nn_class.add(Dense(4, kernel_initializer='normal'))

model_nn_class.compile(loss='mean_squared_error', optimizer=Adam(lr=0.01))

In [219]:
%%time
hist = model_nn_class.fit(get_transformed_x(X_class_train, scaler=standard_class), y_class_train_v, validation_data=(get_transformed_x(X_class_val, scaler=standard_class), y_class_val_v), epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 45.4 s, sys: 2.64 s, total: 48 s
Wall time: 32.7 s


In [220]:
y_pred = model_nn_class.predict(get_transformed_x(X_class_val, scaler=standard_class))

y_classes = []

for pred in y_pred:
  y_classes.append(np.argmax(pred) + 1)

In [221]:
accuracy_score(y_class_val, y_classes)

0.6181688381224344