# Elisa real estate, finding a model

Source: https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv

Libraries :
OneHotencoding
XGboost
Lasso
Ridge
Random forest
sklearn.preprocessing.LabelEncoder

Expected result: compare the prediction performance of the models, create best simulation and prediction

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
df = pd.read_csv('./data/may.csv', index_col=0)
df.head(4)

Unnamed: 0,Url,Source,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Area,Fully equipped kitchen,...,Terrace Area,Garden,Garden Area,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building,Province,Region
0,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,319799.0,regular sale,1.0,,0.0,...,11.0,0.0,,,,,0.0,,Antwerp,Vlaams
1,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,291999.0,regular sale,1.0,,0.0,...,6.0,0.0,,,,,0.0,,Antwerp,Vlaams
2,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,764999.0,regular sale,2.0,153.0,0.0,...,62.0,0.0,,,,,0.0,,Antwerp,Vlaams
3,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,660264.0,regular sale,3.0,,0.0,...,160.0,0.0,,,,,0.0,,Antwerp,Vlaams


## Data cleaning

 - Existence of duplicated columns ?
 - Get rid of title column ? not needed -> np.array
 - Remove URL column which is there for debugging purpose
 - Remove columns with missing values with threshold more than 80% Nan or equivalent
 - Check if the type of data is homogenous per column if not clean, check unique
 - Check abnomalies and outliers
 

In [2]:
# get rid of unuseful data for ML
df.drop(['Url', 'Source', 'Locality','Province', 'Region'], axis=1, inplace = True)

In [3]:
# df.duplicated().any() 
df.drop_duplicates(keep = 'first', inplace=True)

In [4]:
# get rid of columns with less than 20% data
df.dropna(thresh=len(df)*0.8, axis='columns', inplace = True)

In [5]:
# get rid of outliers
# df.sort_values(by='Price', ascending = False).head(4)
df = df[(df.Price >= 50000) & (df.Price <= 10000000)] 

In [6]:
# data engineering - is null 
df.isnull().any()

Type of property           True
Price                     False
Type of sale              False
Number of rooms            True
Area                       True
Fully equipped kitchen    False
Open fire                 False
Terrace                   False
Garden                    False
Swimming pool             False
dtype: bool

In [7]:
# examine data of remaining columns 
for columns in df: 
    if(len(df[columns].unique()) < 200): 
        print(df[columns].unique())
df.columns

['apartment' nan 'house']
['regular sale' 'public sale']
[  1.   2.   3.  18.   4.  nan   5.   6.  10.   7. 125.  14.   9.  32.
  12.   8.  16.  19.  35.  11.  15.  13.  50.  20.  25.  24.  21.  30.
   0.  47.  66.  22.  36.  40.  41.  34.  17.  28.  46.  63. 165.  27.
  99.  70.  37.  80.  39.  38.  90.  33.  23.]
[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]


Index(['Type of property', 'Price', 'Type of sale', 'Number of rooms', 'Area',
       'Fully equipped kitchen', 'Open fire', 'Terrace', 'Garden',
       'Swimming pool'],
      dtype='object')

In [8]:
# Get rid of nan, delete rows
df = df.dropna()
df.isnull().any()

Type of property          False
Price                     False
Type of sale              False
Number of rooms           False
Area                      False
Fully equipped kitchen    False
Open fire                 False
Terrace                   False
Garden                    False
Swimming pool             False
dtype: bool

In [9]:
# Count remaining rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59306 entries, 2 to 73502
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Type of property        59306 non-null  object 
 1   Price                   59306 non-null  float64
 2   Type of sale            59306 non-null  object 
 3   Number of rooms         59306 non-null  float64
 4   Area                    59306 non-null  float64
 5   Fully equipped kitchen  59306 non-null  float64
 6   Open fire               59306 non-null  float64
 7   Terrace                 59306 non-null  float64
 8   Garden                  59306 non-null  float64
 9   Swimming pool           59306 non-null  float64
dtypes: float64(8), object(2)
memory usage: 5.0+ MB


In [10]:
# get rid of unuseful data for ML
df.drop(['Type of property', 'Type of sale'], axis=1, inplace = True)

In [11]:
df

Unnamed: 0,Price,Number of rooms,Area,Fully equipped kitchen,Open fire,Terrace,Garden,Swimming pool
2,764999.0,2.0,153.0,0.0,0.0,1.0,0.0,0.0
4,294999.0,2.0,80.0,0.0,0.0,0.0,0.0,0.0
6,233999.0,2.0,90.0,0.0,0.0,0.0,0.0,0.0
7,329899.0,1.0,87.0,0.0,0.0,1.0,0.0,0.0
9,359899.0,1.0,95.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
73496,334900.0,3.0,165.0,0.0,0.0,0.0,0.0,0.0
73497,340500.0,3.0,167.0,0.0,0.0,0.0,0.0,0.0
73500,307242.0,3.0,150.0,0.0,0.0,0.0,0.0,0.0
73501,315000.0,3.0,150.0,0.0,0.0,1.0,0.0,0.0


## Create train, test
WARNING 
- never take outliers before the train set, they are part of the tests values
- only clean wrong data, not possible data, error data, etc..
- clean outliers in the train data and monitor over/under fitting aftwards

In [12]:
# create the matrix of x and the y target
X = df.iloc[:,1:] 
y = df.iloc[:,:1] 

print(X.shape) 
print(y.shape)


(59306, 7)
(59306, 1)


In [15]:
# split into train and test data
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40, stratify = y) # 0,1 class can not be stratified
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
len(X_train)

41514

In [17]:
# measure the accuracy of the train test
# .score(X_train, y_train)

## Application of ML

- multiple regression
- polynomial
- random forest
- xgboost
- ridge
LATER
- libraries cleaning and engineering by themselves


### multiple linear regression
$$
\begin{bmatrix}
y^{(1)}\\
y^{(2)}\\
y^{(3)}\\
... \\
y^{(m)}\\
\end{bmatrix}
=
\begin{bmatrix}
x^{(1)}_1, x^{(1)}_2, 1\\
x^{(2)}_1, x^{(2)}_2, 1\\
x^{(3)}_1, x^{(3)}_2, 1\\
x^{(m)}_1,x^{(m)}_2,  1\\
\end{bmatrix}
.
\begin{bmatrix}
a\\
b\\
c\\
\end{bmatrix}
$$

In [None]:
# linear regression
# with sklearn, no need to add 1 to the matrix
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [None]:
# performance
regressor.score(X_train, y_train)

In [None]:
# visualizing results


In [None]:
# play with number of iterations


In [None]:
# calculate overfitting and underfitting


### polynomial
$$ X =
\begin{bmatrix}
x^{(1)}_1, x^{(1)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(2)}_1, x^{(2)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(3)}_1, x^{(3)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(m)}_1,x^{(m)k}_2, ..., x^{(m)k}_{n}, 1\\
\end{bmatrix}
$$
is it applicable here ?

In [None]:
# multiple linear regression using SciKitLearn


## XGboost

INSTALL package via conda install xgboost

TODO to be used only with 0.5+ score ?


In [21]:
from xgboost import XGBRegressor
my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(X_train, y_train, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# make predictions
predictions = my_model.predict(X_test)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 136926.98771656503


In [27]:
# finetuning the model
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(X_train, y_train, early_stopping_rounds=50, 
             eval_set=[(X_test, y_test)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
# make predictions
predictions = my_model.predict(X_test)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 136338.05219779254


In [30]:
# mesure performance
my_model.score(X_test, y_test)

0.507525855860931