# Elisa real estate, finding a model

Source: https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv

Libraries :
OneHotencoding
XGboost
Lasso
Ridge
Random forest
sklearn.preprocessing.LabelEncoder

Expected result: compare the prediction performance of the models, create best simulation and prediction

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
df = pd.read_csv('./data/may.csv', index_col=0)
df.head()
df.shape

(73510, 23)

## Data cleaning

 - Existence of duplicated columns ?
 - Get rid of title column ? not needed -> np.array
 - Remove URL column which is there for debugging purpose
 - Remove columns with missing values with threshold more than 80% Nan or equivalent
 - Check if the type of data is homogenous per column if not clean, check unique
 - Check abnomalies and outliers
 
## Data engineering

- Digitalisation of the concrete variables (one, dummies,.. )
- Hyperparameters selection libraries
 

In [2]:
# get rid of unuseful data for ML
df.drop(['Url', 'Source', 'Locality','Province', 'Region'], axis=1, inplace = True)

In [3]:
# df.duplicated().any() 
df.drop_duplicates(keep = 'first', inplace=True)
df.shape

(71533, 18)

In [4]:
# get rid of columns with less than 20% data
df.dropna(thresh=len(df)*0.8, axis='columns', inplace = True)

In [5]:
# get rid of outliers
# df.sort_values(by='Price', ascending = False).head(4)
df = df[(df.Price >= 40000) & (df.Price <= 10000000)] 
df.shape

(70697, 10)

In [6]:
# examine data of remaining columns 
for columns in df: 
    if(len(df[columns].unique()) < 400): 
        print(f"\n---- {columns} ----\n")
        print(df[columns].unique())


---- Type of property ----

['apartment' nan 'house']

---- Type of sale ----

['regular sale' 'public sale']

---- Number of rooms ----

[  1.   2.   3.  18.   4.  nan   5.   6.  10.   7. 125.  14.   9.  32.
  12.   8.  16.  19.  35.  11.  15.  13.  50.  20.  25.  24.  21.  30.
   0.  47.  66.  22.  36.  40.  41.  34.  17.  28.  46.  63. 165.  27.
  99.  70.  37.  80.  39.  38.  90.  33.  23.]

---- Fully equipped kitchen ----

[0. 1.]

---- Open fire ----

[0. 1.]

---- Terrace ----

[1. 0.]

---- Garden ----

[0. 1.]

---- Swimming pool ----

[0. 1.]


In [7]:
# keep regular sales and drop public sales
# df.groupby([df['Type of sale'] == 'public sale']).count()
df = df[df['Type of sale'] == 'regular sale']
df.shape

(69708, 10)

In [8]:
# clean number of rooms 
# df.groupby(df['Number of rooms'] ).sum()
# df.groupby(df['Number of rooms'] > 9).count()
df =df[df['Number of rooms'] != 0] # more than 1250
df =df[df['Number of rooms'] < 9] # less than 300
df.shape

(67127, 10)

In [9]:
# clea n area
df =df[df['Area'] > 20]
df =df[df['Area'] < 1500]
df.sort_values(by = 'Area')
df.shape

(57929, 10)

In [10]:
# data engineering - is null 
df.isnull().any()


Type of property           True
Price                     False
Type of sale              False
Number of rooms           False
Area                      False
Fully equipped kitchen    False
Open fire                 False
Terrace                   False
Garden                    False
Swimming pool             False
dtype: bool

In [11]:
# Get rid of nan, delete rows
df = df.dropna()
df.isnull().any()
df.shape

(57669, 10)

In [12]:
df.groupby(df['Number of rooms'] < 1).count()
df.shape

(57669, 10)

In [13]:
# get rid of unuseful data for ML
df.drop(['Type of property', 'Type of sale'], axis=1, inplace = True)

In [14]:
df

Unnamed: 0,Price,Number of rooms,Area,Fully equipped kitchen,Open fire,Terrace,Garden,Swimming pool
2,764999.0,2.0,153.0,0.0,0.0,1.0,0.0,0.0
4,294999.0,2.0,80.0,0.0,0.0,0.0,0.0,0.0
6,233999.0,2.0,90.0,0.0,0.0,0.0,0.0,0.0
7,329899.0,1.0,87.0,0.0,0.0,1.0,0.0,0.0
9,359899.0,1.0,95.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
73496,334900.0,3.0,165.0,0.0,0.0,0.0,0.0,0.0
73497,340500.0,3.0,167.0,0.0,0.0,0.0,0.0,0.0
73500,307242.0,3.0,150.0,0.0,0.0,0.0,0.0,0.0
73501,315000.0,3.0,150.0,0.0,0.0,1.0,0.0,0.0


## Create train, test
WARNING 
- never take outliers before the train set, they are part of the tests values
- only clean wrong data, not possible data, error data, etc..
- clean outliers in the train data and monitor over/under fitting aftwards

In [15]:
# create the matrix of x and the y target
X = df.iloc[:,1:] 
y = df.iloc[:,:1] 

print(X.shape) 
print(y.shape)


(57669, 7)
(57669, 1)


In [16]:
# split into train and test data
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40, stratify = y) # 0,1 class can not be stratified
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
len(X_train)

40368

In [17]:
# TODO measure the accuracy of the train test


## Application of ML

choosing your model : https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

- multiple regression
- polynomial
- random forest
- xgboost
- ridge
LATER
- libraries cleaning and engineering by themselves


### multiple linear regression
$$
\begin{bmatrix}
y^{(1)}\\
y^{(2)}\\
y^{(3)}\\
... \\
y^{(m)}\\
\end{bmatrix}
=
\begin{bmatrix}
x^{(1)}_1, x^{(1)}_2, 1\\
x^{(2)}_1, x^{(2)}_2, 1\\
x^{(3)}_1, x^{(3)}_2, 1\\
x^{(m)}_1,x^{(m)}_2,  1\\
\end{bmatrix}
.
\begin{bmatrix}
a\\
b\\
c\\
\end{bmatrix}
$$

In [18]:
# multiple linear regression
# with sklearn, no need to add 1 to the matrix
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [19]:
# performance 0,4037 with number of rooms > 7
regressor.score(X_train, y_train)

0.3791834141329893

In [20]:
# visualizing results
y = 
df.predict

SyntaxError: invalid syntax (<ipython-input-20-c970368786ce>, line 2)

In [21]:
# play with number of iterations


In [22]:
# calculate overfitting and underfitting


### polynomial
$$ X =
\begin{bmatrix}
x^{(1)}_1, x^{(1)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(2)}_1, x^{(2)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(3)}_1, x^{(3)2}_2, ..., x^{(m)k}_{n}, 1\\
x^{(m)}_1,x^{(m)k}_2, ..., x^{(m)k}_{n}, 1\\
\end{bmatrix}
$$
is it applicable here ?

In [23]:
# polynomial regression using SciKitLearn


## XGboost

INSTALL package via conda install xgboost

TODO to be used only with 0.5+ score ?


In [24]:
from xgboost import XGBRegressor
my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(X_train, y_train, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
# make predictions
predictions = my_model.predict(X_test)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))
my_model.score(X_test, y_test)

Mean Absolute Error : 131874.8300558855


0.47194785111756665

In [26]:
# finetuning the model
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(X_train, y_train, early_stopping_rounds=50, 
             eval_set=[(X_test, y_test)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [27]:
# make predictions
predictions = my_model.predict(X_test)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 131763.91402456866


In [28]:
# mesure performance
my_model.score(X_test, y_test)

0.48413906714819377

## History of performance

#### Options: 0 < rooms < 9, 20 < areas < 1500, 20000 < price < 10 000 000

MultiLinReg = 0,41 ; XGBoost = 0,48   




# Predict with unseen values



In [29]:
y_new = np.array([[2,80,1,0,1,0,1]])
y_pred = my_model.predict(y_new)

ValueError: feature_names mismatch: ['Number of rooms', 'Area', 'Fully equipped kitchen', 'Open fire', 'Terrace', 'Garden', 'Swimming pool'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6']
expected Area, Swimming pool, Terrace, Open fire, Number of rooms, Garden, Fully equipped kitchen in input data
training data did not have the following fields: f5, f2, f6, f1, f4, f3, f0