In [96]:
import pandas as pd
import numpy as np
from math import pi

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from stat_inference_helpers import evaluate_model

In [97]:
# Load the three inputs for this section. The reads are independent of each other.
results = pd.read_csv("./data/results.csv")            # metrics table, displayed below
data = pd.read_csv("./data/data_train.csv")            # training data used by the cells below
data_info = pd.read_parquet("./data/data_info.parquet")

# Show the metrics table as loaded, before new rows are added.
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef,VIFs
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",234975,106951,0.85844,0.859858,0.9091,"[4.15, 7.91, 2.94, 3.99]"
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",179096,83554,0.876582,0.878965,0.9279,"[7.0, 13.67, 4.33, 2.72, 2.41, 2.12, 2.15, 4.94]"
2,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1',...",233503,103755,0.858437,0.859853,0.915,"[4.15, 7.91, inf, inf, 3.99]"
3,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1']",174485,93129,0.853501,0.855203,0.9147,"[3.91, 5.32, 1.78]"
4,LinearRegression(),4,"['log_Area', 'District_Zuid-Oost', 'District_t...",259138,126636,0.787772,0.788273,0.8649,"[2.95, 1.27, 2.68]"


There still might be room for improvement. First, let's start with the last point of the Statistical Inference Summary: **Cut the maximum `Price`**.

But first, let's find out how many observations lie above the thresholds of 1M, 1.5M and 2M:

In [98]:
# Percentage of rows whose `Price` exceeds each candidate cut-off.
total_n = len(data)
price = data['Price']

# Row counts above each threshold (kept under their original names).
above_1M = int((price > 1000000).sum())
above_1andHalfM = int((price > 1500000).sum())
above_2M = int((price > 2000000).sum())

for label, n_above in (('1M', above_1M), ('1.5M', above_1andHalfM), ('2M', above_2M)):
    share = round((n_above*100/total_n), 2)
    print(f'There are {share}% of points with price above {label}')

There are 11.26% of points with price above 1M
There are 4.48% of points with price above 1.5M
There are 1.9% of points with price above 2M


11% is too much to cut off, but it's possible to try with 1.5M and 2M.

**1.** Cut off all datapoints with `Price` > 1.5M

In [99]:
# Independent copy of `data` without the rows whose `Price` exceeds 1.5M.
# The mask is negated (rather than using `<=`) so rows with a missing price,
# if any, are kept exactly as a drop-by-index of the `>` rows would keep them.
data6 = data.loc[~(data['Price'] > 1500000)].copy()
data6

Unnamed: 0,Price,Area,Price per sqm,log_Price,log_Area,Manhattan_distance,Zip region,Zip num,District_v1,District_v2
0,385000.0,49,7857.142857,12.860999,3.891820,0.051544,0,1094,Oost,the_rest
1,930000.0,123,7560.975610,13.742940,4.812184,0.029115,0,1079,Zuid,Centrum+Zuid
2,500000.0,70,7142.857143,13.122363,4.248495,0.038423,0,1051,West,the_rest
3,400000.0,107,3738.317757,12.899220,4.672829,0.107977,0,1067,Nieuw-West,the_rest
4,475000.0,98,4846.938776,13.071070,4.584967,0.067423,0,1061,Nieuw-West,the_rest
...,...,...,...,...,...,...,...,...,...,...
732,1025000.0,135,7592.592593,13.840203,4.905275,0.029167,0,1077,Zuid,Centrum+Zuid
733,915000.0,88,10397.727273,13.726679,4.477337,0.017201,0,1015,Centrum,Centrum+Zuid
734,690000.0,100,6900.000000,13.444447,4.605170,0.039710,0,1014,West,the_rest
735,450000.0,60,7500.000000,13.017003,4.094345,0.045514,0,1093,Oost,the_rest


Run the last model on new data:

In [100]:
# Columns that must not reach the model: the target in its various forms and
# the features excluded for this experiment.
excluded_columns = ['Price', 'Area', 'Price per sqm', 'log_Price', 'Manhattan_distance', 'Zip region', 'Zip num', 'District_v1']

# One-hot encode `District_v2`; the first level is dropped and acts as the baseline.
district = pd.get_dummies(data6['District_v2'], prefix='District', drop_first=True, dtype=int)

# Remaining feature columns followed by the district dummies (aligned on the index).
X6 = data6.drop(columns=excluded_columns + ['District_v2']).join(district)

# Target and a fresh, unfitted estimator.
y6 = data6['log_Price']
model6 = LinearRegression()

In [101]:
# Evaluate the model with cv=4 and unpack the five metrics returned by the helper.
# NOTE(review): the `results` table shown below gains a row although it is never
# reassigned here, so `evaluate_model` presumably appends to it in place. If so,
# re-running this cell adds a duplicate row; confirm against the helper.
(
    rmse6,
    mae6,
    r2_coef_determination6,
    explained_variance6,
    corr_coef6,
) = evaluate_model(model6, X6, y6, results, cv=4)
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef,VIFs
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",234975,106951,0.85844,0.859858,0.9091,"[4.15, 7.91, 2.94, 3.99]"
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",179096,83554,0.876582,0.878965,0.9279,"[7.0, 13.67, 4.33, 2.72, 2.41, 2.12, 2.15, 4.94]"
2,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1',...",233503,103755,0.858437,0.859853,0.915,"[4.15, 7.91, inf, inf, 3.99]"
3,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1']",174485,93129,0.853501,0.855203,0.9147,"[3.91, 5.32, 1.78]"
4,LinearRegression(),4,"['log_Area', 'District_Zuid-Oost', 'District_t...",259138,126636,0.787772,0.788273,0.8649,"[2.95, 1.27, 2.68]"
5,LinearRegression(),4,"[log_Area, District_Zuid-Oost, District_the_rest]",129224,92521,0.7392,0.739592,0.83,"[3.23, 1.32, 2.91]"


Even though the r2 and the percentage of explained variance are now worse, both **errors** have been **significantly reduced**.

**2.** Cut off all datapoints with `Price` > 2M

In [102]:
# Independent copy of `data` without the rows whose `Price` exceeds 2M.
# The mask is negated (rather than using `<=`) so rows with a missing price,
# if any, are kept exactly as a drop-by-index of the `>` rows would keep them.
data7 = data.loc[~(data['Price'] > 2000000)].copy()
data7.shape

(723, 10)

In [103]:
# Columns that must not reach the model: the target in its various forms and
# the features excluded for this experiment.
excluded_columns = ['Price', 'Area', 'Price per sqm', 'log_Price', 'Manhattan_distance', 'Zip region', 'Zip num', 'District_v1']

# One-hot encode `District_v2`; the first level is dropped and acts as the baseline.
district = pd.get_dummies(data7['District_v2'], prefix='District', drop_first=True, dtype=int)

# Remaining feature columns followed by the district dummies (aligned on the index).
X7 = data7.drop(columns=excluded_columns + ['District_v2']).join(district)

# Target and a fresh, unfitted estimator.
y7 = data7['log_Price']
model7 = LinearRegression()

In [104]:
# Evaluate the model with cv=4 and unpack the five metrics returned by the helper.
# NOTE(review): the `results` table shown below gains a row although it is never
# reassigned here, so `evaluate_model` presumably appends to it in place. If so,
# re-running this cell adds a duplicate row; confirm against the helper.
(
    rmse7,
    mae7,
    r2_coef_determination7,
    explained_variance7,
    corr_coef7,
) = evaluate_model(model7, X7, y7, results, cv=4)
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef,VIFs
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",234975,106951,0.85844,0.859858,0.9091,"[4.15, 7.91, 2.94, 3.99]"
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",179096,83554,0.876582,0.878965,0.9279,"[7.0, 13.67, 4.33, 2.72, 2.41, 2.12, 2.15, 4.94]"
2,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1',...",233503,103755,0.858437,0.859853,0.915,"[4.15, 7.91, inf, inf, 3.99]"
3,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'Region_1']",174485,93129,0.853501,0.855203,0.9147,"[3.91, 5.32, 1.78]"
4,LinearRegression(),4,"['log_Area', 'District_Zuid-Oost', 'District_t...",259138,126636,0.787772,0.788273,0.8649,"[2.95, 1.27, 2.68]"
5,LinearRegression(),4,"[log_Area, District_Zuid-Oost, District_the_rest]",129224,92521,0.7392,0.739592,0.83,"[3.23, 1.32, 2.91]"
6,LinearRegression(),4,"[log_Area, District_Zuid-Oost, District_the_rest]",161482,106708,0.761442,0.762273,0.8421,"[3.11, 1.3, 2.81]"


There is no significant difference between the 1.5M and 2M cut-offs, so we'll keep 2M to be able to predict more cases with the same accuracy.

**Regularized Linear Regression**

Let us now try regularized versions of Linear Regression: Lasso, Ridge and ElasticNet. Since these models shrink the coefficients (and Lasso and ElasticNet can drive some of them to exactly zero, which acts as embedded feature selection), we can relax the requirement that every feature has a VIF score below 5 and let the model decide the optimal weights for each feature. We will use the following set of features:
- log(Area)
- Manhattan_distance
- Zip region
- District_v2