In [1]:
import pandas as pd
import numpy as np
from math import pi

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

from eda_helpers import normality_check, count_outliers
from stat_inference_helpers import custom_corr, evaluate_model

In [92]:
# Read the three inputs of this notebook from the local ./data folder.
# The reads are independent of each other; `results` (the table of model
# metrics) is the cell's last expression so it is rendered as the output.
results = pd.read_csv("./data/results.csv")
data = pd.read_csv("./data/data_train.csv")
data_info = pd.read_parquet("./data/data_info.parquet")
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",286941,116868,0.85844,0.859858,0.9093
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",265134,104945,0.876582,0.878965,0.9262


There still might be room for improvement. First, let's start with the last point of Statistical inference summary: **Cut the maximum `Price`**.

But first, let's find out how many observations lie above the thresholds of 1M, 1.5M and 2M:

In [93]:
# How large a share of the rows would each candidate price cap remove?
total_n = len(data)

# Number of rows whose Price is strictly greater than each threshold.
above_1M = len(data.loc[data['Price'].gt(1_000_000)])
above_1andHalfM = len(data.loc[data['Price'].gt(1_500_000)])
above_2M = len(data.loc[data['Price'].gt(2_000_000)])

# Report every count as a percentage of all rows, rounded to two decimals.
print(f'There are {round((above_1M*100/total_n), 2)}% of points with price above 1M')
print(f'There are {round((above_1andHalfM*100/total_n), 2)}% of points with price above 1.5M')
print(f'There are {round((above_2M*100/total_n), 2)}% of points with price above 2M')

There are 11.26% of points with price above 1M
There are 4.48% of points with price above 1.5M
There are 1.9% of points with price above 2M


11% is too much to cut off, but it's possible to try with 1.5M and 2M.

**1.** Cut off all datapoints with `Price` > 1.5M

In [94]:
# Remove every row whose Price exceeds 1.5M. `DataFrame.drop` returns a new
# frame, so `data` is left untouched and the surviving rows keep their
# original index labels.
data3 = data.drop(index=data.index[data['Price'] > 1500000])
data3

Unnamed: 0,Price,Area,Price per sqm,log_Price,log_Area,Manhattan_distance,Zip num,District_v1,District_v2
0,385000.0,49,7857.142857,12.860999,3.891820,0.051544,1094,Oost,the rest
1,930000.0,123,7560.975610,13.742940,4.812184,0.029115,1079,Zuid,Centrum + Zuid
2,500000.0,70,7142.857143,13.122363,4.248495,0.038423,1051,West,the rest
3,400000.0,107,3738.317757,12.899220,4.672829,0.107977,1067,Nieuw-West,the rest
4,475000.0,98,4846.938776,13.071070,4.584967,0.067423,1061,Nieuw-West,the rest
...,...,...,...,...,...,...,...,...,...
732,1025000.0,135,7592.592593,13.840203,4.905275,0.029167,1077,Zuid,Centrum + Zuid
733,915000.0,88,10397.727273,13.726679,4.477337,0.017201,1015,Centrum,Centrum + Zuid
734,690000.0,100,6900.000000,13.444447,4.605170,0.039710,1014,West,the rest
735,450000.0,60,7500.000000,13.017003,4.094345,0.045514,1093,Oost,the rest


Run the last model on new data:

In [95]:
# Target: the log-transformed price of the capped data set.
y3 = data3['log_Price']

# Predictors: discard the price columns together with raw Area, Zip num and
# District_v2; District_v1 stays for now and is encoded below.
X3 = data3.drop(
    columns=['Price', 'Area', 'Price per sqm', 'log_Price', 'Zip num', 'District_v2']
)

# One-hot encode District_v1 into integer columns prefixed "District_".
# drop_first=True leaves out the first level so it serves as the reference category.
district = pd.get_dummies(X3['District_v1'], prefix='District', dtype=int, drop_first=True)

# Swap the raw district column for its dummy columns (aligned on the row index).
X3 = X3.drop(columns=['District_v1']).join(district)

model3 = LinearRegression()

In [96]:
# Score model3 on the 1.5M-capped data with cv = 4.
# NOTE(review): judging by the table rendered below, evaluate_model also adds
# this run as a new row of `results`; re-running the cell would presumably add
# the row again - confirm in stat_inference_helpers.
(
    rmse3,
    mae3,
    r2_coef_determination3,
    explained_variance3,
    corr_coef3,
) = evaluate_model(model3, X3, y3, results, cv=4)
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",286941,116868,0.85844,0.859858,0.9093
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",265134,104945,0.876582,0.878965,0.9262
2,LinearRegression(),4,"[log_Area, Manhattan_distance, District_Nieuw-...",119131,69983,0.856018,0.857758,0.9311


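Even though the r2 and the percentage of explained variance are now slightly worse, both **errors** have been **significantly reduced**. Keep in mind that part of this reduction is expected: the most expensive listings, which contribute the largest absolute errors, are no longer in the evaluation set.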

**2.** Cut off all datapoints with `Price` > 2M

In [97]:
# Same cut as before but with a 2M cap: rows with Price above 2M are removed.
# `data` itself is not modified because `drop` returns a new frame.
data4 = data.drop(index=data.loc[data['Price'] > 2000000].index)
data4.shape

(723, 9)

In [98]:
# Target: the log-transformed price of the 2M-capped data set.
y4 = data4['log_Price']

# Predictors: discard the price columns together with raw Area, Zip num and
# District_v2; District_v1 stays for now and is encoded below.
X4 = data4.drop(
    columns=['Price', 'Area', 'Price per sqm', 'log_Price', 'Zip num', 'District_v2']
)

# One-hot encode District_v1 into integer columns prefixed "District_".
# drop_first=True leaves out the first level so it serves as the reference category.
district = pd.get_dummies(X4['District_v1'], prefix='District', dtype=int, drop_first=True)

# Swap the raw district column for its dummy columns (aligned on the row index).
X4 = X4.drop(columns=['District_v1']).join(district)

model4 = LinearRegression()

In [99]:
# Score model4 on the 2M-capped data with cv = 4.
# NOTE(review): judging by the table rendered below, evaluate_model also adds
# this run as a new row of `results`; re-running the cell would presumably add
# the row again - confirm in stat_inference_helpers.
(
    rmse4,
    mae4,
    r2_coef_determination4,
    explained_variance4,
    corr_coef4,
) = evaluate_model(model4, X4, y4, results, cv=4)
results

Unnamed: 0,Model,N folds,List of used features,RMSE,MAE,r2_coef_determination,explained_variance,corr_coef
0,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_Z...",286941,116868,0.85844,0.859858,0.9093
1,LinearRegression(),4,"['log_Area', 'Manhattan_distance', 'District_N...",265134,104945,0.876582,0.878965,0.9262
2,LinearRegression(),4,"[log_Area, Manhattan_distance, District_Nieuw-...",119131,69983,0.856018,0.857758,0.9311
3,LinearRegression(),4,"[log_Area, Manhattan_distance, District_Nieuw-...",130708,74594,0.863495,0.866804,0.9371


There is no significant difference between the 1.5M and 2M cut-offs, so we'll keep 2M to be able to predict more cases with about the same accuracy.

# Model with price per sqm

The assumption here is that the price per square meter may be easier to predict. The sequence of steps is the following:
1. Use price per sqm as a target
2. As features, use different subsets of the [Area, Room, Distance] features
3. Multiply the predicted price per sqm by Area to get the final Price of the house
4. Measure model accuracy