In [2]:
import pandas as pd
import numpy as np
from math import pi

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

from eda_helpers import normality_check, count_outliers
from stat_inference_helpers import custom_corr, evaluate_model

In [6]:
# Feature metadata; handed to custom_corr further down.
data_info = pd.read_parquet("./data/data_info.parquet")

# Training set and the running model-comparison table, both stored as CSV.
data, results = map(pd.read_csv, ("./data/data_train.csv", "./data/results.csv"))

# Show the training frame as the cell output.
data

Unnamed: 0,Price,Area,Room,Price per sqm,log_Price,log_Area,Manhattan_distance,Zip num,District_v1,District_v2
0,385000.0,49,3,7857.142857,12.860999,3.891820,0.051544,1094,Oost,the rest
1,930000.0,123,4,7560.975610,13.742940,4.812184,0.029115,1079,Zuid,Centrum + Zuid
2,500000.0,70,3,7142.857143,13.122363,4.248495,0.038423,1051,West,the rest
3,400000.0,107,6,3738.317757,12.899220,4.672829,0.107977,1067,Nieuw-West,the rest
4,475000.0,98,4,4846.938776,13.071070,4.584967,0.067423,1061,Nieuw-West,the rest
...,...,...,...,...,...,...,...,...,...,...
731,1025000.0,135,4,7592.592593,13.840203,4.905275,0.029167,1077,Zuid,Centrum + Zuid
732,915000.0,88,5,10397.727273,13.726679,4.477337,0.017201,1015,Centrum,Centrum + Zuid
733,690000.0,100,3,6900.000000,13.444447,4.605170,0.039710,1014,West,the rest
734,450000.0,60,4,7500.000000,13.017003,4.094345,0.045514,1093,Oost,the rest


There still might be room for improvement. First, let's **make a working copy of the training data**, so that further experiments do not modify the originally loaded frame.

In [8]:
# Independent (deep) copy of the training frame to experiment on.
data2 = data.copy(deep=True)
data2

Unnamed: 0,Price,Area,Room,Price per sqm,log_Price,log_Area,Manhattan_distance,Zip num,District_v1,District_v2
0,385000.0,49,3,7857.142857,12.860999,3.891820,0.051544,1094,Oost,the rest
1,930000.0,123,4,7560.975610,13.742940,4.812184,0.029115,1079,Zuid,Centrum + Zuid
2,500000.0,70,3,7142.857143,13.122363,4.248495,0.038423,1051,West,the rest
3,400000.0,107,6,3738.317757,12.899220,4.672829,0.107977,1067,Nieuw-West,the rest
4,475000.0,98,4,4846.938776,13.071070,4.584967,0.067423,1061,Nieuw-West,the rest
...,...,...,...,...,...,...,...,...,...,...
731,1025000.0,135,4,7592.592593,13.840203,4.905275,0.029167,1077,Zuid,Centrum + Zuid
732,915000.0,88,5,10397.727273,13.726679,4.477337,0.017201,1015,Centrum,Centrum + Zuid
733,690000.0,100,3,6900.000000,13.444447,4.605170,0.039710,1014,West,the rest
734,450000.0,60,4,7500.000000,13.017003,4.094345,0.045514,1093,Oost,the rest


# Model with price per sqm

The assumption here is that the price per square meter may be easier to predict than the total price. The algorithm is the following:
1. Use price per sqm as a target
2. As features use different sets of [Area, Room, Distance] features
3. Multiply the predicted price per sqm by Area to get the final Price of the house
4. Measure model accuracy

In [570]:
# Slim modelling frame: the two price columns plus the candidate features.
# Manhattan_distance is read from data2 (the working copy); everything else from data.
data3 = pd.DataFrame({
    'Price': data['Price'],
    'Price per sqm': data['Price per sqm'],
    'Area': data['Area'],
    'Room': data['Room'],
    'Manhattan_distance': data2['Manhattan_distance'],
})
data3


Unnamed: 0,Price,Price per sqm,Area,Room,Manhattan_distance
0,385000.0,7857.142857,49,3,0.051544
1,930000.0,7560.975610,123,4,0.029115
2,500000.0,7142.857143,70,3,0.038423
3,400000.0,3738.317757,107,6,0.107977
4,475000.0,4846.938776,98,4,0.067423
...,...,...,...,...,...
731,1025000.0,7592.592593,135,4,0.029167
732,915000.0,10397.727273,88,5,0.017201
733,690000.0,6900.000000,100,3,0.039710
734,450000.0,7500.000000,60,4,0.045514


In [571]:
# Pairwise correlation table for the target and the candidate features
# (the rendered output reports the Spearman method for every pair).
custom_corr(
    data3,
    data_info,
    features=['Price per sqm', 'Manhattan_distance', 'Room', 'Area'],
)

Unnamed: 0,method,feature1,feature2,r-value,p-value,stat-sign,N
0,Spearman,Price per sqm,Manhattan_distance,-0.802376,1.078904e-166,True,736
1,Spearman,Price per sqm,Room,-0.234372,1.215174e-10,True,736
2,Spearman,Price per sqm,Area,-0.21936,1.805924e-09,True,736
3,Spearman,Manhattan_distance,Room,0.142645,0.0001031104,True,736
4,Spearman,Manhattan_distance,Area,0.069462,0.05962902,False,736
5,Spearman,Room,Area,0.772415,7.512523e-147,True,736


Room correlates strongly with Area (r ≈ 0.77) and weakly, but significantly, with Manhattan_distance (r ≈ 0.14), which is why it will be removed from the feature set.

In [572]:
# Features: data3 without the two price columns and without 'Room'.
X3 = data3.drop(columns=['Price per sqm', 'Price', 'Room'])

# Target: natural log of the price per square metre.
y3 = np.log(data3['Price per sqm'])

# Fit the linear model (fit() returns the estimator itself, so the call can be chained).
model3 = LinearRegression().fit(X3, y3)

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so everything derived from them is an in-sample figure.
pred_log_price_sqm = model3.predict(X3)

# Undo the log transform, then scale by area to get a total-price prediction.
pred_price_sqm = np.exp(pred_log_price_sqm)
pred_price = data3['Area'] * pred_price_sqm

In [573]:
# Compare the reconstructed total price with the true Price and record the run.
# NOTE(review): the table shown below contains a row for this run, so evaluate_model
# presumably adds it to `results`; re-running the cell may add a duplicate — confirm
# in stat_inference_helpers.
rmse3, mae3, r_value3, corr3 = evaluate_model(
    'Linear Regression',
    X3.columns.tolist(),
    'log(Price per sqm)',
    data3['Price'],
    pred_price,
    results,
)
results

Unnamed: 0,Model,List of used features,Target,RMSE,MAE,r_value,corr_coef
0,Linear Regression,"[Zip region, log(Area)]",log(Price),325388.852788,159374.459167,0.642526,0.813826
1,Linear Regression,"[log_Manhattan_distance, log_Area]",log(Price),237602.416923,113390.013018,0.807277,0.898157
2,Linear Regression,"[Area, Manhattan_distance]",log(Price per sqm),226039.024618,102747.579333,0.824916,0.910482
