In [None]:
# import sys
# !{sys.executable} -m pip install pygam

## import packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pygam import LinearGAM, s, te
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

## load dataset

In [None]:
#load the raw housing data; the relative path is resolved against the notebook's working directory
csv_path = 'single_housing.csv'
original_dat = pd.read_csv(csv_path)

In [None]:
#display the raw dataframe as the cell output
original_dat

In [None]:
#create duplicate dataset
#.copy() makes dat an independent dataframe; a plain assignment would only bind a
#second name to original_dat, so any in-place edit of dat would also alter the raw data
dat = original_dat.copy()

## data cleaning

In [None]:
#boxplot of Price to inspect its spread and extreme values
sns.boxplot(x='Price', data=dat)

In [None]:
#remove outliers: keep rows whose Price lies strictly between
#Q1 - 1.5*IQR and Q3 + 1.5*IQR (rows with a missing Price fail both comparisons and are dropped)
#the quartiles and the filter both read from original_dat, so re-running this cell
#always gives the same rows instead of trimming already-trimmed data a second time
percentile25 = original_dat['Price'].quantile(0.25)
percentile75 = original_dat['Price'].quantile(0.75)
IQR = percentile75 - percentile25
lower_fence = percentile25 - 1.5*IQR
upper_fence = percentile75 + 1.5*IQR
dat = original_dat.loc[(original_dat['Price']>lower_fence) & (original_dat['Price']<upper_fence)]

In [None]:
#number of rows removed (raw row count minus current row count)
original_dat.shape[0] - dat.shape[0]

In [None]:
#remove non-numeric columns
non_numeric_cols = ['Street','City','State','Zip','geoadd','CheckAddDuplicate']
dat = dat.drop(columns=non_numeric_cols)

In [None]:
#summary statistics of dat after cleaning
dat.describe()

In [None]:
#count missing values per column
dat.isnull().sum()

In [None]:
#histogram of Price in 10 bins, after outlier removal
sns.histplot(x='Price', bins=10, data=dat)

## PCA matrix completion (not implemented yet; mean imputation used as a placeholder)

In [None]:
#fill NAs temporarily: replace each missing value with its column mean
#NOTE(review): the means are computed over all rows, before the train/test split below
column_means = dat.mean()
dat = dat.fillna(value=column_means)

## linear regression

In [None]:
#prep the data
#alternative feature sets:
# X = dat[['Latitude','Longitude']].values
# X = dat[['SqFt','Acreage','Beds','Baths']].values
#select the predictors by name (every column except the target) instead of by position,
#so Price cannot end up inside X if the column order of the data changes
X = dat.drop(columns=['Price']).values
y = dat['Price'].values

In [None]:
#split the data: 30% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=12345,
    test_size=0.3,
)

In [None]:
#fit the model: ordinary least squares on the training split
linear_reg = LinearRegression()
linear_reg = linear_reg.fit(X=X_train, y=y_train)

In [None]:
#evaluate the model: RMSE of the predictions on the held-out rows
test_preds = linear_reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=test_preds)
rmse = sqrt(mse)

In [None]:
#report the linear-regression test RMSE, rounded to 3 decimals
print(f'The out-of-sample error: {round(rmse, 3)}')

## GAM

In [None]:
#prep the data
#alternative feature sets:
# X = dat[['Latitude','Longitude']].values
# X = dat[['SqFt','Acreage','Beds','Baths']].values
#select the predictors by name (every column except the target) instead of by position,
#so Price cannot end up inside X if the column order of the data changes
X = dat.drop(columns=['Price']).values
y = dat['Price'].values

In [None]:
#split the data: 30% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=12345,
    test_size=0.3,
)

In [None]:
#fit the model
# gam_reg = LinearGAM(s(0) + s(1) + te(0,1)).fit(X_train, y_train)
#one spline term for each of feature columns 0-5 (assumes X has at least 6 columns)
#fit on the training split only: fitting on the full X, y would let the model see the
#test rows, so the error computed on X_test would no longer be an out-of-sample error
gam_reg = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5)).fit(X_train, y_train)

In [None]:
#evaluate the model: RMSE of the GAM predictions on the held-out rows
test_preds = gam_reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=test_preds)
rmse = sqrt(mse)

In [None]:
#report the GAM test RMSE, rounded to 3 decimals
print(f'The out-of-sample error: {round(rmse, 3)}')

## KNN

In [None]:
#prep the data
#alternative feature sets:
# X = dat[['Latitude','Longitude']].values
# X = dat.iloc[: , 1:].values
#KNN is fitted on this four-column subset only
knn_features = ['SqFt','Acreage','Beds','Baths']
X = dat[knn_features].values
y = dat['Price'].values

In [None]:
#split the data: 30% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=12345,
    test_size=0.3,
)

In [None]:
#tune the model: fit one KNN regressor per candidate k and record its RMSE on the test split
#candidate grid: every k in 1-19, every 5th k in 20-95, then 100, 200, ..., 500
#NOTE(review): k is selected on the same test split whose error is reported as
#out-of-sample, so the best score is optimistic - consider a validation split or CV
k = [*range(1,20), *range(20,100,5), *range(100,501,100)]
outsample = []

for n_neighbors in k:
    knn_model = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn_model.fit(X_train, y_train)
    #out-of-sample
    test_preds = knn_model.predict(X_test)
    outsample.append(sqrt(mean_squared_error(y_test, test_preds)))

In [None]:
knn_dat = pd.DataFrame({'k':k, 'out-of-sample error':outsample})
sns.lineplot(data=knn_dat, x='k', y='out-of-sample error')

In [None]:
#finalize the model
optimalk = k[outsample.index(min(outsample))]
print('The best k for out-of-sample prediction: ' + str(optimalk))
print('The best out-of-sample error: ' + str(round(min(outsample),3)))

## random forest

In [None]:
#prep the data
#alternative feature set:
# X = dat[['SqFt','Acreage','Beds','Baths']].values
#select the predictors by name (every column except the target) instead of by position,
#so Price cannot end up inside X if the column order of the data changes
X = dat.drop(columns=['Price']).values
y = dat['Price'].values

In [None]:
#split the data: 30% of the rows are held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=12345,
    test_size=0.3,
)

In [None]:
#tune the model: grid search over max_features x min_samples_leaf, scored by test-split RMSE
maxfeatures = [2,3,4,5,6]
samplesleaf = list(range(1,15)) + list(range(15,50,5))
#start from +inf rather than a fixed number such as 99999: an RMSE on house prices can
#exceed any hand-picked sentinel, in which case no candidate would ever be recorded
bestmaxfeature = None
bestsamplesleaf = None
best_outsample = float('inf')

for max_feat in tqdm(maxfeatures):
    for min_leaf in samplesleaf:
        #random_state makes each forest (and so the selected hyper-parameters) reproducible,
        #using the same seed as the train/test split; n_jobs=-1 builds the trees in parallel
        rf_model = RandomForestRegressor(n_estimators=1000,
                                         max_features=max_feat,
                                         min_samples_leaf=min_leaf,
                                         random_state=12345,
                                         n_jobs=-1)
        rf_model.fit(X_train, y_train)
        #out-of-sample
        test_preds = rf_model.predict(X_test)
        rmse = sqrt(mean_squared_error(y_test, test_preds))
        #strict < keeps the first combination found when two candidates tie
        if rmse < best_outsample:
            bestmaxfeature = max_feat
            bestsamplesleaf = min_leaf
            best_outsample = rmse

In [None]:
#report the selected random-forest hyper-parameters and their test RMSE (3 decimals)
print(f'The best max_features for out-of-sample prediction: {bestmaxfeature}')
print(f'The best min_samples_leaf for out-of-sample prediction: {bestsamplesleaf}')
print(f'The best out-of-sample error: {round(best_outsample, 3)}')

In [None]:
#thoughts
#1. PCA matrix completion to fill missing values
#2. linear_reg, GAM, KNN and rf to predict prices
#3. learn boosting trees and neural networks to predict prices

#aside
#the dataset is raw, probably need to remove the outliers
#the ultimate goal is to make better price predictions than we did in the summer