In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sklearn
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, PolynomialFeatures, scale
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import datetime
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('600K US Housing Properties.csv', low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   property_url        600000 non-null  object 
 1   property_id         600000 non-null  int64  
 2   address             600000 non-null  object 
 3   street_name         599869 non-null  object 
 4   apartment           14815 non-null   object 
 5   city                599999 non-null  object 
 6   state               599999 non-null  object 
 7   latitude            529122 non-null  float64
 8   longitude           529122 non-null  float64
 9   postcode            599970 non-null  object 
 10  price               600000 non-null  float64
 11  bedroom_number      443845 non-null  float64
 12  bathroom_number     471733 non-null  float64
 13  price_per_unit      435365 non-null  float64
 14  living_space        447847 non-null  float64
 15  land_space          515119 non-nul

In [4]:
description = df.describe(include='all')
description

Unnamed: 0,property_url,property_id,address,street_name,apartment,city,state,latitude,longitude,postcode,...,property_type,property_status,year_build,total_num_units,listing_age,RunDate,agency_name,agent_name,agent_phone,is_owned_by_zillow
count,600000,600000.0,600000,599869,14815.0,599999,599999,529122.0,529122.0,599970.0,...,600000,600000,0.0,0.0,600000.0,600000,444524,0.0,0.0,600000.0
unique,600000,,598588,339224,2664.0,7977,25,,,10820.0,...,7,2,,,,1,34372,,,
top,https://www.zillow.com/homedetails/3-Plat-83-1...,,"(undisclosed Address), Rockport, TX 78382",(undisclosed Address),1.0,Chicago,TX,,,84043.0,...,SINGLE_FAMILY,FOR_SALE,,,,2022-04-24 07:34:15,Coldwell Banker Realty,,,
freq,1,,36,1713,312.0,14138,146636,,,1102.0,...,354366,383365,,,,600000,5936,,,
mean,,888504200.0,,,,,,36.282379,-105.813906,,...,,,,,-1.0,,,,,0.000498
std,,972470800.0,,,,,,5.673355,13.464633,,...,,,,,0.0,,,,,0.022318
min,,27.0,,,,,,18.985142,-165.40825,,...,,,,,-1.0,,,,,0.0
25%,,54021430.0,,,,,,32.612112,-117.346079,,...,,,,,-1.0,,,,,0.0
50%,,206609000.0,,,,,,35.403568,-101.897378,,...,,,,,-1.0,,,,,0.0
75%,,2066867000.0,,,,,,39.661674,-95.354245,,...,,,,,-1.0,,,,,0.0


In [5]:
df.columns

Index(['property_url', 'property_id', 'address', 'street_name', 'apartment',
       'city', 'state', 'latitude', 'longitude', 'postcode', 'price',
       'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space',
       'land_space', 'land_space_unit', 'broker_id', 'property_type',
       'property_status', 'year_build', 'total_num_units', 'listing_age',
       'RunDate', 'agency_name', 'agent_name', 'agent_phone',
       'is_owned_by_zillow'],
      dtype='object')

In [6]:
df = df.drop(df[df.property_type == 'LOT'].index)
df = df.drop(df[df.living_space == 0].index)
df = df.drop(df[df.price == 0].index)
df = df[df.state == 'TX']

In [7]:
#Land space is an important metric for the price of the house but this dataset has multiple units
#All the values with 'acres' unit changed to 'sqft'
df.loc[df['land_space_unit']=='acres', 'land_space'] = df['land_space']*43560.00
df.loc[df['land_space_unit']=='acres', 'land_space_unit']='sqft'
df[['land_space','land_space_unit']]

Unnamed: 0,land_space,land_space_unit
180271,6969.600,sqft
180273,12632.400,sqft
180275,11325.600,sqft
180277,15246.000,sqft
180279,6000.000,sqft
...,...,...
446346,8145.720,sqft
446348,3920.400,sqft
446350,4356.000,sqft
446352,4299.372,sqft


In [8]:
df = df.drop(columns=['property_url', 'property_id', 'address', 'street_name', 'apartment', 'city', 'state', 'latitude', 'longitude','price_per_unit', 'land_space_unit', 'broker_id','property_status', 'year_build', 'total_num_units', 'listing_age',
       'RunDate', 'agency_name', 'agent_name', 'agent_phone',
       'is_owned_by_zillow'])

In [9]:
description = df.describe(include='all')
description

Unnamed: 0,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type
count,113297.0,113297.0,110799.0,112676.0,110949.0,91274.0,113297
unique,1661.0,,,,,,6
top,75126.0,,,,,,SINGLE_FAMILY
freq,964.0,,,,,,101682
mean,,519182.6,3.440031,2.679117,2484.81,291517.4,
std,,783671.0,1.023723,1.195078,37478.52,9626307.0,
min,,1.0,0.0,0.0,1.0,-10890.0,
25%,,272800.0,3.0,2.0,1590.0,6124.134,
50%,,380000.0,3.0,3.0,2052.0,8424.504,
75%,,549900.0,4.0,3.0,2698.0,17697.34,


In [10]:
df.isna().sum()

postcode               0
price                  0
bedroom_number      2498
bathroom_number      621
living_space        2348
land_space         22023
property_type          0
dtype: int64

In [11]:
df['bedroom_number'] = df['bedroom_number'].fillna(df['bedroom_number'].median())
df['bathroom_number'] = df['bathroom_number'].fillna(df['bathroom_number'].median())
df['living_space'] = df['living_space'].fillna(df['living_space'].median())
df['land_space'] = df['land_space'].fillna(df['land_space'].median())

In [12]:
df.isna().sum()

postcode           0
price              0
bedroom_number     0
bathroom_number    0
living_space       0
land_space         0
property_type      0
dtype: int64

In [13]:
# A crucial categorical feature 'property_type' is categorical variable so I wanted to transform that to a numeric one
dummies=pd.get_dummies(df[['postcode','property_type']], drop_first=True)
dummies

Unnamed: 0,postcode_11111,postcode_73870,postcode_75001,postcode_75002,postcode_75006,postcode_75007,postcode_75009,postcode_75010,postcode_75013,postcode_75017,...,postcode_79934,postcode_79935,postcode_79936,postcode_79938,postcode_97316,property_type_CONDO,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180273,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
df = pd.concat([df,dummies],axis='columns')
df

Unnamed: 0,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type,postcode_11111,postcode_73870,postcode_75001,...,postcode_79934,postcode_79935,postcode_79936,postcode_79938,postcode_97316,property_type_CONDO,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,79903,239500.0,5.0,3.0,1692.0,6969.600,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180273,79925,165000.0,4.0,2.0,1650.0,12632.400,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180275,79905,118000.0,4.0,1.0,1918.0,11325.600,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180277,79903,414700.0,4.0,3.0,3119.0,15246.000,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180279,79905,260000.0,3.0,3.0,3267.0,6000.000,MULTI_FAMILY,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,75023,393000.0,3.0,2.0,1717.0,8145.720,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446348,75023,315000.0,3.0,3.0,1754.0,3920.400,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446350,75023,360000.0,4.0,3.0,2068.0,4356.000,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446352,75023,447000.0,4.0,3.0,2111.0,4299.372,SINGLE_FAMILY,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
df = df.drop(columns=['postcode','property_type'])
df

Unnamed: 0,price,bedroom_number,bathroom_number,living_space,land_space,postcode_11111,postcode_73870,postcode_75001,postcode_75002,postcode_75006,...,postcode_79934,postcode_79935,postcode_79936,postcode_79938,postcode_97316,property_type_CONDO,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,239500.0,5.0,3.0,1692.0,6969.600,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180273,165000.0,4.0,2.0,1650.0,12632.400,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180275,118000.0,4.0,1.0,1918.0,11325.600,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180277,414700.0,4.0,3.0,3119.0,15246.000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180279,260000.0,3.0,3.0,3267.0,6000.000,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,393000.0,3.0,2.0,1717.0,8145.720,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446348,315000.0,3.0,3.0,1754.0,3920.400,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446350,360000.0,4.0,3.0,2068.0,4356.000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446352,447000.0,4.0,3.0,2111.0,4299.372,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [16]:
df.dtypes

price                          float64
bedroom_number                 float64
bathroom_number                float64
living_space                   float64
land_space                     float64
                                ...   
property_type_CONDO              uint8
property_type_MANUFACTURED       uint8
property_type_MULTI_FAMILY       uint8
property_type_SINGLE_FAMILY      uint8
property_type_TOWNHOUSE          uint8
Length: 1670, dtype: object

In [17]:
df['price_per_sq_foot'] = df['price']/df['living_space']

In [18]:
q_low = df["bedroom_number"].quantile(0.01)
q_hi  = df["bedroom_number"].quantile(0.99)

df = df[(df["bedroom_number"] < q_hi) & (df["bedroom_number"] > q_low)]

In [19]:
q_low = df["bathroom_number"].quantile(0.01)
q_hi  = df["bathroom_number"].quantile(0.99)

df = df[(df["bathroom_number"] < q_hi) & (df["bathroom_number"] > q_low)]

In [20]:
q_low = df["living_space"].quantile(0.01)
q_hi  = df["living_space"].quantile(0.99)

df = df[(df["living_space"] < q_hi) & (df["living_space"] > q_low)]

In [21]:
q_low = df["land_space"].quantile(0.01)
q_hi  = df["land_space"].quantile(0.99)

df = df[(df["land_space"] < q_hi) & (df["land_space"] > q_low)]

In [22]:
q_low = df["price"].quantile(0.01)
q_hi  = df["price"].quantile(0.99)

df = df[(df["price"] < q_hi) & (df["price"] > q_low)]

In [23]:
q_low = df["price_per_sq_foot"].quantile(0.01)
q_hi  = df["price_per_sq_foot"].quantile(0.99)

df = df[(df["price_per_sq_foot"] < q_hi) & (df["price_per_sq_foot"] > q_low)]

In [24]:
description = df.describe(include='all')
description

Unnamed: 0,price,bedroom_number,bathroom_number,living_space,land_space,postcode_11111,postcode_73870,postcode_75001,postcode_75002,postcode_75006,...,postcode_79935,postcode_79936,postcode_79938,postcode_97316,property_type_CONDO,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE,price_per_sq_foot
count,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,...,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0,97970.0
mean,423947.6,3.46945,2.673614,2180.043187,26995.36,1e-05,1e-05,0.000122,0.001317,0.000735,...,0.000133,0.000623,0.00247,1e-05,0.022813,0.020915,0.009472,0.917485,0.029203,193.412469
std,219255.4,0.712618,0.860693,752.69166,87699.42,0.003195,0.003195,0.011067,0.036263,0.0271,...,0.011519,0.024945,0.049639,0.003195,0.149308,0.143099,0.096864,0.275149,0.168375,67.283412
min,76000.0,2.0,1.0,841.0,1454.904,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.511464
25%,278000.0,3.0,2.0,1619.0,6534.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,149.907749
50%,375000.0,3.0,3.0,2051.0,8424.504,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,182.256264
75%,514900.0,4.0,3.0,2628.0,10672.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,222.222222
max,1629000.0,5.0,5.5,4854.0,1181347.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,517.16032


In [25]:
features=df.describe().columns.drop(['price','price_per_sq_foot'])
features

Index(['bedroom_number', 'bathroom_number', 'living_space', 'land_space',
       'postcode_11111', 'postcode_73870', 'postcode_75001', 'postcode_75002',
       'postcode_75006', 'postcode_75007',
       ...
       'postcode_79934', 'postcode_79935', 'postcode_79936', 'postcode_79938',
       'postcode_97316', 'property_type_CONDO', 'property_type_MANUFACTURED',
       'property_type_MULTI_FAMILY', 'property_type_SINGLE_FAMILY',
       'property_type_TOWNHOUSE'],
      dtype='object', length=1669)

In [26]:
targets=['price','price_per_sq_foot']
targets

['price', 'price_per_sq_foot']

In [27]:
X=df[features]
X

Unnamed: 0,bedroom_number,bathroom_number,living_space,land_space,postcode_11111,postcode_73870,postcode_75001,postcode_75002,postcode_75006,postcode_75007,...,postcode_79934,postcode_79935,postcode_79936,postcode_79938,postcode_97316,property_type_CONDO,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,5.0,3.0,1692.0,6969.600,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180273,4.0,2.0,1650.0,12632.400,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180277,4.0,3.0,3119.0,15246.000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180279,3.0,3.0,3267.0,6000.000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
180281,4.0,2.0,1800.0,6098.400,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,3.0,2.0,1717.0,8145.720,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446348,3.0,3.0,1754.0,3920.400,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446350,4.0,3.0,2068.0,4356.000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446352,4.0,3.0,2111.0,4299.372,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [28]:
y=df[targets[0]]
y

180271    239500.0
180273    165000.0
180277    414700.0
180279    260000.0
180281    174950.0
            ...   
446346    393000.0
446348    315000.0
446350    360000.0
446352    447000.0
446354    377000.0
Name: price, Length: 97970, dtype: float64

In [29]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)
len(X_train),len(X_test),len(y_train),len(y_test)

(68579, 29391, 68579, 29391)

In [30]:
numeric=['bedroom_number', 'bathroom_number', 'living_space', 'land_space']
sc=StandardScaler()
X_train[numeric]=sc.fit_transform(X_train[numeric])
X_test[numeric]=sc.transform(X_test[numeric])

**Linear Regression Model**

In [31]:
model = LinearRegression()
model.fit(X_train , y_train)

LinearRegression()

In [32]:
y_pred = model.predict(X_test)

In [33]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

3288691304147.942
1.5162697973816646e+28
123136907439713.81


In [34]:
model.score(X_train , y_train)

0.7561690487431427

**Ridge Regression Model**

In [35]:
clf = Ridge(alpha=0.1)
clf.fit(X_train , y_train)

Ridge(alpha=0.1)

In [36]:
clf.score(X_train , y_train)

0.756147034854444

**Lasso Regression Model**

In [37]:
clf1 = Lasso(alpha=0.1)
clf1.fit(X_train , y_train)

Lasso(alpha=0.1)

In [38]:
clf.score(X_train , y_train)

0.756147034854444