In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sklearn
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, PolynomialFeatures, scale
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import datetime
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('600K US Housing Properties.csv', low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   property_url        600000 non-null  object 
 1   property_id         600000 non-null  int64  
 2   address             600000 non-null  object 
 3   street_name         599869 non-null  object 
 4   apartment           14815 non-null   object 
 5   city                599999 non-null  object 
 6   state               599999 non-null  object 
 7   latitude            529122 non-null  float64
 8   longitude           529122 non-null  float64
 9   postcode            599970 non-null  object 
 10  price               600000 non-null  float64
 11  bedroom_number      443845 non-null  float64
 12  bathroom_number     471733 non-null  float64
 13  price_per_unit      435365 non-null  float64
 14  living_space        447847 non-null  float64
 15  land_space          515119 non-nul

In [4]:
description = df.describe(include='all')
description

Unnamed: 0,property_url,property_id,address,street_name,apartment,city,state,latitude,longitude,postcode,...,property_type,property_status,year_build,total_num_units,listing_age,RunDate,agency_name,agent_name,agent_phone,is_owned_by_zillow
count,600000,600000.0,600000,599869,14815.0,599999,599999,529122.0,529122.0,599970.0,...,600000,600000,0.0,0.0,600000.0,600000,444524,0.0,0.0,600000.0
unique,600000,,598588,339224,2664.0,7977,25,,,10820.0,...,7,2,,,,1,34372,,,
top,https://www.zillow.com/homedetails/3-Plat-83-1...,,"(undisclosed Address), Rockport, TX 78382",(undisclosed Address),1.0,Chicago,TX,,,84043.0,...,SINGLE_FAMILY,FOR_SALE,,,,2022-04-24 07:34:15,Coldwell Banker Realty,,,
freq,1,,36,1713,312.0,14138,146636,,,1102.0,...,354366,383365,,,,600000,5936,,,
mean,,888504200.0,,,,,,36.282379,-105.813906,,...,,,,,-1.0,,,,,0.000498
std,,972470800.0,,,,,,5.673355,13.464633,,...,,,,,0.0,,,,,0.022318
min,,27.0,,,,,,18.985142,-165.40825,,...,,,,,-1.0,,,,,0.0
25%,,54021430.0,,,,,,32.612112,-117.346079,,...,,,,,-1.0,,,,,0.0
50%,,206609000.0,,,,,,35.403568,-101.897378,,...,,,,,-1.0,,,,,0.0
75%,,2066867000.0,,,,,,39.661674,-95.354245,,...,,,,,-1.0,,,,,0.0


In [5]:
df.columns

Index(['property_url', 'property_id', 'address', 'street_name', 'apartment',
       'city', 'state', 'latitude', 'longitude', 'postcode', 'price',
       'bedroom_number', 'bathroom_number', 'price_per_unit', 'living_space',
       'land_space', 'land_space_unit', 'broker_id', 'property_type',
       'property_status', 'year_build', 'total_num_units', 'listing_age',
       'RunDate', 'agency_name', 'agent_name', 'agent_phone',
       'is_owned_by_zillow'],
      dtype='object')

In [6]:
df = df.drop(df[df.living_space == 0].index)
df = df.drop(df[df.price == 0].index)
df = df[df.state == 'TX']

In [7]:
#Land space is an important metric for the price of the house but this dataset has multiple units
#All the values with 'acres' unit changed to 'sqft'
df.loc[df['land_space_unit']=='acres', 'land_space'] = df['land_space']*43560.00
df.loc[df['land_space_unit']=='acres', 'land_space_unit']='sqft'
df[['land_space','land_space_unit']]

Unnamed: 0,land_space,land_space_unit
180271,6969.600,sqft
180273,12632.400,sqft
180275,11325.600,sqft
180277,15246.000,sqft
180279,6000.000,sqft
...,...,...
446346,8145.720,sqft
446348,3920.400,sqft
446350,4356.000,sqft
446352,4299.372,sqft


In [8]:
df = df.drop(columns=['property_url', 'property_id', 'address', 'street_name', 'apartment', 'city', 'state', 'price_per_unit', 'land_space_unit', 'broker_id','property_status', 'year_build', 'total_num_units', 'listing_age',
       'RunDate', 'agency_name', 'agent_name', 'agent_phone',
       'is_owned_by_zillow'])

In [9]:
description = df.describe(include='all')
description

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type
count,126313.0,126313.0,145843.0,145843.0,112708.0,126246.0,111778.0,123145.0,145843
unique,,,1752.0,,,,,,7
top,,,75126.0,,,,,,SINGLE_FAMILY
freq,,,981.0,,,,,,101682
mean,30.819355,-97.079339,,493435.3,3.385208,2.392402,3492.593,770400.1,
std,1.745501,1.954543,,1029319.0,1.10577,1.400852,80718.37,65637400.0,
min,25.860826,-106.63189,,1.0,0.0,0.0,1.0,-10890.0,
25%,29.698868,-97.983025,,210500.0,3.0,2.0,1590.0,6621.12,
50%,30.307894,-96.92956,,348000.0,3.0,2.0,2054.5,10200.0,
75%,32.52481,-95.52519,,524900.0,4.0,3.0,2704.0,44866.8,


In [10]:
df.isna().sum()

latitude           19530
longitude          19530
postcode               0
price                  0
bedroom_number     33135
bathroom_number    19597
living_space       34065
land_space         22698
property_type          0
dtype: int64

In [11]:
df_isna=df.isna()
df_isna

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type
180271,False,False,False,False,False,False,False,False,False
180273,False,False,False,False,False,False,False,False,False
180275,False,False,False,False,False,False,False,False,False
180277,False,False,False,False,False,False,False,False,False
180279,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...
446346,False,False,False,False,False,False,False,False,False
446348,False,False,False,False,False,False,False,False,False
446350,False,False,False,False,False,False,False,False,False
446352,False,False,False,False,False,False,False,False,False


In [12]:
df_isna.columns=[c for c in df_isna.columns]
df_isna

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type
180271,False,False,False,False,False,False,False,False,False
180273,False,False,False,False,False,False,False,False,False
180275,False,False,False,False,False,False,False,False,False
180277,False,False,False,False,False,False,False,False,False
180279,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...
446346,False,False,False,False,False,False,False,False,False
446348,False,False,False,False,False,False,False,False,False
446350,False,False,False,False,False,False,False,False,False
446352,False,False,False,False,False,False,False,False,False


In [13]:
df_isna.columns=[c+'_isna' for c in df_isna.columns]
df_isna

Unnamed: 0,latitude_isna,longitude_isna,postcode_isna,price_isna,bedroom_number_isna,bathroom_number_isna,living_space_isna,land_space_isna,property_type_isna
180271,False,False,False,False,False,False,False,False,False
180273,False,False,False,False,False,False,False,False,False
180275,False,False,False,False,False,False,False,False,False
180277,False,False,False,False,False,False,False,False,False
180279,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...
446346,False,False,False,False,False,False,False,False,False
446348,False,False,False,False,False,False,False,False,False
446350,False,False,False,False,False,False,False,False,False
446352,False,False,False,False,False,False,False,False,False


In [14]:
df=pd.concat([df.fillna(0), df_isna.astype(float)], axis=1)
df

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type,latitude_isna,longitude_isna,postcode_isna,price_isna,bedroom_number_isna,bathroom_number_isna,living_space_isna,land_space_isna,property_type_isna
180271,31.786737,-106.428020,79903,239500.0,5.0,3.0,1692.0,6969.600,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180273,31.784021,-106.404450,79925,165000.0,4.0,2.0,1650.0,12632.400,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180275,31.761911,-106.432330,79905,118000.0,4.0,1.0,1918.0,11325.600,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180277,31.790165,-106.435960,79903,414700.0,4.0,3.0,3119.0,15246.000,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180279,31.775480,-106.438230,79905,260000.0,0.0,0.0,3267.0,6000.000,MULTI_FAMILY,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,33.059647,-96.710945,75023,393000.0,3.0,2.0,1717.0,8145.720,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446348,33.044098,-96.707720,75023,315000.0,3.0,3.0,1754.0,3920.400,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446350,33.069946,-96.712920,75023,360000.0,4.0,3.0,2068.0,4356.000,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446352,33.070004,-96.713900,75023,447000.0,4.0,3.0,2111.0,4299.372,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df['postcode']=df['postcode'].astype(str)

In [16]:
df.groupby(['postcode'])['latitude'].median()

postcode
 78245    29.396170
11111     33.599007
73870     27.943542
75001     32.953060
75002     33.092088
            ...    
79935     31.772724
79936     31.764378
79938     31.788981
79968     31.771887
97316     29.350270
Name: latitude, Length: 1752, dtype: float64

In [17]:
df.head()

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type,latitude_isna,longitude_isna,postcode_isna,price_isna,bedroom_number_isna,bathroom_number_isna,living_space_isna,land_space_isna,property_type_isna
180271,31.786737,-106.42802,79903,239500.0,5.0,3.0,1692.0,6969.6,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180273,31.784021,-106.40445,79925,165000.0,4.0,2.0,1650.0,12632.4,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180275,31.761911,-106.43233,79905,118000.0,4.0,1.0,1918.0,11325.6,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180277,31.790165,-106.43596,79903,414700.0,4.0,3.0,3119.0,15246.0,SINGLE_FAMILY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180279,31.77548,-106.43823,79905,260000.0,0.0,0.0,3267.0,6000.0,MULTI_FAMILY,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [18]:
# A crucial categorical feature 'property_type' is categorical variable so I wanted to transform that to a numeric one
dummies=pd.get_dummies(df[['postcode','property_type']], drop_first=True)
dummies

Unnamed: 0,postcode_11111,postcode_73870,postcode_75001,postcode_75002,postcode_75006,postcode_75007,postcode_75009,postcode_75010,postcode_75013,postcode_75017,...,postcode_79936,postcode_79938,postcode_79968,postcode_97316,property_type_CONDO,property_type_LOT,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180273,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180277,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
180279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
df = pd.concat([df,dummies],axis='columns')
df

Unnamed: 0,latitude,longitude,postcode,price,bedroom_number,bathroom_number,living_space,land_space,property_type,latitude_isna,...,postcode_79936,postcode_79938,postcode_79968,postcode_97316,property_type_CONDO,property_type_LOT,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,31.786737,-106.428020,79903,239500.0,5.0,3.0,1692.0,6969.600,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
180273,31.784021,-106.404450,79925,165000.0,4.0,2.0,1650.0,12632.400,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
180275,31.761911,-106.432330,79905,118000.0,4.0,1.0,1918.0,11325.600,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
180277,31.790165,-106.435960,79903,414700.0,4.0,3.0,3119.0,15246.000,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
180279,31.775480,-106.438230,79905,260000.0,0.0,0.0,3267.0,6000.000,MULTI_FAMILY,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,33.059647,-96.710945,75023,393000.0,3.0,2.0,1717.0,8145.720,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
446348,33.044098,-96.707720,75023,315000.0,3.0,3.0,1754.0,3920.400,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
446350,33.069946,-96.712920,75023,360000.0,4.0,3.0,2068.0,4356.000,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0
446352,33.070004,-96.713900,75023,447000.0,4.0,3.0,2111.0,4299.372,SINGLE_FAMILY,0.0,...,0,0,0,0,0,0,0,0,1,0


In [20]:
df = df.drop(columns=['postcode','property_type'])
df

Unnamed: 0,latitude,longitude,price,bedroom_number,bathroom_number,living_space,land_space,latitude_isna,longitude_isna,postcode_isna,...,postcode_79936,postcode_79938,postcode_79968,postcode_97316,property_type_CONDO,property_type_LOT,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180271,31.786737,-106.428020,239500.0,5.0,3.0,1692.0,6969.600,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180273,31.784021,-106.404450,165000.0,4.0,2.0,1650.0,12632.400,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180275,31.761911,-106.432330,118000.0,4.0,1.0,1918.0,11325.600,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180277,31.790165,-106.435960,414700.0,4.0,3.0,3119.0,15246.000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180279,31.775480,-106.438230,260000.0,0.0,0.0,3267.0,6000.000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,33.059647,-96.710945,393000.0,3.0,2.0,1717.0,8145.720,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446348,33.044098,-96.707720,315000.0,3.0,3.0,1754.0,3920.400,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446350,33.069946,-96.712920,360000.0,4.0,3.0,2068.0,4356.000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446352,33.070004,-96.713900,447000.0,4.0,3.0,2111.0,4299.372,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
df.dtypes

latitude                       float64
longitude                      float64
price                          float64
bedroom_number                 float64
bathroom_number                float64
                                ...   
property_type_LOT                uint8
property_type_MANUFACTURED       uint8
property_type_MULTI_FAMILY       uint8
property_type_SINGLE_FAMILY      uint8
property_type_TOWNHOUSE          uint8
Length: 1773, dtype: object

In [22]:
df["price_per_sqfoot"] = df["price"] / df["living_space"]

In [23]:
q_low = df["bedroom_number"].quantile(0.01)
q_hi  = df["bedroom_number"].quantile(0.99)

df = df[(df["bedroom_number"] < q_hi) & (df["bedroom_number"] > q_low)]

In [24]:
q_low = df["bathroom_number"].quantile(0.01)
q_hi  = df["bathroom_number"].quantile(0.99)

df = df[(df["bathroom_number"] < q_hi) & (df["bathroom_number"] > q_low)]

In [25]:
q_low = df["living_space"].quantile(0.01)
q_hi  = df["living_space"].quantile(0.99)

df = df[(df["living_space"] < q_hi) & (df["living_space"] > q_low)]

In [26]:
q_low = df["land_space"].quantile(0.01)
q_hi  = df["land_space"].quantile(0.99)

df = df[(df["land_space"] < q_hi) & (df["land_space"] > q_low)]

In [27]:
q_low = df["price"].quantile(0.01)
q_hi  = df["price"].quantile(0.99)

df = df[(df["price"] < q_hi) & (df["price"] > q_low)]

In [28]:
q_low = df["price_per_sqfoot"].quantile(0.01)
q_hi  = df["price_per_sqfoot"].quantile(0.99)

df = df[(df["price_per_sqfoot"] < q_hi) & (df["price_per_sqfoot"] > q_low)]

In [29]:
description = df.describe(include='all')
description

Unnamed: 0,latitude,longitude,price,bedroom_number,bathroom_number,living_space,land_space,latitude_isna,longitude_isna,postcode_isna,...,postcode_79938,postcode_79968,postcode_97316,property_type_CONDO,property_type_LOT,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE,price_per_sqfoot
count,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,...,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0,66105.0
mean,27.632022,-87.07743,406923.2,3.372165,2.60742,2091.147054,29266.31,0.103608,0.103608,0.0,...,0.002405,0.0,1.5e-05,0.019545,0.000166,0.025807,0.00478,0.914182,0.035383,194.195588
std,9.532899,29.661268,193011.8,0.587137,0.692579,630.602613,87607.87,0.304754,0.304754,0.0,...,0.048985,0.0,0.003889,0.13843,0.012899,0.158562,0.068975,0.280097,0.184748,66.850146
min,0.0,-106.62442,98500.0,1.0,1.5,946.0,0.0385,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.601671
25%,29.521435,-97.82725,274900.0,3.0,2.0,1615.0,5989.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,149.617258
50%,30.102903,-96.78416,362000.0,3.0,2.0,1993.0,7797.24,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,182.5
75%,32.44479,-95.41458,495000.0,4.0,3.0,2487.0,12283.92,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,223.564307
max,36.461475,0.0,1449000.0,4.0,4.0,4145.0,1038035.0,1.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,499.990654


In [30]:
features=df.describe().columns.drop(['price', 'price_per_sqfoot'])
features

Index(['latitude', 'longitude', 'bedroom_number', 'bathroom_number',
       'living_space', 'land_space', 'latitude_isna', 'longitude_isna',
       'postcode_isna', 'price_isna',
       ...
       'postcode_79936', 'postcode_79938', 'postcode_79968', 'postcode_97316',
       'property_type_CONDO', 'property_type_LOT',
       'property_type_MANUFACTURED', 'property_type_MULTI_FAMILY',
       'property_type_SINGLE_FAMILY', 'property_type_TOWNHOUSE'],
      dtype='object', length=1772)

In [31]:
targets=['price','price_per_sqfoot']
targets

['price', 'price_per_sqfoot']

In [32]:
X=df[features]
X

Unnamed: 0,latitude,longitude,bedroom_number,bathroom_number,living_space,land_space,latitude_isna,longitude_isna,postcode_isna,price_isna,...,postcode_79936,postcode_79938,postcode_79968,postcode_97316,property_type_CONDO,property_type_LOT,property_type_MANUFACTURED,property_type_MULTI_FAMILY,property_type_SINGLE_FAMILY,property_type_TOWNHOUSE
180273,31.784021,-106.404450,4.0,2.0,1650.0,12632.400,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180277,31.790165,-106.435960,4.0,3.0,3119.0,15246.000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180281,31.785316,-106.411150,4.0,2.0,1800.0,6098.400,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180288,31.799797,-106.437600,3.0,3.0,2790.0,6098.400,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
180292,31.789478,-106.423775,2.0,2.0,1772.0,8712.000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446346,33.059647,-96.710945,3.0,2.0,1717.0,8145.720,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446348,33.044098,-96.707720,3.0,3.0,1754.0,3920.400,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446350,33.069946,-96.712920,4.0,3.0,2068.0,4356.000,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
446352,33.070004,-96.713900,4.0,3.0,2111.0,4299.372,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [33]:
y=df[targets[0]]
y

180273    165000.0
180277    414700.0
180281    174950.0
180288    249000.0
180292    178000.0
            ...   
446346    393000.0
446348    315000.0
446350    360000.0
446352    447000.0
446354    377000.0
Name: price, Length: 66105, dtype: float64

In [34]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=42)
len(X_train),len(X_test),len(y_train),len(y_test)

(46273, 19832, 46273, 19832)

In [35]:
numeric=['latitude', 'longitude', 'bedroom_number', 'bathroom_number', 'living_space', 'land_space']
sc=StandardScaler()
X_train[numeric]=sc.fit_transform(X_train[numeric])
X_test[numeric]=sc.transform(X_test[numeric])

**Linear Regression Model**

In [36]:
model = LinearRegression()
model.fit(X_train , y_train)

LinearRegression()

In [37]:
y_pred = model.predict(X_test)

In [38]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred, squared=False))

652448256938.0724
7.822693450319426e+26
27969078372945.05


In [39]:
model.score(X_train , y_train)

0.7463989084863549