# Pre-processing and Training Data Development - Capstone Two - King County House Sales

In [1]:
# Import required python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load 'kc_house_data.csv'
df = pd.read_csv('/Users/mitchmodlich/Desktop/Springboard_Data_Science/Springboard/KingCountyHouseSales/data/raw/kc_house_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [4]:
# Create binary columns for renovated and basement
df['renovated'] = 0
df['basement'] = 0

df.loc[df['yr_renovated'] != 0, 'renovated'] = 1
df.loc[df['sqft_basement'] != 0, 'basement'] = 1

In [5]:
# Set yr_renovated to yr_built where yr_renovated == 0
df.loc[df['yr_renovated'] == 0, 'yr_renovated'] = df['yr_built']

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Convert yr_renovated to datetime
df['yr_renovated'] = pd.to_datetime(df['yr_renovated'], format='%Y')

# Convert yr_built to datetime
df['yr_built'] = pd.to_datetime(df['yr_built'], format='%Y')

In [6]:
# Convert datetimes back to int and create new feature columns for scaling
df['date_int'] = df['date'].astype('int')
df['yr_built_int'] = df['yr_built'].astype('int')
df['yr_renovated_int'] = df['yr_renovated'].astype('int')

In [7]:
# Convert view, condition, grade, zipcode to categorical and one-hot encode
df[['view', 'condition', 'grade', 'zipcode']] = df[['view', 'condition', 'grade', 'zipcode']].astype('category')
df = pd.get_dummies(df, drop_first=True)

In [8]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,1180,...,0,0,0,0,0,0,1,0,0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,2170,...,0,0,0,0,0,0,0,0,0,0
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,770,...,0,0,0,0,0,0,0,0,0,0
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,1050,...,0,0,0,0,0,0,0,0,0,0
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,1680,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Set Y to price values
y = df['price']

In [10]:
X = df.iloc[:,1:].drop(columns=['price', 'date', 'yr_built', 'yr_renovated'])

In [11]:
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,sqft_basement,lat,long,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,3,1.00,1180,5650,1.0,0,1180,0,47.5112,-122.257,...,0,0,0,0,0,0,1,0,0,0
1,3,2.25,2570,7242,2.0,0,2170,400,47.7210,-122.319,...,0,0,0,0,0,0,0,0,0,0
2,2,1.00,770,10000,1.0,0,770,0,47.7379,-122.233,...,0,0,0,0,0,0,0,0,0,0
3,4,3.00,1960,5000,1.0,0,1050,910,47.5208,-122.393,...,0,0,0,0,0,0,0,0,0,0
4,3,2.00,1680,8080,1.0,0,1680,0,47.6168,-122.045,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0,0,1530,0,47.6993,-122.346,...,0,0,0,0,0,0,0,0,0,0
21609,4,2.50,2310,5813,2.0,0,2310,0,47.5107,-122.362,...,1,0,0,0,0,0,0,0,0,0
21610,2,0.75,1020,1350,2.0,0,1020,0,47.5944,-122.299,...,0,0,0,0,0,0,0,0,0,0
21611,3,2.50,1600,2388,2.0,0,1600,0,47.5345,-122.069,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create list of names
names = df.iloc[:,1:].drop(columns=['price', 'date', 'yr_built', 'yr_renovated']).columns

In [13]:
names

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'sqft_above', 'sqft_basement', 'lat', 'long',
       ...
       'zipcode_98146', 'zipcode_98148', 'zipcode_98155', 'zipcode_98166',
       'zipcode_98168', 'zipcode_98177', 'zipcode_98178', 'zipcode_98188',
       'zipcode_98198', 'zipcode_98199'],
      dtype='object', length=105)

In [14]:
# Create scaler object
scaler = StandardScaler()

In [15]:
X = scaler.fit_transform(X)

In [16]:
X

array([[-0.39873715, -1.44746357, -0.97983502, ..., -0.0795761 ,
        -0.11456529, -0.12200584],
       [-0.39873715,  0.1756067 ,  0.53363434, ..., -0.0795761 ,
        -0.11456529, -0.12200584],
       [-1.47395936, -1.44746357, -1.42625404, ..., -0.0795761 ,
        -0.11456529, -0.12200584],
       ...,
       [-1.47395936, -1.77207762, -1.15404732, ..., -0.0795761 ,
        -0.11456529, -0.12200584],
       [-0.39873715,  0.50022075, -0.52252773, ..., -0.0795761 ,
        -0.11456529, -0.12200584],
       [-1.47395936, -1.77207762, -1.15404732, ..., -0.0795761 ,
        -0.11456529, -0.12200584]])

In [17]:
X = pd.DataFrame(X, columns=names)

In [18]:
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,sqft_basement,lat,long,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,-0.398737,-1.447464,-0.979835,-0.228321,-0.915427,-0.087173,-0.734708,-0.658681,-0.352572,-0.306079,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,9.027312,-0.079576,-0.114565,-0.122006
1,-0.398737,0.175607,0.533634,-0.189885,0.936506,-0.087173,0.460841,0.245141,1.161568,-0.746341,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
2,-1.473959,-1.447464,-1.426254,-0.123298,-0.915427,-0.087173,-1.229834,-0.658681,1.283537,-0.135655,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
3,0.676485,1.149449,-0.130550,-0.244014,-0.915427,-0.087173,-0.891699,1.397515,-0.283288,-1.271816,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
4,-0.398737,-0.149007,-0.435422,-0.169653,-0.915427,-0.087173,-0.130895,-0.658681,0.409550,1.199335,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,-0.398737,0.500221,-0.598746,-0.337424,2.788439,-0.087173,-0.312039,-0.658681,1.004958,-0.938069,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
21609,0.676485,0.500221,0.250539,-0.224386,0.936506,-0.087173,0.629908,-0.658681,-0.356180,-1.051685,...,8.604949,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
21610,-1.473959,-1.772078,-1.154047,-0.332137,0.936506,-0.087173,-0.927928,-0.658681,0.247888,-0.604321,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
21611,-0.398737,0.500221,-0.522528,-0.307076,0.936506,-0.087173,-0.227505,-0.658681,-0.184414,1.028910,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006


In [19]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [20]:
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,sqft_basement,lat,long,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
167,0.676485,0.500221,0.653405,-0.256110,0.936506,-0.087173,1.076729,-0.658681,0.035706,0.631254,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
12412,0.676485,-0.473621,0.283204,-0.242083,-0.915427,-0.087173,-0.746784,1.985000,0.826696,-0.753442,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
7691,0.676485,0.175607,0.119880,-0.146959,0.936506,-0.087173,0.484993,-0.658681,-1.341309,-0.497806,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
12460,-1.473959,-0.473621,0.196098,6.418588,0.936506,-0.087173,0.122706,0.177355,0.547396,1.944940,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
9099,0.676485,-0.473621,0.032774,6.050501,-0.915427,-0.087173,0.388383,-0.658681,0.699676,1.334254,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,-0.398737,-0.798235,-1.175824,-0.197804,-0.915427,-0.087173,-0.952080,-0.658681,1.113936,-0.746341,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
21575,-0.398737,0.500221,1.096558,-0.243966,0.936506,-0.087173,1.568233,-0.658681,-1.895580,-0.959372,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
5390,-0.398737,0.500221,0.043662,-0.249326,0.936506,-0.087173,0.400460,-0.658681,0.872886,1.291648,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
860,-2.549182,-1.772078,-1.850897,-0.002583,-0.915427,-0.087173,-1.700807,-0.658681,-0.570527,-0.774745,...,-0.116212,-0.051423,-0.145157,-0.10905,8.907622,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006


In [21]:
X_test

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,sqft_basement,lat,long,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
735,0.676485,0.175607,-0.010779,-0.150025,0.936506,-0.087173,0.340078,-0.658681,-0.875087,0.368517,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
2830,1.751707,1.149449,0.892947,-0.202247,-0.915427,-0.087173,0.050248,1.759044,0.854121,-0.504907,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
4106,0.676485,0.500221,1.840227,-0.101739,0.936506,-0.087173,2.393040,-0.658681,0.032819,0.602850,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
16218,-0.398737,1.798677,2.700400,-0.012047,0.936506,-0.087173,3.347064,-0.658681,1.006401,-0.100150,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
19964,-0.398737,0.500221,0.511858,-0.234937,0.936506,-0.087173,0.919738,-0.658681,0.755248,0.929496,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12606,-0.398737,0.500221,0.141657,-0.195728,0.936506,-0.087173,0.509146,-0.658681,-0.883026,0.581547,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
14393,-0.398737,-0.149007,-0.293874,-0.256086,-0.915427,-0.087173,-0.976233,1.216751,0.872886,-1.030382,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
6899,-0.398737,-1.447464,-1.012500,-0.200557,-0.915427,-0.087173,-0.770936,-0.658681,1.204871,-0.888362,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006
85,-0.398737,-0.798235,0.065439,-0.277815,0.936506,-0.087173,0.134782,-0.116388,0.531519,-1.072988,...,-0.116212,-0.051423,-0.145157,-0.10905,-0.112263,-0.109267,-0.110775,-0.079576,-0.114565,-0.122006


In [22]:
y_train

167      807100.0
12412    570000.0
7691     320000.0
12460    649000.0
9099     568000.0
           ...   
11964    378000.0
21575    399950.0
5390     575000.0
860      245000.0
15795    315000.0
Name: price, Length: 15129, dtype: float64

In [23]:
y_test

735       365000.0
2830      865000.0
4106     1038000.0
16218    1490000.0
19964     711000.0
           ...    
12606     412000.0
14393     760000.0
6899      410500.0
85        940000.0
21363     410000.0
Name: price, Length: 6484, dtype: float64