# Prepare data

In [1]:
# import basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pickle

# import machine learning packages
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing data, using the first CSV column as the row labels
df = pd.read_csv(
    'df_house.csv',
    index_col=0,
)
# Preview the first rows
df.head()

Unnamed: 0,index,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sales_order
0,0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,...,1180,0,1955,1955,98178,47.5112,-122.257,1340,5650,1
1,1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,1
2,2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,...,770,0,1933,1933,98028,47.7379,-122.233,2720,8062,1
3,3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,...,1050,910,1965,1965,98136,47.5208,-122.393,1360,5000,1
4,4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,...,1680,0,1987,1987,98074,47.6168,-122.045,1800,7503,1


In [3]:
# List the column labels of the loaded frame
df.columns

Index(['index', 'id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sales_order'],
      dtype='object')

# Create a new column, sold_month, from the date column (month of sale)

In [4]:
# Derive the month of sale (full English month name) from the sale date.
# NOTE(review): this import is no longer used by this cell; it is kept only in
# case a later cell relies on `month_name` — confirm and remove if not.
from calendar import month_name

# Parse the raw date strings into datetimes
df['date'] = pd.to_datetime(df['date'])
# Use the vectorised .dt accessor rather than a per-row Python lambda
df['sold_month'] = df['date'].dt.month_name()

In [5]:
# Distinct month names present in the new column
df.sold_month.unique()

array(['October', 'December', 'February', 'May', 'June', 'January',
       'April', 'March', 'July', 'August', 'November', 'September'],
      dtype=object)

Most values of 'yr_renovated' (renovation year) were missing, and I previously filled them with the year built. I will remove the 'yr_renovated' column and add a new column, 'renovated', that indicates whether or not a house has been renovated.

In [6]:

# Flag renovated houses: 1 when the renovation year differs from the build year, otherwise 0
df['renovated'] = df['yr_built'].ne(df['yr_renovated']).astype(int)

# Remove unnecessary columns

In [7]:
# Drop the identifier, raw sale date, renovation year and zip code columns
df = df.drop(columns=['id', 'date', 'yr_renovated', 'zipcode'])

In [8]:
# Column labels remaining after the drop
df.columns

Index(['index', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'lat', 'long', 'sqft_living15',
       'sqft_lot15', 'sales_order', 'sold_month', 'renovated'],
      dtype='object')

# One-hot encoding

In [9]:
# One-hot encode the categorical columns (here only 'sold_month').
# drop_first=True removes the first category level, which is April (first alphabetically).
# dtype is pinned to 'uint8', the default of older pandas releases: from pandas 2.0 on
# the default dummy dtype is bool, which would turn the 0/1 columns into True/False
# and mix bool columns into an otherwise numeric frame.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')
df.head()

Unnamed: 0,index,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sold_month_December,sold_month_February,sold_month_January,sold_month_July,sold_month_June,sold_month_March,sold_month_May,sold_month_November,sold_month_October,sold_month_September
0,0,221900.0,3,1.0,1180,5650,1.0,0,0,3,...,0,0,0,0,0,0,0,0,1,0
1,1,538000.0,3,2.25,2570,7242,2.0,0,0,3,...,1,0,0,0,0,0,0,0,0,0
2,2,180000.0,2,1.0,770,10000,1.0,0,0,3,...,0,1,0,0,0,0,0,0,0,0
3,3,604000.0,4,3.0,1960,5000,1.0,0,0,5,...,1,0,0,0,0,0,0,0,0,0
4,4,510000.0,3,2.0,1680,8080,1.0,0,0,3,...,0,1,0,0,0,0,0,0,0,0



The categorical column 'sold_month' was transformed into 11 columns by one-hot encoding. The column for April (the first month alphabetically) was dropped by the argument 'drop_first=True'.

# Split training-test sets

In [10]:
# Target vector: the sale price
y = df['price'].to_numpy()
# Feature matrix: every remaining column.
# NOTE(review): this still includes the 'index' column, which looks like a plain
# row counter in the preview — confirm it is really meant to be a predictor.
X = df.drop(columns='price').to_numpy()

In [11]:
# Number of target values
y.shape[0]

21597

In [12]:

# Number of rows and columns in the feature matrix
X.shape

(21597, 30)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

20% of the rows are held out for testing, and the remaining 80% are used for training.

In [15]:
# Helper that prints the tuned hyper-parameters and the evaluation scores of a search
def print_results(cv, X=None, y=None):
    """Print the best hyper-parameters and the hold-out scores of a fitted search.

    Parameters
    ----------
    cv : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``score(X, y)`` and
        ``predict(X)``.
    X : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test``.
        Pass the scaled test matrix when ``cv`` was fitted on scaled features.
    y : array-like, optional
        Targets matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The results are only printed.

    Notes
    -----
    Both RMSE values are computed as ``sqrt(-score)``. This assumes the search
    was configured with a negated-MSE scoring (e.g. 'neg_mean_squared_error') —
    confirm at the call site.
    """
    # Fall back to the global hold-out split so existing one-argument calls keep working
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    print("Best Hyper-parameters found: {}".format(cv.best_params_)) 
    print("Best RMSE score on development set: {:.0f}".format(np.sqrt(-cv.best_score_)))
    RMSE = np.sqrt(-cv.score(X, y))
    print("\nRMSE on test set: {:.0f}".format(RMSE))
    r_squared = r2_score(y, cv.predict(X))
    print("R^2 on test set: {:.4f}".format(r_squared))

# Feature scaling

In [18]:
# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply the scaler fitted on the training split to the test split
X_test_scaled = scaler.transform(X_test)