In [1]:
import pandas as pd
import numpy as np
# Show up to 300 columns when a DataFrame is displayed.
pd.options.display.max_columns = 300

## Step 1: Read in hold out data, scalers, and best model

In [2]:
# Holdout feature file; its first column is used as the DataFrame index.
HOLDOUT_CSV = 'data/kc_house_data_test_features.csv'
houses_holdout = pd.read_csv(HOLDOUT_CSV, index_col=0)

In [3]:
from pickle import load


def _load_pickle(path):
    """Return the object unpickled from `path`; the file is closed on exit."""
    # SECURITY NOTE: unpickling can execute arbitrary code. Only load files
    # that were produced by a trusted source.
    with open(path, 'rb') as pickle_file:
        return load(pickle_file)


# Scaler applied to lat/long further down (only its `transform` is called).
final_scaler = _load_pickle('pickle/scaler.pickle')
# Model whose `predict` produces the final predictions.
final_model = _load_pickle('pickle/model.pickle')
# Column selection passed to the model at prediction time.
final_features = _load_pickle('pickle/features.pickle')

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

#### Dealing with outliers

In [4]:
# Fixed ceilings: bedrooms at 9, bathrooms at 6, sqft_living at 10,000.
fixed_caps = {'bedrooms': 9, 'bathrooms': 6, 'sqft_living': 10000}
for column, ceiling in fixed_caps.items():
    houses_holdout[column] = np.where(houses_holdout[column] > ceiling,
                                      ceiling, houses_holdout[column])

# Percentile ceilings: each of these columns is capped at its own 95th percentile.
# NOTE(review): the percentile is computed on the holdout frame itself; confirm
# this is the intended counterpart of the capping applied to the training data.
for column in ('sqft_lot', 'sqft_basement', 'sqft_lot15'):
    ceiling = houses_holdout[column].quantile(.95)
    houses_holdout[column] = np.where(houses_holdout[column] > ceiling,
                                      ceiling, houses_holdout[column])



#### Converting `date` to Datetime object

In [5]:
#changing date to datetime object
import datetime as dt

# Keep only the first 8 characters of each raw date string (drops the timestamp suffix).
houses_holdout['date'] = houses_holdout['date'].str.slice(stop=8)

# Convert to datetime; values that cannot be parsed become NaT (errors='coerce').
houses_holdout['date'] = pd.to_datetime(houses_holdout['date'], errors='coerce', yearfirst=True)

# Month of sale, taken with the vectorised .dt accessor (same accessor the
# year-based features use) instead of a row-by-row apply.
houses_holdout['month_sold'] = houses_holdout['date'].dt.month

#### Creating Years Old

In [6]:
# Year taken from `date` minus `yr_built`.
sale_year = houses_holdout['date'].dt.year
houses_holdout['yrs_old'] = sale_year - houses_holdout['yr_built']

#### Creating Years since last renovation or build if not renovated:
Making the year renovated feature more useful by identifying how long since the last renovation

In [7]:
# Year taken from `date` minus `yr_renovated`.
renovation_gap = houses_holdout['date'].dt.year - houses_holdout['yr_renovated']

# A gap of exactly 2014 or 2015 is reset to 0; every other value is kept.
# NOTE(review): presumably `yr_renovated` is 0 when no renovation is recorded,
# which would make the gap equal the sale year - confirm against the data.
gap_is_sale_year = (renovation_gap == 2015) | (renovation_gap == 2014)
houses_holdout['yrs_last_ren'] = np.where(gap_is_sale_year, 0, renovation_gap)

#### Renovated recently or not:
Does a house that was renovated in the last 10 years differ in price from those that were not?

In [8]:
# Flag is 0 where `yr_renovated` is at most 10 and 1 everywhere else.
# NOTE(review): this compares the raw `yr_renovated` value with 10, not the
# number of years since the renovation. Confirm what the trained model expects
# before changing it.
at_most_ten = houses_holdout['yr_renovated'].le(10)
houses_holdout['renovated'] = np.where(at_most_ten, 0, 1)

#### Bedrooms per bathroom: 
This could indicate the number of people who have to share a bathroom

In [9]:
houses_holdout['bed_per_bath'] = houses_holdout.bedrooms / houses_holdout.bathrooms

#replace bed_per_bath scores with zero if divide by zerro error issue
conditions = [houses_holdout.bed_per_bath == np.nan,
             houses_holdout.bed_per_bath == np.inf,
             houses_holdout.bed_per_bath.isna()]
choices = [0,0,0]
houses_holdout['bed_per_bath'] = np.select(conditions,choices,default=houses_holdout['bed_per_bath'])

#### sqft_living per bedroom 
This could indicate the number of extra rooms (den, library, office, playroom, etc) or space shared per person

In [10]:
# Living area divided by bedroom count. A zero bedroom count produces inf
# (x / 0) or NaN (0 / 0) rather than raising.
sqft_bed_ratio = houses_holdout['sqft_living'] / houses_holdout['bedrooms']

# Replace undefined sqft_per_bed values with zero. NaN must be detected with
# isna(): a comparison such as `ratio == np.nan` is always False, so it is not used.
ratio_undefined = sqft_bed_ratio.isna() | (sqft_bed_ratio == np.inf)
houses_holdout['sqft_per_bed'] = np.where(ratio_undefined, 0, sqft_bed_ratio)

#### Using Longitude & Latitude to identify City/Neighborhood:
Most people don't think in terms of zipcode or longitude/latitude, but they do understand that municipalities/neighborhoods dictate how expensive a house should be. For example, in New York, it's commonly understood that Manhattan is a more expensive area to live in than the Bronx.

In [11]:
import reverse_geocode

# Reverse-geocode every (lat, long) pair and keep the returned records in `location`.
coordinates = list(zip(houses_holdout['lat'], houses_holdout['long']))
houses_holdout['location'] = reverse_geocode.search(coordinates)

# Isolate the 'city' entry of each location record.
houses_holdout['city'] = [place['city'] for place in houses_holdout['location']]

# Split Seattle by zipcode to approximate urban / population-dense neighborhood differences:
# rows whose city is exactly 'Seattle' become 'Seattle - <zipcode>', others are unchanged.
houses_holdout['zipcode'] = houses_holdout['zipcode'].astype(str)
in_seattle = houses_holdout['city'] == 'Seattle'
seattle_label = 'Seattle - ' + houses_holdout["zipcode"]
houses_holdout['city'] = np.where(in_seattle, seattle_label, houses_holdout['city'])

#### Creating Dummy Variables For Cities

In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the city labels.
# NOTE(review): the encoder is fitted on the holdout frame, so the dummy columns
# reflect only cities present here; confirm they line up with the model's features.
cat_encoder = OneHotEncoder()
city_dummies = cat_encoder.fit_transform(houses_holdout[['city']])

# Column names: spaces and dashes in the category labels become underscores
# (e.g. 'Seattle - 98106' -> 'Seattle___98106').
city_names = [label.replace(' ', '_').replace('-', '_')
              for label in cat_encoder.categories_[0].tolist()]

# Build the dummy frame on the SAME index as houses_holdout. Without this the
# dummy frame gets a fresh 0..n-1 index, and pd.concat(axis=1) aligns on index
# labels, which would mismatch rows whenever the CSV index is not 0..n-1.
dummy_df = pd.DataFrame(city_dummies.toarray(), columns=city_names,
                        index=houses_holdout.index)
houses_holdout_dumb = pd.concat([houses_holdout, dummy_df], axis=1)

#### Scaling Latitude/Longitude

In [13]:
# Scale latitude/longitude with the scaler loaded from disk. Nothing is fitted
# in this cell; only `transform` is called.
X = houses_holdout_dumb[['lat', 'long']]
X_scaled = final_scaler.transform(X)

# Column 0 of the result is stored as stdscaled_lat, column 1 as stdscaled_long.
for position, scaled_name in enumerate(['stdscaled_lat', 'stdscaled_long']):
    houses_holdout_dumb[scaled_name] = X_scaled[:, position]

## Step 3: Predict the holdout set

In [14]:
# Add an all-zero dummy column for each listed city that is not present in the data.
# The membership check ensures an existing column (a city that does occur in
# this frame) is never overwritten with zeros.
absent_city_columns = ['Maplewood', 'Seattle___98106', 'Southworth', 'Riverbend']
for city_column in absent_city_columns:
    if city_column not in houses_holdout_dumb.columns:
        houses_holdout_dumb[city_column] = 0

In [15]:
# Select the model's input columns, then predict; the array is shown as the cell output.
holdout_inputs = houses_holdout_dumb[final_features]
final_pred = final_model.predict(holdout_inputs)
final_pred

array([597881.50174867, 625072.41548459, 346458.85491639, ...,
       347867.36083621, 454350.14411901, 354604.25734489])

## Step 4: Export your predictions

In [16]:
# Write the predictions to a comma-delimited text file.
np.savetxt(fname="housing_preds_Mitch_Krieger.csv", X=final_pred, delimiter=",")