In [19]:
import pandas as pd
import os
pd.set_option('display.max_columns', 300)
import pickle
from sklearn.preprocessing import StandardScaler

## Reading in the holdout data, scaler, and best model

In [20]:
# Load the holdout feature set from the project's data directory.
holdout = pd.read_csv(filepath_or_buffer='data/kc_house_data_test_features.csv')

In [21]:
# Load the pickled scaler and model artifacts.
# Context managers close each file handle as soon as it has been read;
# previously the handles returned by `open` were never closed.
# NOTE(review): unpickling can execute arbitrary code, so only load
# artifacts that were produced by this project.
with open("archive/scaler.pickle", "rb") as scaler_file:
    final_scaler = pickle.load(scaler_file)
with open("archive/model.pickle", "rb") as model_file:
    final_model = pickle.load(model_file)

## Adapting Holdout Set

### Cleaning

In [22]:
# A bedroom count of 33 is replaced (presumably a data-entry error) with the
# median bedroom count of homes whose living area lies strictly between
# 1000 and 2000 sqft.
# Fix: the second filter used to restart from `holdout`, which discarded the
# `< 2000` bound and left `test` unused; the filters are now chained.
test = holdout[holdout.sqft_living < 2000]
new_test = test[test.sqft_living > 1000]
new_value = new_test.bedrooms.median()
# Vectorized replacement instead of a Python-level lambda per row.
holdout['bedrooms'] = holdout['bedrooms'].replace(33, new_value)

In [23]:
# Remove the exported index column and the listing id.
holdout = holdout.drop(['Unnamed: 0', 'id'], axis='columns')

In [24]:
# `date` is sliced as a string whose first eight characters are YYYYMMDD:
# characters 0-3 give the year, 4-5 the month and 6-7 the day.
# The vectorized `.str` accessor replaces one Python lambda call per row.
date_text = holdout['date']
holdout['date_year'] = date_text.str[:4].astype('int64')
holdout['date_month'] = date_text.str[4:6].astype('int64')
holdout['date_day'] = date_text.str[6:8].astype('int64')
# Reassign instead of dropping in place so the cell reads as a pure
# "input frame -> output frame" step.
holdout = holdout.drop(columns='date')

### Features

#### Combining Features

In [25]:
# Derive the combined features in one `assign` chain. Each callable receives
# the frame as built so far, so `reno_mult` can read the `binary_reno`
# column created just before it.
holdout = holdout.assign(
    # Mean of the living-area and lot-area ratios against the *15 columns.
    rel_to_neighbors=lambda df: (
        (df['sqft_living'] / df['sqft_living15']) + (df['sqft_lot'] / df['sqft_lot15'])
    ) / 2,
    size_quality=lambda df: (df['condition'] + df['grade']) * df['sqft_living'],
    property_score=lambda df: (df['view'] / 4) * (df['waterfront'] + 1) * df['sqft_lot'],
    # 1 when yr_renovated is 0, otherwise 2.
    binary_reno=lambda df: df['yr_renovated'].map(lambda year: 1 if year == 0 else 2),
    reno_mult=lambda df: df['binary_reno'] * df['sqft_living'],
    grade_sqft_living=lambda df: (df['grade'] ** 2) * df['sqft_living'],
    # grade, condition and view are each divided by a fixed constant
    # (13, 5 and 4) before being summed and multiplied by living area.
    categorical_proportions=lambda df: (
        (df['grade'] / 13) + (df['condition'] / 5) + (df['view'] / 4)
    ) * df['sqft_living'],
)

#### Dummies

In [26]:
def _one_hot(frame, column, baseline):
    """Replace ``column`` in ``frame`` with one indicator column per category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Label of the categorical column to encode; it is removed from the
        result.
    baseline : hashable
        Category whose indicator column is left out of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``frame`` without ``column``, followed by the indicator
        columns (labelled by category value) except ``baseline``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``frame``.
    """
    indicators = pd.get_dummies(frame[column])
    # The baseline category may not occur in this data set at all; in that
    # case there is no indicator to remove, so do not fail on it.
    indicators = indicators.drop(columns=[baseline], errors='ignore')
    return pd.concat([frame.drop(columns=[column]), indicators], axis='columns')


# Each call names the column to encode and the category left out as baseline
# (presumably the same baselines used when the model was trained -- confirm).
holdout = _one_hot(holdout, 'zipcode', 98155)
holdout = _one_hot(holdout, 'grade', 13)
holdout = _one_hot(holdout, 'yr_built', 1951)

#### Scaling

In [27]:
# Restore `selected_columns` from IPython's %store storage. The variable must
# have been saved beforehand with `%store selected_columns` (presumably by the
# training notebook -- TODO confirm); this notebook never defines it itself.
%store -r selected_columns

In [28]:
# Keep only the restored feature columns, in the order given by
# `selected_columns`; a missing label raises KeyError.
holdout = holdout.loc[:, selected_columns]

In [29]:
# Scale the selected features with the unpickled scaler. Only `transform` is
# called here, never `fit` (presumably a StandardScaler fitted on the training
# data, given the import at the top -- confirm).
transformed_holdout = final_scaler.transform(holdout)

## Predicting on the holdout set

In [30]:
# Predict with the unpickled model on the scaled holdout features.
final_answers = final_model.predict(transformed_holdout)

In [31]:
# Wrap the raw predictions in a DataFrame (default integer column labels).
final_answers = pd.DataFrame(data=final_answers)

In [32]:
# `rename` returns a new DataFrame rather than modifying the original, so the
# result has to be assigned back; otherwise `final_answers` keeps the default
# column label 0 and the exported CSV has no `price` header.
final_answers = final_answers.rename(columns={0: 'price'})
final_answers

Unnamed: 0,price
0,505810.440867
1,524675.339171
2,385538.844041
3,286471.958126
4,506700.551439
...,...
4318,568258.824749
4319,490326.490208
4320,403833.927444
4321,412520.877451


## Exporting predictions

In [33]:
# Write the predictions (including the row index) to the submission file.
final_answers.to_csv(path_or_buf='housing_preds_michael_wirtz.csv')