In [61]:
# sklearn packages
import sklearn.metrics
from sklearn.feature_selection import RFE, SelectKBest, f_regression, RFECV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler ,minmax_scale, PolynomialFeatures, StandardScaler

# Statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics as stats
import statsmodels.api as sm

# Geolocation
import geopy
from geopy import distance

import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

# utility libraries 
from statsmodels.formula.api import ols
from scipy.stats import f_oneway, norm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import pickle

import warnings 

sns.set_style('whitegrid')

# filter warnings
warnings.filterwarnings("ignore")

# magic inline
%matplotlib inline
pd.set_option('display.max_columns', 300)

## Step 1: Read in hold out data, scalers, and best model

In [70]:
df = pd.read_csv('kc_house_data_train.csv')
holdout.head()

Unnamed: 0.1,Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
1,1,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918
2,2,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576
3,3,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565
4,4,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916


In [52]:
df.drop('date', axis = 1, inplace = True)

In [53]:
df.drop(columns = ['id'], axis = 1, inplace = True)

In [54]:
import pickle

final_scaler = pd.read_pickle("model.pickle")
final_model = pd.read_pickle("transform.pickle")

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [55]:
df[df.bedrooms > 15]
#df['bedrooms'] = df['bedrooms'].replace(['33'],'3')
df.at[8597, 'bedrooms'] = 3

df['bedrooms'] = np.where((df['bedrooms'] + 1)/(df['bathrooms'] + 1) > 4, 
                                      df['bedrooms'].median(), 
                                      df['bedrooms'])

In [56]:
df["renovated"] = df.yr_renovated.apply(lambda x: 1 if x > 0 else 0)
renovated = df['renovated']

In [57]:
df["basement"] = df.sqft_basement.apply(lambda x: 1 if x != 0 else 0)

In [58]:
df['bathrooms'] = np.where((df['bathrooms'] + 1)/(df['bathrooms'] + 1) > 4,
                           df['bathrooms'].median(),
                           df['bathrooms'])

In [62]:
from geopy import Point

In [69]:
distances=[]

for (lat, long) in list(zip(df['lat'],df["long"])):
    p1 = Point(f'{lat} {long}')
    pikes_place = Point("47.6086 -122.3401")
    distances.append(distance.distance(p1,pikes_place).miles)
df['distance_from_center'] = distances

ValueError: Failed to create Point instance from string: unknown format.

In [None]:
new_features = pd.DataFrame()
new_features['renovated'] = renovated
new_features['pike_place'] = distances

In [None]:
# Create dummy variables for zip code 
zip_dummies = pd.get_dummies(df['zipcode'].astype(str), dtype=int, drop_first=True)
df.drop(columns=['zipcode'],inplace=True)
new_cols = 'zip'+zip_dummies.columns
zip_dummies.columns = new_cols
zip_dummies.head()

## Step 3: Predict the holdout set

In [43]:
y_2 = df['price']
X_all = pd.concat([df.iloc[:,1:], zip_dummies, new_features], axis=1)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_all, y_2, test_size=0.2, random_state=42)
X_all.head()

KeyError: 'price'

In [None]:
scaler_2 = StandardScaler()

X_train_transformed_2 = scaler_2.fit_transform(X_train_2)   # Fitting and scaling to training data

X_test_transformed_2 = scaler_2.transform(X_test_2)    # Scaling to test data

model_2 = LinearRegression()

model_2.fit(X_train_transformed_2, y_train_2)   

y_pred_2 = model_2.predict(X_test_transformed_2)  

rmse_2 = np.sqrt(sklearn.metrics.mean_squared_error(y_test_2, y_pred_2))

print('R-squared: ' + str(model_2.score(X_test_transformed_2, y_test_2)))
print('RMSE: ' + str(rmse_2))

In [None]:
# final_answers = final_model.predict(transformed_holdout)

## Step 4: Export your predictions

In [None]:
# final_answer.to_csv('housing_preds_your_name.csv')