# IMPORTING

In [22]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [23]:
%store -r data_3

# Alterations: Engineered

In [28]:
data_4 = data_3

In [29]:
#Reign in values that are greater than 5 standard deviation away from the median
data_4.sqft_living = data_4.sqft_living.apply(lambda x: 6520 if x > 6520 else x)

In [30]:
#Reign in values that are greater than 5 standard deviation away from the median
data_4.sqft_lot = data_4.sqft_lot.apply(lambda x: 219165 if x > 219165 else x)

In [32]:
#use grade and sqft living because two highest predictors
data_4['grade_sqft_living'] = data_4.grade * data_4.sqft_living

In [39]:
#Scale and add all categorical variables as sqft_living multiplier
data_4['categorical_proportions'] = ((data_4.grade / 13) + (data_4.condition / 5) + (data_4.view / 4)) * data_4.sqft_living

# Stratified Split

In [42]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_4, data_4['grade']):
    strat_train_set = data_4.loc[data_4.index.intersection(train_index)]
    strat_test_set = data_4.loc[data_4.index.intersection(test_index)]

# PREPARE DATA FOR MACHINE LEARNING

In [43]:
x_train = strat_train_set.drop('price', axis=1)
y_train = strat_train_set['price'].copy()

# LINEAR REGRESSION

In [44]:
lr = LinearRegression()

lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
def array_to_dict(labels,coeffs):
    '''
    This function takes two arrays as arguments and turns those 
    arrays into a dictionary so that it is easier to interpret the 
    coefficients from the sklearn analysis
    '''
    one_list = list(labels)
    two_list = list(coeffs)
    res = {} 
    for key in two_list: 
        for name in one_list: 
            res[key] = name 
            one_list.remove(name) 
            break 
    return res

In [47]:
coefficients = lr.coef_
variables = np.array(x_train.columns)

array_to_dict(coefficients, variables)

{'bedrooms': -9334.410746536993,
 'bathrooms': 45175.43176948874,
 'sqft_living': -278.4190037986293,
 'sqft_lot': -0.14725265260810302,
 'floors': -17357.5605762322,
 'waterfront': 563014.7731378746,
 'view': -11170.935707876804,
 'condition': -53576.033087447315,
 'grade': 3724.638929369686,
 'sqft_above': -152.81410087677645,
 'sqft_basement': -156.05455410294354,
 'yr_built': -1767.0397441792522,
 'yr_renovated': 10.117280522891493,
 'zipcode': -4.1552993934601545e-08,
 'lat': 564120.7706866221,
 'long': -99696.3588432825,
 'sqft_living15': 47.51596087224948,
 'sqft_lot15': -0.023706015598240503,
 'date_year': 40813.59372744629,
 'date_month': 2263.582689431981,
 'date_day': -180.87914399086728,
 'rel_to_neighbors': -219.12645332106874,
 'size_quality': 33.824416639304914,
 'property_score': -0.21186219942324738,
 'floor_reno_mult': 18.07624968982666,
 'grade_sqft_living': 1.7378735672246985,
 'categorical_proportions': 78.68894995846225}

In [48]:
lr.intercept_

-117418355.41214092

In [49]:
y_pred = lr.predict(x_train)

In [50]:
lr.score(x_train, y_train)

0.748642930570433

In [51]:
lr_rsme = mean_squared_error(y_train, y_pred)
lr_rmse = np.sqrt(lr_rsme)

rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$186,290.20'