# IMPORTING

In [36]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [37]:
%store -r data_3

# Alterations: Engineered

In [38]:
data_4 = data_3

In [39]:
#Reign in values that are greater than 5 standard deviation away from the median
data_4.sqft_living = data_4.sqft_living.apply(lambda x: 6520 if x > 6520 else x)

In [40]:
#Reign in values that are greater than 5 standard deviation away from the median
data_4.sqft_lot = data_4.sqft_lot.apply(lambda x: 219165 if x > 219165 else x)

In [41]:
#use grade and sqft living because two highest predictors
data_4['grade_sqft_living'] = data_4.grade * data_4.sqft_living

In [42]:
#Scale and add all categorical variables as sqft_living multiplier
data_4['categorical_proportions'] = ((data_4.grade / 13) + (data_4.condition / 5) + (data_4.view / 4)) * data_4.sqft_living

# New Dataframe

In [53]:
%store data_4

Stored 'data_4' (DataFrame)


# Stratified Split

In [44]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_4, data_4['grade']):
    strat_train_set = data_4.loc[data_4.index.intersection(train_index)]
    strat_test_set = data_4.loc[data_4.index.intersection(test_index)]

# PREPARE DATA FOR MACHINE LEARNING

In [45]:
x_train = strat_train_set.drop('price', axis=1)
y_train = strat_train_set['price'].copy()

# LINEAR REGRESSION

In [46]:
lr = LinearRegression()

lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [47]:
def array_to_dict(labels,coeffs):
    '''
    This function takes two arrays as arguments and turns those 
    arrays into a dictionary so that it is easier to interpret the 
    coefficients from the sklearn analysis
    '''
    one_list = list(labels)
    two_list = list(coeffs)
    res = {} 
    for key in two_list: 
        for name in one_list: 
            res[key] = name 
            one_list.remove(name) 
            break 
    return res

In [48]:
coefficients = lr.coef_
variables = np.array(x_train.columns)

array_to_dict(coefficients, variables)

{'bedrooms': -10937.658816234009,
 'bathrooms': 44412.10385889842,
 'sqft_living': -265.74087652294196,
 'sqft_lot': -0.12954259567650905,
 'floors': -12968.814187622293,
 'waterfront': 557448.8126088587,
 'view': -5272.704229900541,
 'condition': -57847.195482421834,
 'grade': 2073.9456932816815,
 'sqft_above': -166.96448334054523,
 'sqft_basement': -167.6582408845224,
 'yr_built': -1915.6434218400093,
 'yr_renovated': 7.496789137106107,
 'zipcode': -541.9731820224451,
 'lat': 603129.1341005255,
 'long': -190489.52106929908,
 'sqft_living15': 41.79298453024673,
 'sqft_lot15': -0.027107591777243378,
 'date_year': 40013.374358887275,
 'date_month': 2056.9123015330324,
 'date_day': -209.04386468609113,
 'rel_to_neighbors': -703.9135368012384,
 'size_quality': 34.46745203778245,
 'property_score': -0.22449213185891612,
 'floor_reno_mult': 18.624649799281332,
 'grade_sqft_living': 1.6326530496606029,
 'categorical_proportions': 75.31340047976987}

In [49]:
lr.intercept_

-75272764.7092606

In [50]:
y_pred = lr.predict(x_train)

In [51]:
lr.score(x_train, y_train)

0.7523020991767342

In [52]:
lr_rsme = mean_squared_error(y_train, y_pred)
lr_rmse = np.sqrt(lr_rsme)

rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$184,929.26'