# IMPORTING

In [324]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [325]:
# Restore `data_4` from IPython's %store persistence; presumably it was saved
# by an earlier notebook/session with `%store data_4` — confirm it exists
# before running this notebook on a fresh machine.
%store -r data_4

In [326]:
# Preview the first three rows of the restored frame.
data_4.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_lot15,date_year,date_month,date_day,rel_to_neighbors,size_quality,property_score,floor_reno_mult,grade_sqft_living,categorical_proportions
0,365000.0,4.0,2.25,2070,8893,2.0,0,0,4,8,...,7700,2014,10,6,1.010522,24840,0.0,4140.0,16560,2929.846154
1,865000.0,5.0,3.0,2900,6730,1.0,0,0,5,8,...,6283,2014,8,21,1.147387,37700,0.0,2900.0,23200,4684.615385
2,1038000.0,4.0,2.5,3770,10893,2.0,0,2,3,11,...,9685,2014,8,15,1.070451,52780,5446.5,7540.0,41470,7337.0


# Alterations: Engineered

In [327]:
# Work on an independent copy. A bare `data_5 = data_4` only creates a second
# name for the same DataFrame, so any in-place edit made through `data_5`
# would also rewrite `data_4` (and make its earlier preview stale).
data_5 = data_4.copy()

### Log Transformation

In [328]:
# Columns that get replaced by their natural logarithm (the list name suggests
# they were judged non-normal earlier in the analysis).
# NOTE: np.log gives -inf for 0 and NaN for negative inputs; this assumes all
# four columns are strictly positive — TODO confirm.
non_normal = ['sqft_above','sqft_lot15','sqft_lot','lat']

# Build the transformed frame from `data_4` in one vectorized step instead of
# overwriting columns in place one element at a time. Deriving from `data_4`
# keeps this cell idempotent: re-running it yields the same `data_5` rather
# than taking the log of already-logged values, and `data_4` is left untouched.
data_5 = data_4.assign(**{feat: np.log(data_4[feat]) for feat in non_normal})

In [329]:
# Spot-check the first two rows after the log transformation.
data_5.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_lot15,date_year,date_month,date_day,rel_to_neighbors,size_quality,property_score,floor_reno_mult,grade_sqft_living,categorical_proportions
0,365000.0,4.0,2.25,2070,9.09302,2.0,0,0,4,8,...,8.948976,2014,10,6,1.010522,24840,0.0,4140.0,16560,2929.846154
1,865000.0,5.0,3.0,2900,8.81433,1.0,0,0,5,8,...,8.745603,2014,8,21,1.147387,37700,0.0,2900.0,23200,4684.615385


In [330]:
# Persist `data_5` through IPython's %store so another session/notebook can
# reload it with `%store -r data_5`.
%store data_5

Stored 'data_5' (DataFrame)


# Stratified Split

In [331]:
# Single 80/20 shuffle split, stratified on `grade` so both subsets keep the
# same grade distribution; the fixed seed makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row numbers, so they must be applied with .iloc.
# Using them as labels (.loc / index.intersection) silently drops or
# mis-assigns rows whenever the frame's index is not exactly 0..n-1 (for
# example after rows were filtered out upstream).
# n_splits=1, so take the one and only split directly instead of looping.
train_index, test_index = next(split.split(data_5, data_5['grade']))
strat_train_set = data_5.iloc[train_index]
strat_test_set = data_5.iloc[test_index]

# PREPARE DATA FOR MACHINE LEARNING

In [332]:
# Training target: an independent copy of the `price` column.
y_train = strat_train_set['price'].copy()
# Training predictors: every remaining column of the training split.
x_train = strat_train_set.drop(columns='price')

# LINEAR REGRESSION

In [333]:
# Linear regression estimator created with scikit-learn's default settings.
lr = LinearRegression()

# Fit on every predictor column of the training split; the call is left as the
# cell's last expression so the fitted estimator's repr is displayed.
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [334]:
def array_to_dict(labels,coeffs):
    '''
    Pair two sequences position-by-position into a dictionary, which makes
    the coefficients from an sklearn model easier to read.

    NOTE on argument order: the dictionary KEYS are taken from ``coeffs``
    (the second argument) and the VALUES from ``labels`` (the first
    argument), i.e. the result is ``{coeffs[i]: labels[i]}``. To obtain
    ``{feature name: coefficient}`` call
    ``array_to_dict(coefficient_values, feature_names)``.

    Parameters
    ----------
    labels : iterable
        Supplies the dictionary values.
    coeffs : iterable of hashable
        Supplies the dictionary keys.

    Returns
    -------
    dict
        One entry per position, in input order. If the inputs differ in
        length the extra items of the longer one are ignored; if a key
        repeats, the last pairing wins.
    '''
    # zip pairs items positionally and stops at the shorter input, which is
    # what the previous nested loop did, without the repeated list.remove().
    return dict(zip(coeffs, labels))

In [335]:
# Coefficients of the fitted model and the training column names they belong to.
coefficients = lr.coef_
variables = np.array(x_train.columns)

# array_to_dict takes its keys from the SECOND argument and its values from the
# first, so passing (coefficients, variables) yields {column name: coefficient}.
array_to_dict(coefficients, variables)

{'bedrooms': -12332.872186153329,
 'bathrooms': 39703.93400505043,
 'sqft_living': -477.05120147325624,
 'sqft_lot': -13136.154962142624,
 'floors': -32149.65543910963,
 'waterfront': 561000.030847048,
 'view': -10097.30750518219,
 'condition': -36549.67183694116,
 'grade': -9187.34980233177,
 'sqft_above': 113438.47275785706,
 'sqft_basement': 47.26808465838967,
 'yr_built': -1961.8099971807198,
 'yr_renovated': 7.116854511486331,
 'zipcode': -537.6735652079752,
 'lat': 28297983.769401487,
 'long': -163987.3410473642,
 'sqft_living15': 47.86261653071176,
 'sqft_lot15': -14119.044303585528,
 'date_year': 38933.89334777798,
 'date_month': 1960.0056313075925,
 'date_day': -232.34634320233218,
 'rel_to_neighbors': 872.3800309420167,
 'size_quality': 22.74537887129175,
 'property_score': -0.11829441411843004,
 'floor_reno_mult': 18.97828083413161,
 'grade_sqft_living': 17.145557195191373,
 'categorical_proportions': 80.84638553862295}

In [336]:
# Intercept (constant term) of the fitted linear model.
lr.intercept_

-151262613.2007357

In [337]:
# Predictions for the same rows the model was fitted on.
y_pred = lr.predict(x_train)
# score() reports R^2; because it is evaluated on the training data this is an
# in-sample figure, not an estimate of performance on unseen data.
lr.score(x_train, y_train)

0.7544497430864989

In [338]:
# NOTE: despite its name, `lr_rsme` holds the mean squared error (no root yet);
# the name is kept unchanged in case other cells reference it.
lr_rsme = mean_squared_error(y_train, y_pred)
# Root of the MSE gives the RMSE, in the same units as `price`.
lr_rmse = np.sqrt(lr_rsme)

# Format with a dollar sign, thousands separators and two decimals for display.
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$184,125.80'