In [23]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [24]:
%store -r data_final

In [25]:
train_set, test_set = train_test_split(data_final, test_size=0.2, random_state=42)
print(len(train_set), "train +", len(test_set), "test")

13832 train + 3458 test


In [26]:
x_train = train_set.drop('price', axis=1)
y_train = train_set['price'].copy()

In [27]:
lr = LinearRegression()

lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
def array_to_dict(labels,coeffs):
    '''
    This function takes two arrays as arguments and turns those 
    arrays into a dictionary so that it is easier to interpret the 
    coefficients from the sklearn analysis
    '''
    one_list = list(labels)
    two_list = list(coeffs)
    res = {} 
    for key in two_list: 
        for name in one_list: 
            res[key] = name 
            one_list.remove(name) 
            break 
    return res

In [29]:
coefficients = lr.coef_
variables = np.array(x_train.columns)

array_to_dict(coefficients, variables)

{'bedrooms': -1774.1171670103868,
 'bathrooms': 28702.766584588047,
 'sqft_living': -899.9109102513144,
 'sqft_lot': 0.39015360756430084,
 'floors': -30647.0344046044,
 'waterfront': 646659.3260565281,
 'view': -19627.732872141114,
 'condition': -24699.94924707371,
 'grade': -27469.80331208996,
 'sqft_above': 426.6555029657011,
 'sqft_basement': 361.3323036755173,
 'yr_built': -203.30047037103645,
 'yr_renovated': -71.28698257576252,
 'lat': 217204.95521689422,
 'long': -217481.61028434665,
 'sqft_living15': 31.595565310675738,
 'sqft_lot15': 0.06617243640357628,
 'rel_to_neighbors': 5555.99133395878,
 'size_quality': 10.485062828316586,
 'property_score': -1.8379951164824888,
 'reno_mult': 93.10337830104982,
 'grade_sqft_living': 19.285554793808842,
 'categorical_proportions': 109.04236508613394,
 98001: -39204.54620011344,
 98002: -20057.89373061308,
 98003: -72024.51556517035,
 98004: 656162.5226957311,
 98005: 226381.25623023737,
 98006: 166097.89651896033,
 98007: 170762.950722857

In [30]:
lr.intercept_

-118115479.10400708

In [31]:
y_pred = lr.predict(x_train)
np.exp(lr.score(x_train, y_train))

2.340699005601468

In [32]:
lr_rsme = mean_squared_error(y_train, y_pred)
lr_rmse = np.sqrt(lr_rsme)

rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$145,169.32'