# IMPORTING

In [276]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [277]:
# Reload the variable `data` from IPython's %store persistence
# (presumably the cleaned DataFrame saved by an earlier notebook -- confirm).
%store -r data

# ALTERATIONS: ENGINEERED FEATURES

In [278]:
# Work on an independent copy: a plain `data_2 = data` would only alias the
# same object, so every engineered column added below would also be written
# into `data`, and re-running cells would no longer be idempotent.
data_2 = data.copy()

In [262]:
# Size relative to the neighbors: the mean of the two ratios
# sqft_living / sqft_living15 and sqft_lot / sqft_lot15.
data_2['rel_to_neighbors'] = (
    data_2['sqft_living'].div(data_2['sqft_living15'])
    .add(data_2['sqft_lot'].div(data_2['sqft_lot15']))
    .div(2)
)

In [263]:
# Use condition and grade together as a quality multiplier for sqft_living.
# Every operand is read from data_2 (the original mixed in `data.sqft_living`),
# so the result no longer relies on `data` sharing data_2's rows and index.
data_2['size_quality'] = (data_2.condition + data_2.grade) * data_2.sqft_living

In [279]:
# Whole-property score: sqft_lot scaled by (view / 4) and by (waterfront + 1).
# Note that a view value of 0 zeroes the score regardless of lot size.
data_2['property_score'] = (
    data_2['view'].div(4)
    .mul(data_2['waterfront'].add(1))
    .mul(data_2['sqft_lot'])
)

# NEW DATAFRAME

In [280]:
# Persist the feature-engineered frame with IPython's %store so another
# session/notebook can restore it via `%store -r data_2`.
%store data_2

Stored 'data_2' (DataFrame)


# TRAIN-TEST SPLIT

In [281]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data_2, random_state=42, test_size=0.2)
# Report how many rows landed in each partition.
print(train_set.shape[0], "train +", test_set.shape[0], "test")

13832 train + 3458 test


# PREPARE DATA FOR MACHINE LEARNING

In [282]:
# Target: an independent copy of the price column.
y_train = train_set.loc[:, 'price'].copy()
# Features: every remaining column of the training partition.
x_train = train_set.drop(columns='price')

# LINEAR REGRESSION

In [283]:
# Ordinary least-squares model with scikit-learn's default settings.
lr = LinearRegression()

# Fit on the training partition; left as the cell's last expression so the
# fitted estimator is echoed as the cell output.
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [284]:
def array_to_dict(labels, coeffs):
    '''
    Pair two sequences position by position into a dictionary, so that
    the coefficients from an sklearn model are easier to interpret.

    NOTE: the parameter names are historical and read backwards. The
    items of ``coeffs`` (the SECOND argument) become the dictionary
    keys and the items of ``labels`` (the FIRST argument) become the
    values. Call it as ``array_to_dict(values, keys)``.

    Parameters
    ----------
    labels : iterable
        Items used as the dictionary values.
    coeffs : iterable of hashable items
        Items used as the dictionary keys.

    Returns
    -------
    dict
        Maps the i-th item of ``coeffs`` to the i-th item of ``labels``.
        If the inputs differ in length, the surplus items of the longer
        one are ignored. A repeated key keeps the value paired with its
        last occurrence.
    '''
    # zip walks both inputs once in lockstep; this replaces the former
    # nested loop whose list.remove() call made the pairing quadratic.
    return dict(zip(coeffs, labels))

In [285]:
# Feature names, in the same column order the model was fitted on.
variables = np.array(x_train.columns)
# One fitted weight per feature column.
coefficients = lr.coef_
# array_to_dict uses its SECOND argument as the keys, so the values
# (coefficients) are passed first and the column names second.
array_to_dict(coefficients, variables)

{'bedrooms': -7351.875710535611,
 'bathrooms': 50077.72555127944,
 'sqft_living': -329.66042830759926,
 'sqft_lot': 0.28483242115045715,
 'floors': 19859.552404435723,
 'waterfront': 571149.0116614584,
 'view': 42246.568272724435,
 'condition': -60057.69114822505,
 'grade': -23282.93630350926,
 'sqft_above': -160.53559862584106,
 'sqft_basement': -169.1248291953024,
 'yr_built': -1783.8406358729208,
 'yr_renovated': 39.65176364088505,
 'zipcode': 1.8917489796876907e-10,
 'lat': 570191.5467505864,
 'long': -98447.63063613555,
 'sqft_living15': 33.05871610626503,
 'sqft_lot15': -0.4489345552116629,
 'date_year': 41130.13920026216,
 'date_month': 2047.999824991405,
 'date_day': -205.1566296059238,
 'rel_to_neighbors': -11068.906543457408,
 'size_quality': 53.1426622207279,
 'property_score': -0.2946456737457666}

In [286]:
# Intercept (constant term) of the fitted linear model.
lr.intercept_

-117993748.58028573

In [287]:
# In-sample predictions: the model is evaluated on the same x_train it was
# fitted on, not on the held-out test_set.
y_pred = lr.predict(x_train)

In [288]:
# R^2 (coefficient of determination) on the training data; being in-sample,
# it is likely optimistic compared with held-out performance.
lr.score(x_train, y_train)

0.7451961594318894

In [289]:
# Mean squared error of the in-sample predictions (units: price squared).
lr_mse = mean_squared_error(y_train, y_pred)
# Backward-compatible alias: this value used to live under the misspelled,
# misleading name `lr_rsme` even though it is the MSE, not the RMSE.
lr_rsme = lr_mse
# Root mean squared error, back in the units of `price`.
lr_rmse = np.sqrt(lr_mse)

In [290]:
# Format the training-set RMSE as a dollar string with thousands separators
# and two decimals, then display it as the cell output.
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$189,489.02'