# IMPORTING

In [296]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [297]:
# Restore the variable `data_5` from IPython's %store database into this kernel.
# It is used below as a pandas DataFrame; presumably it was saved by an earlier
# notebook/session with `%store data_5` -- confirm that step has been run first.
%store -r data_5

# Alterations: Engineered

In [298]:
# Start the new stage from an independent copy rather than a second name for the
# same object, so any later in-place change made to `data_6` cannot silently
# alter the restored `data_5` frame.
data_6 = data_5.copy()

### Zip codes

In [299]:
# One-hot encode the `zipcode` column: one indicator column per distinct value,
# with the zip code values themselves as the column labels.
zipcode_dummies = pd.get_dummies(data_6['zipcode'])

In [300]:
# Place the zip code indicator columns to the right of the existing columns.
merged = pd.concat([data_6, zipcode_dummies], axis=1)

In [301]:
# Sanity check: (rows, columns) after appending the zip code indicators.
merged.shape

(17289, 98)

In [302]:
# Remove the raw `zipcode` column (now encoded) together with the 98155
# indicator, which leaves that zip code as the implicit baseline level --
# presumably to avoid perfectly collinear indicator columns in the regression.
data_6 = merged.drop(['zipcode', 98155], axis='columns')

### Grade

In [303]:
# One-hot encode the `grade` column: one indicator column per distinct grade.
grade_dummies = pd.get_dummies(data_6['grade'])

In [304]:
# Place the grade indicator columns to the right of the existing columns,
# then preview the first rows of the widened frame.
merged = pd.concat([data_6, grade_dummies], axis=1)
merged.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,4,5,6,7,8,9,10,11,12,13
0,365000.0,4.0,2.25,2070,9.09302,2.0,0,0,4,8,...,0,0,0,0,1,0,0,0,0,0
1,865000.0,5.0,3.0,2900,8.81433,1.0,0,0,5,8,...,0,0,0,0,1,0,0,0,0,0
2,1038000.0,4.0,2.5,3770,9.295876,2.0,0,2,3,11,...,0,0,0,0,0,0,0,1,0,0
3,1490000.0,3.0,3.5,4560,9.589325,2.0,0,2,3,12,...,0,0,0,0,0,0,0,0,1,0
4,711000.0,3.0,2.5,2550,8.5897,2.0,0,0,3,9,...,0,0,0,0,0,1,0,0,0,0


In [305]:
# Sanity check: (rows, columns) after appending the grade indicators.
merged.shape

(17289, 107)

In [306]:
# Remove the raw `grade` column (now encoded) together with the indicator for
# grade 13, which leaves grade 13 as the implicit baseline level -- presumably
# to avoid perfectly collinear indicator columns in the regression.
data_6 = merged.drop(['grade', 13], axis='columns')

### Yr_Built

In [307]:
# Inspect the distinct values of `yr_built` before encoding the column.
data_6['yr_built'].unique()

array([1986, 1977, 1997, 1990, 2004, 1970, 1987, 2005, 1909, 1930, 1963,
       1968, 2003, 1910, 2001, 1954, 1993, 1969, 1976, 2007, 1983, 1955,
       1981, 1972, 1904, 1942, 1973, 1918, 1961, 1962, 2000, 2013, 2014,
       2009, 1953, 1960, 1994, 1900, 1999, 1949, 1980, 1944, 1921, 1998,
       1925, 1950, 1985, 1974, 1926, 1967, 1975, 1912, 1966, 2006, 1991,
       1959, 1984, 1979, 1964, 1920, 1957, 1995, 1951, 1982, 1988, 1914,
       1917, 1908, 1947, 1971, 2015, 1937, 1958, 2002, 1956, 1938, 1948,
       2008, 2010, 1936, 1907, 2012, 1978, 1941, 1965, 1992, 1927, 1929,
       1903, 1928, 1915, 1923, 1946, 1911, 1996, 1952, 1940, 1916, 1945,
       1943, 1989, 1913, 1906, 1924, 1931, 1922, 1932, 1901, 1905, 2011,
       1939, 1919, 1933, 1935, 1902, 1934])

In [308]:
# One-hot encode `yr_built`: every distinct value becomes its own indicator
# column, so the column is treated as categorical rather than numeric.
built_dummies = pd.get_dummies(data_6['yr_built'])

In [309]:
# Place the yr_built indicator columns to the right of the existing columns,
# then report the resulting (rows, columns).
merged = pd.concat([data_6, built_dummies], axis=1)
merged.shape

(17289, 221)

In [310]:
# Remove the raw `yr_built` column (now encoded) together with the 1951
# indicator, which leaves 1951 as the implicit baseline level -- presumably
# to avoid perfectly collinear indicator columns in the regression.
data_6 = merged.drop(['yr_built', 1951], axis='columns')

# New Dataframe

In [311]:
# Persist the engineered frame in IPython's %store database so another
# notebook/session can reload it with `%store -r data_6`.
%store data_6

Stored 'data_6' (DataFrame)


# Train-Test Split

In [312]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
train_set, test_set = train_test_split(data_6, test_size=0.2, random_state=42)
# Report the size of each partition.
print(len(train_set), "train +", len(test_set), "test")

13831 train + 3458 test


# PREPARE DATA FOR MACHINE LEARNING

In [313]:
# Separate the training rows into the predictor matrix (every column except
# `price`) and the target vector (`price`, copied so it is independent of
# `train_set`).
x_train = train_set.drop(columns=['price'])
y_train = train_set.loc[:, 'price'].copy()

# LINEAR REGRESSION

In [314]:
# Ordinary least-squares model with scikit-learn's default settings.
lr = LinearRegression()

# Fit on the training predictors/target; `fit` returns the estimator, so the
# cell displays its repr.
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [315]:
def array_to_dict(labels, coeffs):
    '''
    Pair two sequences element by element into a dictionary so that the
    coefficients from the sklearn analysis are easier to interpret.

    NOTE: despite the parameter names, the entries of ``coeffs`` become the
    dictionary KEYS and the entries of ``labels`` become the VALUES.
    To get a ``{column_name: coefficient}`` mapping, pass the coefficient
    array first and the column names second.

    Parameters
    ----------
    labels : iterable
        Items used as the dictionary values.
    coeffs : iterable
        Items used as the dictionary keys (must be hashable).

    Returns
    -------
    dict
        ``{coeffs[i]: labels[i]}`` for each position ``i``. If the inputs
        differ in length the extra items of the longer one are ignored; if a
        key repeats, the value from its last occurrence wins.
    '''
    # zip() walks both inputs in lockstep and stops at the shorter one, which
    # is what the former nested loop with list.remove() did, in linear time
    # and without copying or mutating the inputs.
    return dict(zip(coeffs, labels))

In [316]:
# Fitted slope for each predictor, in the same order as the columns of x_train.
coefficients = lr.coef_
variables = np.array(x_train.columns)

# array_to_dict uses its SECOND argument as keys and its FIRST as values, so
# the coefficients go first here to obtain {column_name: coefficient}.
array_to_dict(coefficients, variables)

{'bedrooms': -4381.293656422312,
 'bathrooms': 25874.232199669594,
 'sqft_living': -429.6284800792167,
 'sqft_lot': 39813.780088249034,
 'floors': -92028.0882147824,
 'waterfront': 629903.9042262799,
 'view': -27064.851572409276,
 'condition': -28192.217530420196,
 'sqft_above': 189352.44989315874,
 'sqft_basement': 50.69920800822729,
 'yr_renovated': -16.876947170792846,
 'lat': 12778536.634622756,
 'long': -271477.82557356096,
 'sqft_living15': 25.39069549800479,
 'sqft_lot15': -10523.628342440228,
 'date_year': 39564.248879379724,
 'date_month': 1613.3141834754038,
 'date_day': -102.3555252221156,
 'rel_to_neighbors': -3367.3530716590926,
 'size_quality': 13.823471309784509,
 'property_score': -0.26016674397396855,
 'floor_reno_mult': 28.368731428636238,
 'grade_sqft_living': 14.213809225388104,
 'categorical_proportions': 106.28501000883989,
 98001: -35791.36415089585,
 98002: -2781.1421355873354,
 98003: -52633.40492949772,
 98004: 680121.8610721886,
 98005: 242846.6521427796,
 98

In [317]:
# Intercept (constant term) of the fitted linear model.
lr.intercept_

-162850432.3139968

In [318]:
# Predictions for the same rows the model was fitted on.
y_pred = lr.predict(x_train)
# R^2 on the training data -- an in-sample figure, not a held-out estimate.
lr.score(x_train, y_train)

0.8585814372325556

In [319]:
# Mean squared error of the training-set predictions.
lr_mse = mean_squared_error(y_train, y_pred)
# `lr_rsme` was the original name for this value even though it holds the MSE,
# not the RMSE; it stays bound so any other cell that reads it keeps working.
lr_rsme = lr_mse
# Root mean squared error: same units as the target (`price`).
lr_rmse = np.sqrt(lr_mse)

# Format with a leading '$', thousands separators and two decimals for display.
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$140,647.18'