# IMPORTING

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../kc_house_data_train.csv')

# TRAIN-TEST SPLIT

In [57]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
print(len(train_set), "train +", len(test_set), "test")

13832 train + 3458 test


# PREPARE DATA FOR MACHINE LEARNING

## Data Alterations

Dropped columns: 'Unnamed: 0' and 'id'

Date column: made into three separate columns (date_year, date_month, date_day)

In [13]:
x_train = train_set.drop('price', axis=1)
y_train = train_set['price'].copy()

In [14]:
prep_x = x_train

In [15]:
prep_x['date_year'] = prep_x.date.apply(lambda x: int(x[:4]))
prep_x['date_month'] = prep_x.date.apply(lambda x: int(x[4:6]))
prep_x['date_day'] = prep_x.date.apply(lambda x: int(x[6:8]))
prep_x.drop('date',axis=1, inplace=True)

In [16]:
prep_x.drop(columns = ['Unnamed: 0','id'], inplace=True)

In [17]:
prep_x.head(1)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date_year,date_month,date_day
2498,3,2.5,2370,5353,2.0,0,0,3,8,2370,...,2009,0,98019,47.7333,-121.975,2130,6850,2014,6,26


In [18]:
x_train = prep_x

# LINEAR REGRESSION

In [19]:
lr = LinearRegression()

lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
def array_to_dict(labels,coeffs):
    '''
    This function takes two arrays as arguments and turns those 
    arrays into a dictionary so that it is easier to interpret the 
    coefficients from the sklearn analysis
    '''
    one_list = list(labels)
    two_list = list(coeffs)
    res = {} 
    for key in two_list: 
        for name in one_list: 
            res[key] = name 
            one_list.remove(name) 
            break 
    return res

In [49]:
coefficients = lr.coef_
variables = np.array(data.columns)

array_to_dict(coefficients, variables)

{'Unnamed: 0': -35442.50643934974,
 'id': 42064.455976336205,
 'date': 114.98278805200471,
 'price': 0.12087656671440433,
 'bedrooms': 5719.5501495191,
 'bathrooms': 586618.3336714575,
 'sqft_living': 52077.201164474085,
 'sqft_lot': 29708.378612860455,
 'floors': 95394.38313534975,
 'waterfront': 74.42298247012005,
 'view': 40.55980539012618,
 'condition': -2642.624622718328,
 'grade': 18.709223856838122,
 'sqft_above': -567.500254358162,
 'sqft_basement': 604818.9798871927,
 'yr_built': -219199.77061419416,
 'yr_renovated': 19.000290992067022,
 'zipcode': -0.3500956846880697,
 'lat': 39051.915520845156,
 'long': 1506.5657036343177,
 'sqft_living15': -246.47685466137617}

In [50]:
lr.intercept_

-74063478.96691272

In [52]:
y_pred = lr.predict(x_train)

In [53]:
lr.score(x_train, y_train)

0.6986078692370692

In [55]:
lr_rsme = mean_squared_error(y_train, y_pred)
lr_rmse = np.sqrt(lr_rsme)

In [56]:
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$206,085.31'