# IMPORTING

In [99]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn.preprocessing import MinMaxScaler
from scipy import stats 
from sklearn.preprocessing import StandardScaler

# Load the training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../kc_house_data_train.csv')
# Preview the first two rows to check the columns that were read.
data.head(2)

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,1,7974200820,20140821T000000,865000.0,5,3.0,2900,6730,1.0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283


In [100]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data, random_state=42, test_size=0.2)
print(f"{len(train_set)} train + {len(test_set)} test")

13832 train + 3458 test


# PREPARE DATA FOR MACHINE LEARNING

## Data Alterations

Dropped Columns: 'Unnamed: 0'; 'id'; 'sqft_lot15'; 'sqft_above'; 'sqft_living15'

Date Column: made into three separate columns (date_year, date_month, date_day)

Bedroom Column: values above 6 are capped at 6 (x > 6 → 6)

Bathroom Column: made into three bins (0: x ≤ 1.5, 1: 1.5 < x ≤ 2.5, 2: x > 2.5)

New Columns: 
 - sqft_above_to_living15 = sqft_above / sqft_living15
 - sqft_lot_bigger = 1 if sqft_lot > sqft_lot15 else 0


In [101]:
# Separate the target ('price') from the predictor columns of the training split.
y_train = train_set['price'].copy()
x_train = train_set.drop(columns='price')

#### Column Modification

In [102]:
# Work on a copy: the original aliased x_train and mutated it in place, so
# re-running this cell failed once 'date' had been dropped.
prep_x = x_train.copy()

# 'date' holds strings such as '20141006T000000'; the first eight characters
# are YYYYMMDD, sliced here into three integer columns.
prep_x['date_year'] = prep_x['date'].str[:4].astype('int64')
prep_x['date_month'] = prep_x['date'].str[4:6].astype('int64')
prep_x['date_day'] = prep_x['date'].str[6:8].astype('int64')
prep_x = prep_x.drop(columns='date')

# Ratio of above-ground area to the sqft_living15 column.
prep_x['sqft_above_to_living15'] = prep_x['sqft_above'] / prep_x['sqft_living15']
# 1 when the lot is larger than sqft_lot15, else 0.
prep_x['sqft_lot_bigger'] = (prep_x['sqft_lot'] > prep_x['sqft_lot15']).astype('int64')

prep_x.head(1)

Unnamed: 0.1,Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,date_year,date_month,date_day,sqft_above_to_living15,sqft_lot_bigger
2498,2498,1776460190,3,2.5,2370,5353,2.0,0,0,3,...,98019,47.7333,-121.975,2130,6850,2014,6,26,1.112676,0


In [103]:
# Cap bedroom counts at 6 so every larger value shares the top level.
prep_x['bedrooms'] = prep_x['bedrooms'].clip(upper=6)

In [104]:
# Distribution of bedroom counts after values above 6 were capped at 6.
prep_x.bedrooms.value_counts()

3    6244
4    4388
2    1788
5    1036
6     236
1     133
0       7
Name: bedrooms, dtype: int64

In [105]:
# Preview the first two rows of the working feature frame.
prep_x.head(2)

Unnamed: 0.1,Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,date_year,date_month,date_day,sqft_above_to_living15,sqft_lot_bigger
2498,2498,1776460190,3,2.5,2370,5353,2.0,0,0,3,...,98019,47.7333,-121.975,2130,6850,2014,6,26,1.112676,0
10932,10932,2144800215,4,1.75,2080,13629,1.0,0,0,4,...,98178,47.4866,-122.232,1780,14659,2014,5,19,0.58427,0


In [106]:
def series_quartile(x, lower=1.5, upper=2.5):
    '''
    Assign a numeric value to one of three ordinal bins.

    Parameters
    ----------
    x : number
        Value to bin (the notebook applies this to the bathroom count).
    lower : number, default 1.5
        Inclusive upper edge of bin 0.
    upper : number, default 2.5
        Inclusive upper edge of bin 1.

    Returns
    -------
    int
        0 if x <= lower, 1 if lower < x <= upper, otherwise 2.
        NaN fails both comparisons and therefore falls into bin 2.
    '''
    if x <= lower:
        return 0
    # Reaching this point already implies x > lower (or x is NaN).
    if x <= upper:
        return 1
    return 2

# Replace each bathroom count with its bin code (0, 1 or 2).
prep_x['bathrooms'] = prep_x['bathrooms'].apply(series_quartile)

In [107]:
# Preview the first two rows; 'bathrooms' now holds bin codes instead of raw counts.
prep_x.head(2)

Unnamed: 0.1,Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,date_year,date_month,date_day,sqft_above_to_living15,sqft_lot_bigger
2498,2498,1776460190,3,1,2370,5353,2.0,0,0,3,...,98019,47.7333,-121.975,2130,6850,2014,6,26,1.112676,0
10932,10932,2144800215,4,1,2080,13629,1.0,0,0,4,...,98178,47.4866,-122.232,1780,14659,2014,5,19,0.58427,0


#### Dropping Columns

In [108]:
# Remove 'Unnamed: 0' and 'id', plus the three raw sqft columns that were only
# needed to build the engineered features.
prep_x = prep_x.drop(
    labels=['Unnamed: 0', 'id', 'sqft_lot15', 'sqft_above', 'sqft_living15'],
    axis='columns',
)

prep_x.head(2)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,date_year,date_month,date_day,sqft_above_to_living15,sqft_lot_bigger
2498,3,1,2370,5353,2.0,0,0,3,8,0,2009,0,98019,47.7333,-121.975,2014,6,26,1.112676,0
10932,4,1,2080,13629,1.0,0,0,4,7,1040,1955,0,98178,47.4866,-122.232,2014,5,19,0.58427,0


# LINEAR REGRESSION

In [109]:
# From here on x_train refers to the engineered feature frame.
x_train = prep_x

# Fit a linear model on every engineered column. fit() returns the estimator,
# so leaving the call as the last expression displays the fitted model.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [110]:
def array_to_dict(labels,coeffs):
    '''
    Pair two sequences element by element into a dictionary, so that
    the coefficients from the sklearn analysis are easier to interpret.

    NOTE on argument order: the keys are taken from the SECOND argument
    (`coeffs`) and the values from the FIRST (`labels`), i.e. the result
    is {coeffs[i]: labels[i]}. The parameter names are the reverse of
    this behaviour, which is kept unchanged for backward compatibility.
    To obtain {feature_name: coefficient}, call
    array_to_dict(coefficient_array, name_array).

    If the sequences differ in length, the surplus items of the longer
    one are ignored. If a key repeats, the last paired value wins.
    '''
    # zip gives the same positional pairing as the former nested
    # loop/remove/break construction, without its quadratic list removals.
    return dict(zip(coeffs, labels))

In [111]:
# Fitted weights, one per column of prep_x, in column order.
coefficients = lr.coef_
variables = np.array(prep_x.columns)

# array_to_dict uses its SECOND argument as the keys, so the names go second
# to produce {feature_name: coefficient}.
array_to_dict(coefficients, variables)

{'bedrooms': -34745.68215351098,
 'bathrooms': 165.2894263320794,
 'sqft_living': 233.14180573915888,
 'sqft_lot': -0.022219952180359128,
 'floors': 20707.54814120738,
 'waterfront': 579726.5906951467,
 'view': 52059.1406737313,
 'condition': 32499.502969490943,
 'grade': 94945.61272968407,
 'sqft_basement': -53.57831344349128,
 'yr_built': -2309.457689280643,
 'yr_renovated': 26.53692431641653,
 'zipcode': -576.0557833368589,
 'lat': 604047.255586731,
 'long': -235894.88745034905,
 'date_year': 38989.84302137165,
 'date_month': 1468.1906833189703,
 'date_day': -222.84618818824805,
 'sqft_above_to_living15': -67177.00331504074,
 'sqft_lot_bigger': -10098.966345031637}

In [112]:
# Intercept term of the fitted linear model.
lr.intercept_

-75687739.13135213

In [113]:
# Predictions for the same rows the model was fit on (in-sample).
y_pred = lr.predict(x_train)

In [114]:
# R^2 on the training data: an in-sample fit measure, not a held-out score.
lr.score(x_train, y_train)

0.6980168325109959

In [115]:
# mean_squared_error returns the MSE. The former name `lr_rsme` was a typo that
# read like "RMSE", so the value now lives in `lr_mse`.
lr_mse = mean_squared_error(y_train, y_pred)
lr_rsme = lr_mse  # legacy alias, kept in case a later cell still reads the old name
lr_rmse = np.sqrt(lr_mse)

# In-sample (training) RMSE formatted as a dollar amount.
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$206,287.28'