# IMPORTING

In [100]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols

In [101]:
# Read the training CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='../data/kc_house_data_train.csv')

# Cleaning

In [102]:
# `bedrooms` contains one erroneous value of 33.
# bedrooms is most correlated with sqft_living, so the bad value is replaced with
# the median bedroom count of houses whose sqft_living is between 1000 and 2000.

# Chain the two filters: the upper bound is applied first and the lower bound is
# then applied to that subset, so both limits hold for `new_test`.
test = data[data.sqft_living < 2000]
new_test = test[test.sqft_living > 1000]
new_value = new_test.bedrooms.median()

# Vectorised replacement of the single bad value; every other row is left as is.
data['bedrooms'] = data['bedrooms'].replace(33, new_value)

In [103]:
# Remove the 'Unnamed: 0' column (presumably an index written out with the CSV)
# and the 'id' column before any feature work.
data = data.drop(['Unnamed: 0', 'id'], axis=1)

In [104]:
# `date` is a string whose first eight characters are sliced as YYYY, MM and DD.
# Break it into three integer columns, then discard the original string column.
data = (
    data
    .assign(
        date_year=data['date'].str[:4].astype('int64'),
        date_month=data['date'].str[4:6].astype('int64'),
        date_day=data['date'].str[6:8].astype('int64'),
    )
    .drop(columns='date')
)

# Feature Engineering

### Combining Existing Features

In [106]:
# Interaction feature: living area scaled by the sum of the condition and grade scores.
data['size_quality'] = data['sqft_living'] * (data['condition'] + data['grade'])

### Dummy Features

In [107]:
# One-hot encode `zipcode`: one indicator column per distinct zipcode value,
# labelled with the zipcode itself.
zipcode_dummies = pd.get_dummies(data['zipcode'])

# Attach the indicators next to the existing columns (rows align on the index).
merged = pd.concat([data, zipcode_dummies], axis=1)

# Drop the raw zipcode column and the 98155 indicator
# (presumably kept out as the reference category -- TODO confirm).
data = merged.drop(['zipcode', 98155], axis=1)

# Data Prep

In [116]:
# Separate the predictors from the target column.
x = data.drop(columns='price')

# The zipcode indicator columns carry integer labels while every other column has
# a string label. scikit-learn >= 1.2 raises a TypeError when fitting on a frame
# with mixed-type column names, so normalise all feature labels to strings.
# Only `x` is relabelled; `data` keeps its original column labels.
x.columns = x.columns.astype(str)

# Independent copy of the target so later edits to `data` do not affect it.
y = data['price'].copy()

# Linear Regression

In [128]:
# Fit an ordinary least-squares model on all rows of x / y.
# `fit` returns the estimator itself, so the trailing `lr` displays the same repr.
lr = LinearRegression().fit(x, y)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [129]:
# Predictions and R^2 are computed on the same x / y the model was fitted on,
# so the displayed score is an in-sample (training) figure.
y_pred = lr.predict(X=x)
lr.score(X=x, y=y)

0.8456562612241412

In [130]:
# NOTE: despite its name, `lr_rsme` holds the mean squared error; the root is
# taken on the next line. The name is kept in case later cells refer to it.
lr_rsme = mean_squared_error(y_true=y, y_pred=y_pred)
lr_rmse = np.sqrt(lr_rsme)

# Render the RMSE as a dollar amount with thousands separators and two decimals.
rmse_dollars = '${:,.2f}'.format(lr_rmse)
rmse_dollars

'$146,660.11'