In [59]:
# bring in libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import sklearn
import joblib
import time

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge

%matplotlib inline 

import warnings 
warnings.filterwarnings("ignore")

In [2]:
filename = "https://library.startlearninglabs.uw.edu/DATASCI410/Datasets/kc_house_data.csv"

df = pd.read_csv(filename)

In [3]:
# No nulls

In [6]:
# outlier removal
numeric = ['price',
 'sqft_living',
 'sqft_lot',
 'sqft_above',
 'sqft_basement',
 'sqft_living15',
 'sqft_lot15']

def remove_outliers(df, col): 
    high = np.mean(df[col]) + 3 * np.std(df[col])
    low = np.mean(df[col]) - 3 * np.std(df[col])
    flag = (df.loc[:,col] < high) & (df.loc[:,col] > low)
    df.loc[~flag, col] = np.float('NaN') 
    return df 

# cleanse the outliers in the above numeric columns
for col in numeric:
    df = remove_outliers(df, col)

dfa = df.dropna()

In [7]:
# Re-Binning
for i in range(1,5):
    dfa.loc[dfa.loc[:,'grade'] == i, 'grade'] = 0

for i in range(5,9):
    dfa.loc[dfa.loc[:,'grade'] == i, 'grade'] = 1

for i in range(9,14):
    dfa.loc[dfa.loc[:,'grade'] == i, 'grade'] = 2

In [8]:
##### Feature extraction #####
# fix data
def fix_date(date):
    import re
    clean_date = re.sub('T000000', '', date)
    y = clean_date[:4]
    m = clean_date[4:6]
    d = clean_date[6:]
    return y + '-' + m + '-' + d

dfa['date'] = dfa['date'].apply(lambda x: fix_date(x))

# feature extraction: date 
dfa['year'] = dfa['date'].astype(str).str[:4]
dfa['month'] = dfa['date'].astype(str).str[5:7]
dfa['day'] = dfa['date'].astype(str).str[8:10]

In [11]:
# combine lat and long 
pca = PCA(n_components=1)
pc1 = pca.fit_transform(dfa[['long', 'lat']])

dfa['location'] = pc1

In [111]:
X = dfa[[
    'bedrooms',
    'bathrooms',
    'waterfront',
    'view',
    'grade',
    'lat',
    'sqft_living'
]]

X2 = dfa[['sqft_above', 
    'sqft_living15', 
    'sqft_living', 
    'sqft_basement', 
    'bedrooms', 
    'bathrooms', 
    'waterfront', 
    'floors', 
    'lat', 
    'view', 
    'grade']]

y = dfa['price'].to_numpy()

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, # features
                                                    y, # response 
                                                    test_size = 0.8, # 80/20 Split; Training 80%, Testing 20% 
                                                    random_state = 42) # seeding

In [113]:
linreg = LinearRegression()

linreg.fit(X_train, y_train)

LinearRegression()

In [114]:
preds = linreg.predict(X_test)
r2_score(y_test, preds)

0.6025747157300844

In [115]:
model_v1 = Pipeline([
    ('Scaler', StandardScaler()),
    ('Polynomial Features', PolynomialFeatures(include_bias=False)),
    ('Linear Regression', LinearRegression())
])

model_v1.fit(X_train, y_train)

Pipeline(steps=[('Scaler', StandardScaler()),
                ('Polynomial Features', PolynomialFeatures(include_bias=False)),
                ('Linear Regression', LinearRegression())])

In [116]:
preds_v1 = model_v1.predict(X_test)
r2_score(y_test, preds_v1)

0.6250382610418221

In [117]:
model_v2 = Pipeline([
    ('Scaler', StandardScaler()),
    ('Polynomial Features', PolynomialFeatures(include_bias=False)),
    ('Ridge', Ridge(alpha=50))
])

model_v2.fit(X_train, y_train)

Pipeline(steps=[('Scaler', StandardScaler()),
                ('Polynomial Features', PolynomialFeatures(include_bias=False)),
                ('Ridge', Ridge(alpha=50))])

In [118]:
preds_v2 = model_v2.predict(X_test)
r2_score(y_test, preds_v2)

0.6252064165343844

In [119]:
joblib.dump(model_v2, 'model_v2.joblib', compress=False)
print('model_v2.joblib saved')

model_v2.joblib saved
