In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats  
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.api as sms
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.dummy import DummyRegressor
# clean up 

# Import Data and Clean

In [2]:
data = pd.read_csv("data/kc_house_data.csv")
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [3]:
# linear regression prep
df=data.drop(['id','date'], axis=1).copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21597 non-null  float64
 1   bedrooms       21597 non-null  int64  
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  int64  
 4   sqft_lot       21597 non-null  int64  
 5   floors         21597 non-null  float64
 6   waterfront     19221 non-null  float64
 7   view           21534 non-null  float64
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  sqft_above     21597 non-null  int64  
 11  sqft_basement  21597 non-null  object 
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17755 non-null  float64
 14  zipcode        21597 non-null  int64  
 15  lat            21597 non-null  float64
 16  long           21597 non-null  float64
 17  sqft_living15  21597 non-null  int64  
 18  sqft_l

In [4]:
df.isna().sum()

price               0
bedrooms            0
bathrooms           0
sqft_living         0
sqft_lot            0
floors              0
waterfront       2376
view               63
condition           0
grade               0
sqft_above          0
sqft_basement       0
yr_built            0
yr_renovated     3842
zipcode             0
lat                 0
long                0
sqft_living15       0
sqft_lot15          0
dtype: int64

In [5]:
# replaced NaN's with 0
df.waterfront.fillna(value=0, inplace = True)
df.view.fillna(value=0, inplace = True)
df.yr_renovated.fillna(value=0, inplace = True)

In [6]:
# convert from object to float and replace '?'
df['sqft_basement'] = df['sqft_basement'].replace('?','0').astype(float)

In [7]:
# dropped outlier
df=df.drop(df[df['bedrooms']==33].index).copy()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21596 entries, 0 to 21596
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21596 non-null  float64
 1   bedrooms       21596 non-null  int64  
 2   bathrooms      21596 non-null  float64
 3   sqft_living    21596 non-null  int64  
 4   sqft_lot       21596 non-null  int64  
 5   floors         21596 non-null  float64
 6   waterfront     21596 non-null  float64
 7   view           21596 non-null  float64
 8   condition      21596 non-null  int64  
 9   grade          21596 non-null  int64  
 10  sqft_above     21596 non-null  int64  
 11  sqft_basement  21596 non-null  float64
 12  yr_built       21596 non-null  int64  
 13  yr_renovated   21596 non-null  float64
 14  zipcode        21596 non-null  int64  
 15  lat            21596 non-null  float64
 16  long           21596 non-null  float64
 17  sqft_living15  21596 non-null  int64  
 18  sqft_l

# Regression Modelling

In [9]:
x=df.drop('price', axis=1).copy()
y=df['price']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

first_feature = 'sqft_living'

In [10]:
# 2nd attempt
# Baseline Model

baseline_model = LinearRegression()

baseline_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature]],
    y=y_train,
    return_train_score=True,
)

print("Train score:", baseline_scores["train_score"].mean())
print("Validation score:", baseline_scores["test_score"].mean())

Train score: 0.49246148492778125
Validation score: 0.49036568409166126


In [11]:
# 2nd attempt
# Test Baseline Model

baseline_model = LinearRegression()

baseline_scores = cross_validate(
    estimator=baseline_model,
    X=X_test[[first_feature]],
    y=y_test,
    return_train_score=True,
)

print("Train score:", baseline_scores["train_score"].mean())
print("Validation score:", baseline_scores["test_score"].mean())

Train score: 0.49386577337789894
Validation score: 0.4861124739468904


In [12]:
# 2nd attempt
# Cross-Validate

mse = make_scorer(mean_squared_error)

cv_5_results = cross_val_score(baseline_model, X_train, y_train, cv=5, scoring=mse)

cv_5_results

array([4.56688197e+10, 4.17201493e+10, 4.21526212e+10, 3.98921574e+10,
       3.55408993e+10])

In [20]:
sm.OLS(y, sm.add_constant(x)).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.701
Model:,OLS,Adj. R-squared:,0.7
Method:,Least Squares,F-statistic:,2805.0
Date:,"Thu, 15 Jul 2021",Prob (F-statistic):,0.0
Time:,08:21:29,Log-Likelihood:,-294350.0
No. Observations:,21596,AIC:,588700.0
Df Residuals:,21577,BIC:,588900.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.016e+06,2.93e+06,2.396,0.017,1.28e+06,1.28e+07
bedrooms,-3.927e+04,1978.769,-19.846,0.000,-4.31e+04,-3.54e+04
bathrooms,4.271e+04,3264.966,13.082,0.000,3.63e+04,4.91e+04
sqft_living,105.2067,18.065,5.824,0.000,69.797,140.616
sqft_lot,0.1257,0.048,2.624,0.009,0.032,0.220
floors,6907.6478,3597.715,1.920,0.055,-144.140,1.4e+04
waterfront,6.167e+05,1.81e+04,34.031,0.000,5.81e+05,6.52e+05
view,5.302e+04,2120.755,25.001,0.000,4.89e+04,5.72e+04
condition,2.677e+04,2344.542,11.417,0.000,2.22e+04,3.14e+04

0,1,2,3
Omnibus:,18265.348,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1809935.792
Skew:,3.541,Prob(JB):,0.0
Kurtosis:,47.286,Cond. No.,215000000.0


In [16]:
# OLD WAY

In [13]:
# basic model with sqft_living
x=df.drop('price', axis=1).copy()
y=df['price']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


first_feature = 'sqft_living'
second_feature = 'bedrooms'
third_feature = 'waterfront'
fourth_feature = 'grade'
fifth_feature = 'yr_built'
sixth_feature = 'zipcode'
seventh_feature = 'bathrooms'
eighth_feature = 'sqft_lot'
ninth_feature = 'zipcode'

In [14]:
# before dummying
# cross val, run model w test data, train, test, and cross val score
baseline_model = LinearRegression()


baseline_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", baseline_scores["train_score"].mean())
print("Validation score:", baseline_scores["test_score"].mean())


NameError: name 'splitter' is not defined

In [None]:
second_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

second_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", second_scores["train_score"].mean())
print("Validation score:", second_scores["test_score"].mean())

In [None]:
third_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

third_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature, third_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", third_scores["train_score"].mean())
print("Validation score:", third_scores["test_score"].mean())

In [None]:
fourth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

fourth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature, third_feature, fourth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", fourth_scores["train_score"].mean())
print("Validation score:", fourth_scores["test_score"].mean())

In [None]:
fourth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

fourth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature, third_feature, fourth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", fourth_scores["train_score"].mean())
print("Validation score:", fourth_scores["test_score"].mean())

In [None]:
fifth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

fifth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature, third_feature, fourth_feature, fifth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", fifth_scores["train_score"].mean())
print("Validation score:", fifth_scores["test_score"].mean())

In [None]:
sixth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

sixth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, second_feature, third_feature, fourth_feature, fifth_feature, sixth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", sixth_scores["train_score"].mean())
print("Validation score:", sixth_scores["test_score"].mean())

In [None]:
seventh_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

seventh_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, 
               second_feature, 
               third_feature, 
               fourth_feature, 
               fifth_feature, 
               sixth_feature, 
               seventh_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", seventh_scores["train_score"].mean())
print("Validation score:", seventh_scores["test_score"].mean())

In [None]:
eighth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

eighth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, 
               second_feature, 
               third_feature, 
               fourth_feature, 
               fifth_feature, 
               sixth_feature, 
               seventh_feature, 
               eighth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", eighth_scores["train_score"].mean())
print("Validation score:", eighth_scores["test_score"].mean())

In [None]:
ninth_model = LinearRegression()

splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

ninth_scores = cross_validate(
    estimator=baseline_model,
    X=X_train[[first_feature, 
               second_feature, 
               third_feature, 
               fourth_feature, 
               fifth_feature, 
               sixth_feature, 
               seventh_feature, 
               eighth_feature, 
               ninth_feature]],
    y=y_train,
    return_train_score=True,
    cv=splitter
)

print("Train score:     ", ninth_scores["train_score"].mean())
print("Validation score:", ninth_scores["test_score"].mean())