In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from matplotlib import pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Apply seaborn's theme with the 'magma_r' palette to every plot in the notebook.
sns.set_theme(palette='magma_r')

# Raise the pandas display limit so up to 100 rows are rendered before truncation.
pd.set_option('display.max_rows', 100) # Allows Jupyter Notebook to expand how much data is shown.

In [None]:
# Load the raw housing data (path is relative to the notebook's working directory)
# and list each column's dtype and non-null count.
df = pd.read_csv('data/kc_house_data.csv')
df.info()

In [None]:
# Summary statistics rendered in fixed-point notation rather than scientific notation.
df.describe().apply(lambda column: column.map(lambda value: format(value, 'f')))

In [None]:
# Scatter price against six candidate predictors, one panel each on a 3x2 grid.
fig, axs = plt.subplots(3, 2, figsize=(15,10))
scatter_columns = ['sqft_living', 'sqft_above', 'sqft_living15',
                   'bathrooms', 'bedrooms', 'lat']
# axs.flat walks the grid row by row, matching the listed column order.
for panel, column in zip(axs.flat, scatter_columns):
    panel.scatter(df[column], df['price'])
    panel.set_title(column)
fig.tight_layout();

In [None]:
# Pairwise correlations between the numeric columns.
# The frame still contains text columns at this point; recent pandas versions no
# longer drop them silently inside corr(), so restrict to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Absolute correlation of every numeric column with price, strongest first.
# Non-numeric columns are excluded explicitly because corr() on text columns
# raises in recent pandas versions.
price_corr = (
    df.select_dtypes(include='number')
    .corr()['price']
    .abs()
    .sort_values(ascending=False)
)
price_corr

In [None]:
# Heatmap of the numeric-column correlations, with the colour scale centred on 0.
# Text columns are excluded explicitly; corr() no longer drops them silently.
sns.heatmap(df.select_dtypes(include='number').corr(), center=0);

# Data cleaning

In [None]:
def determine_dupes(series):
    """Report how many values in ``series`` occur more than once.

    Prints the number of distinct values that are repeated and the total
    number of rows those repeated values account for.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.

    Returns
    -------
    pandas.Series
        The value counts of ``series`` (most frequent first).
    """
    counts = series.value_counts()
    # Keep only the values that appear at least twice.
    repeated = counts[counts > 1]
    print("Amount of unique duplicates: " + str(len(repeated)))
    print("Total amount of duplicates: " + str(repeated.sum()))

    return counts

In [None]:
# Check whether any listing id appears more than once.
determine_dupes(df['id'])

In [None]:
# Keep only the last row for each id.
# .copy() detaches the result from the original frame so that the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df.drop_duplicates(subset=['id'], keep='last').copy()
df.info()

In [None]:
# Remove the rows whose bedroom count is one of the extreme values 33, 11, 10 or 9.
extreme_bedrooms = df['bedrooms'].isin([33, 11, 10, 9])
df.drop(index=df.index[extreme_bedrooms], inplace=True)

# Confirm the largest remaining bedroom counts.
df.sort_values('bedrooms', ascending=False).head(10)

In [None]:
# Missing renovation year becomes 0 and the column is stored as an integer.
df['yr_renovated'] = df['yr_renovated'].fillna(0).astype('int64')

# Missing categorical labels fall back to the "no feature" category.
df['view'] = df['view'].fillna('NONE')
df['waterfront'] = df['waterfront'].fillna('NO')

# '?' placeholders become 0.0, then the column is parsed as float and cast to int.
unknown_basement = df['sqft_basement'] == '?'
df.loc[unknown_basement, 'sqft_basement'] = 0.0
df['sqft_basement'] = df['sqft_basement'].astype('float64').astype('int64')

In [None]:
# Re-check dtypes and non-null counts after the fills and casts above.
df.info()

In [None]:
# Grade labels start with a number followed by text; keep the leading number and
# shift it down by 2.
# (presumably so the lowest grade present maps to 1 — confirm)
# Assigning the result back to the column replaces the previous per-value
# `df.grade.replace(..., inplace=True)` loop, which mutated a temporary Series and
# is not guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
df['grade'] = pd.to_numeric(df['grade'].str.split().str[0]) - 2

In [None]:
# Changing condition from string to an ordinal numeric scale.
# A single mapping assigned back to the column replaces the chained
# `df['condition'].replace(..., inplace=True)` calls, which mutate a temporary
# Series and are not guaranteed to update `df` (no-op under pandas Copy-on-Write).
# Any label not listed here becomes NaN, which makes unexpected categories visible.
condition_scale = {'Poor': 1, 'Fair': 2, 'Average': 3, 'Good': 4, 'Very Good': 5}
df['condition'] = df['condition'].map(condition_scale)
df['condition'].value_counts()

In [None]:
# Encode the waterfront labels as integer codes.
lb_make = LabelEncoder()
lb_make.fit(df['waterfront'])
df['waterfront'] = lb_make.transform(df['waterfront'])
df['waterfront'].value_counts()

In [None]:
# Changing view from string to an ordinal numeric scale.
# A single mapping assigned back to the column replaces the chained
# `df['view'].replace(..., inplace=True)` calls, which mutate a temporary Series
# and are not guaranteed to update `df` (no-op under pandas Copy-on-Write).
# Any label not listed here becomes NaN.
# NOTE(review): the scale jumps from 0 straight to 2 (no category maps to 1);
# the codes are kept as they were — confirm whether the gap is intentional.
view_scale = {'NONE': 0, 'FAIR': 2, 'AVERAGE': 3, 'GOOD': 4, 'EXCELLENT': 5}
df['view'] = df['view'].map(view_scale)
df['view'].value_counts()

In [None]:
# Parse the date column into pandas datetimes, then confirm the final dtypes.
df['date'] = pd.to_datetime(df['date'])
df.info()

# Modeling Preparation

In [None]:
# Separate the target (price) from the predictors, then hold out a test set.
# random_state is fixed so the split is reproducible.
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Distribution of the training target with a KDE and a fitted normal curve.
# sns.distplot is deprecated; histplot plus an explicit scipy normal fit draws
# the same three layers (histogram, KDE, normal reference).
fig, ax = plt.subplots()
sns.histplot(y_train, stat='density', kde=True, ax=ax)
norm_mu, norm_sigma = stats.norm.fit(y_train)
norm_grid = np.linspace(y_train.min(), y_train.max(), 200)
ax.plot(norm_grid, stats.norm.pdf(norm_grid, norm_mu, norm_sigma),
        color='black', label='Normal fit')
ax.legend()

# Q-Q plot against the normal distribution, drawn on its own figure.
fig = plt.figure()
stats.probplot(y_train, plot=plt);

The distribution of price is not normal. A transformation may be needed to normalize the distribution of price.

In [None]:
# Natural-log transform of the target, applied to both the train and test split.
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

# Distribution of the log target with a KDE and a fitted normal curve.
# sns.distplot is deprecated; histplot plus an explicit scipy normal fit draws
# the same three layers (histogram, KDE, normal reference).
fig, ax = plt.subplots()
sns.histplot(y_train_log, stat='density', kde=True, ax=ax)
log_norm_mu, log_norm_sigma = stats.norm.fit(y_train_log)
log_norm_grid = np.linspace(y_train_log.min(), y_train_log.max(), 200)
ax.plot(log_norm_grid, stats.norm.pdf(log_norm_grid, log_norm_mu, log_norm_sigma),
        color='black', label='Normal fit')
ax.legend()

# Q-Q plot against the normal distribution, drawn on its own figure.
fig = plt.figure()
stats.probplot(y_train_log, plot=plt);

The log transformation normalized the distribution well.

In [None]:
# Correlations between the target and every numeric training feature.
heatmap_data = pd.concat([y_train, X_train], axis=1)
# X_train still carries the datetime `date` column; recent pandas versions no
# longer drop non-numeric columns inside corr(), so select numeric dtypes explicitly.
corr = heatmap_data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(13, 13))

sns.heatmap(
    data=corr,
    # Hide the upper triangle (and diagonal): the matrix is symmetric.
    mask=np.triu(np.ones_like(corr, dtype=bool)),
    ax=ax,
    # Print the correlation value inside each cell.
    annot=True,
    cbar_kws={"label": "Correlation", "orientation": "horizontal", "pad": .2, "extend": "both"}
)

# Customize the plot appearance
ax.set_title("Heatmap of Correlation Between Attributes (Including Target)");

According to the heat map, sqft_living is the feature most highly correlated with price. Other highly correlated features include bathrooms, grade, sqft_above, sqft_living15, and sqft_basement.

In [None]:
# Scatter of the training target against living area.
fig, ax = plt.subplots()
ax.scatter(X_train['sqft_living'], y_train, alpha=0.5)
ax.set(
    xlabel='sqft_living',
    ylabel="House Price",
    title="sqft_living vs. House Price",
);

In [None]:
# Mean of the (untransformed) training target, for reference.
y_train.mean()

# Models

### Base Model

In [None]:
# Baseline: always predict the mean of the log-price training target.
baseline = DummyRegressor(strategy='mean').fit(X_train, y_train_log)
# R^2 of the baseline on the held-out test set.
baseline.score(X_test, y_test_log)

### First Model

Run our first model using only the most correlated independent variable.

In [None]:

# Single-feature linear regression (sqft_living) against the log target.
first_model = LinearRegression()

# Three random 75/25 train/validation splits; reused by the later models.
splitter = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)

first_scores = cross_validate(
    estimator=first_model,
    X=X_train[['sqft_living']],
    y=y_train_log,
    cv=splitter,
    return_train_score=True,
)

# Report the mean score across the splits.
for label, key in (('Train score: ', 'train_score'),
                   ('Validation score: ', 'test_score')):
    print(label, first_scores[key].mean())

### Second Model

Selecting relevant columns as features for our second model.

In [None]:
# Candidate predictors for the second model.
second_model_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'sqft_living15',
    'sqft_lot15',
]
# Copy so later changes to the selection never touch X_train.
select_features = X_train.loc[:, second_model_columns].copy()

In [None]:
# Multi-feature linear regression against the log target, scored on the same splits.
second_model = LinearRegression()

second_model_scores = cross_validate(
    estimator=second_model, X=select_features, y=y_train_log,
    cv=splitter, return_train_score=True,
)

print("Current Model")
for label, key in (("Train score:     ", "train_score"),
                   ("Validation score:", "test_score")):
    print(label, second_model_scores[key].mean())
print()

In [None]:
# OLS summary used to inspect coefficient p-values for the selected features.
# (statsmodels is already imported as `sm` in the notebook's first cell.)
# NOTE(review): this fit uses the raw price (y_train), while the cross-validated
# models are scored against y_train_log — confirm which target the p-values
# should be based on.
sm.OLS(y_train, sm.add_constant(select_features)).fit().summary()

In [None]:
# Based on the p-values in the OLS summary, remove sqft_above, floors, and sqft_lot.
high_pvalue_columns = ['sqft_above', 'floors', 'sqft_lot']
select_features = select_features.drop(columns=high_pvalue_columns)

### Third Model

In [None]:
# Same linear model on the reduced feature set, scored on the same splits.
third_model = LinearRegression()

third_model_scores = cross_validate(
    estimator=third_model,
    X=select_features,
    y=y_train_log,
    cv=splitter,
    return_train_score=True,
)

print("Current Model")
for label, key in (("Train score:     ", "train_score"),
                   ("Validation score:", "test_score")):
    print(label, third_model_scores[key].mean())
print()

In [None]:
# OLS summary for the reduced feature set.
# NOTE(review): fit on the raw price (y_train), whereas the cross-validated models
# use y_train_log — confirm which target is intended here.
sm.OLS(y_train, sm.add_constant(select_features)).fit().summary()

In [None]:
# Recursive feature elimination with cross-validation on the current feature set.
# (RFECV is imported with the other sklearn names in the notebook's first cell.)
# Features are standardised first so that all columns are on the same scale.
X_train_for_RFECV = StandardScaler().fit_transform(select_features)

model_for_RFECV = LinearRegression()

selector = RFECV(model_for_RFECV, cv=splitter)
selector.fit(X_train_for_RFECV, y_train_log)

# support_ holds one boolean per input column, in column order.
print("Was the column selected?")
for col, was_selected in zip(select_features.columns, selector.support_):
    print(f"{col}: {was_selected}")

In [None]:
# Column names used for the final model.
# NOTE(review): this rebinds `select_features` from a DataFrame to a plain list of
# column names. The list re-includes sqft_lot, floors and sqft_above (dropped from
# the DataFrame earlier) and omits sqft_basement — confirm this is intended.
select_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                  'waterfront', 'view', 'condition', 'grade', 'sqft_above',
                  'sqft_living15', 'sqft_lot15']

### Final Model

In [None]:
# Restrict both splits to the final feature list.
X_train_final = X_train.loc[:, select_features]
X_test_final = X_test.loc[:, select_features]

# NOTE(review): the final model is fit and scored on the raw price (y_train / y_test),
# whereas the cross-validated candidate models used the log target — confirm.
final_model = LinearRegression().fit(X_train_final, y_train)

# R^2 on the held-out test set.
final_model.score(X_test_final, y_test)

# Checking Assumptions of Final Model

### Linearity

In [None]:
# Predicted vs. actual price on the test set.
preds = final_model.predict(X_test_final)
fig, ax = plt.subplots()

# Perfect-fit reference line y = x over the observed price range.
# Both x and y are passed explicitly: with a single argument, ax.plot would use
# the array index as x and draw y = x + min(price) instead of y = x.
# Two end points are enough to draw a straight line.
perfect_line = np.array([y_test.min(), y_test.max()])
ax.plot(perfect_line, perfect_line, color="g", label="Perfect Fit")
ax.scatter(y_test, preds, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.legend();

The scatter of predicted vs. actual price shows a linear relationship between the features and the target.

### Normality of residuals

In [None]:
# Test-set residuals (actual minus predicted).
# (scipy.stats is already imported as `stats` in the notebook's first cell.)
residuals = (y_test - preds)
# Q-Q plot of the residuals against a fitted normal, with a 45-degree reference line.
sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True);

Based on the qqplot we can see that the residuals are not normally distributed.

### Multicollinearity

In [None]:
# Variance inflation factor for each feature of the final model.
# (variance_inflation_factor is imported in the notebook's first cell.)
# variance_inflation_factor regresses each column on the others exactly as given,
# without adding an intercept. An explicit constant column is therefore included
# in the design matrix; without it the reported values are inflated.
# NOTE(review): values will differ from an intercept-free run — re-check the
# written interpretation after re-executing this cell.
vif_design = sm.add_constant(X_train_final, has_constant='add')
# Column 0 is the constant itself, so start at 1.
vif = [
    variance_inflation_factor(vif_design.values, i)
    for i in range(1, vif_design.shape[1])
]
pd.Series(vif, index=X_train_final.columns, name="Variance Inflation Factor")

The VIF values for all the features except waterfront, view, sqft_lot15, and sqft_lot are high. This indicates there is strong multicollinearity among the features.

### Homoskedasticity

In [None]:
# Residuals vs. predicted values on the test set.
fig, ax = plt.subplots()

ax.scatter(preds, residuals, alpha=0.5)
# Zero-residual reference line. axhline spans the axes directly, instead of
# building a list of zeros sized by len(X_test) and plotting it against preds.
ax.axhline(0, color="C1")
ax.set_xlabel("Predicted Value")
ax.set_ylabel("Actual - Predicted Value");

A clear funnel-shaped pattern is shown in the scatter of residuals vs. predicted values, which indicates that the homoskedasticity assumption is not fulfilled.