## Train test split for future model validation

In [None]:
# Hold out a test partition for later validation.  No test_size is passed, so
# scikit-learn's default train/test proportion applies; the fixed random_state
# makes the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

# Sanity check: report the size of the training partition.
print(f"X_train is a DataFrame with {X_train.shape[0]} rows and {X_train.shape[1]} columns")
print(f"y_train is a Series with {y_train.shape[0]} values")

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the `condition` column of the training features.
# NOTE: after this cell X_train no longer has a `condition` column, so the
# train/test split cell must be re-run before this cell can be run again.

# (1) Select the column with double brackets so the encoder receives a 2-D
# DataFrame rather than a 1-D Series.
condition = X_train[["condition"]]

# (2) Instantiate a OneHotEncoder with categories="auto", dense output, and
# handle_unknown="ignore".  The dense-output flag was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and the old spelling was removed in
# 1.4, so try the current name first and fall back for older installs.
try:
    ohe = OneHotEncoder(categories="auto", sparse_output=False, handle_unknown="ignore")
except TypeError:  # scikit-learn < 1.2 only understands `sparse`
    ohe = OneHotEncoder(categories="auto", sparse=False, handle_unknown="ignore")

# (3) Fit the encoder on the training `condition` values
ohe.fit(condition)

# (4) Transform the training column into an array of indicator columns
condition_encoded = ohe.transform(condition)

# (5a) Make the transformed data into a dataframe
condition_encoded = pd.DataFrame(
    # Pass in NumPy array
    condition_encoded,
    # Set the column names to the categories found by OHE
    columns=ohe.categories_[0],
    # Set the index to match X_train's index so the concat below aligns rows
    index=X_train.index,
)

# (5b) Replace the original column with its encoded version.  drop() without
# inplace returns a new frame, so the object produced by train_test_split is
# not mutated.
X_train = pd.concat([X_train.drop(columns="condition"), condition_encoded], axis=1)
X_train

### Transforming Test Set

In [None]:
# only keep relevant columns
# (relevant_columns is not defined in this cell; it must already exist.)
X_test = X_test.loc[:, relevant_columns]

# Commented-out copies of the column groups used below.  The live
# `continuous` and `dummy_cat` names are read from notebook state, so they
# must have been assigned by an earlier cell -- TODO confirm they match these.
#continuous = ['sqft_living', 'sqft_lot']
#categoricals = ['grade', 'condition', 'bedrooms', 'bathrooms', 'floors', 
#                'waterfront', 'mth_sold', 'view', 'zipcode']
#dummy_cat = ['condition', 'waterfront', 'view']

# Continuous feature columns of the test set.
df_cont = X_test[continuous]

# take the log of continuous variables
# Each output column is named "<original>_log".
log_names = [f'{column}_log' for column in df_cont.columns]

# NOTE(review): np.log gives -inf / NaN for zero or negative inputs --
# confirm these columns are strictly positive.
df_log = np.log(df_cont)
df_log.columns = log_names

# normalize (subtract mean and divide by std)
def normalize(feature):
    """Standardize `feature` using its own mean and standard deviation.

    Parameters
    ----------
    feature : object exposing ``.mean()`` and ``.std()`` and supporting
        arithmetic, e.g. a pandas Series (which is what ``DataFrame.apply``
        passes in, one column at a time).

    Returns
    -------
    Same kind of object as ``feature``, holding
    ``(feature - feature.mean()) / feature.std()``.
    """
    centre = feature.mean()
    spread = feature.std()
    return (feature - centre) / spread

# Standardize each log-transformed test column (apply calls normalize once
# per column).
# NOTE(review): normalize uses the mean/std of the column it is given, so the
# test set is scaled with its own statistics here rather than the training
# set's -- confirm this is intended.
df_log_norm = df_log.apply(normalize)

# take log and normalize y
# Wrapped in a DataFrame so the column-wise .apply(normalize) can be reused.
y_log_test = pd.DataFrame(np.log(y_test))
y_norm_test = y_log_test.apply(normalize)


# categoricals
# Dummy-encode the dummy_cat columns; drop_first=True omits the first level
# of each encoded column.
# NOTE(review): the dummy columns are derived from the values present in
# X_test only; confirm they line up with the columns of the training matrix.
df_ohe = pd.get_dummies(X_test[dummy_cat], prefix=dummy_cat, drop_first=True)

# Separate dummy frames for floors, bedrooms and zipcode, each with its own
# column prefix.
floors_dummies = pd.get_dummies(X_test['floors'], prefix='floors', drop_first=True)
bedrooms_dummies = pd.get_dummies(X_test['bedrooms'], prefix='beds', drop_first=True)
zipcode_dummies = pd.get_dummies(X_test['zipcode'], prefix='zip', drop_first=True)

# join categorical and continuous dataframes together column-wise; the
# original untransformed columns are left out because only the derived
# frames are concatenated
X_test_preprocessed = pd.concat([df_log_norm, df_ohe, floors_dummies, 
                                 bedrooms_dummies, zipcode_dummies], axis=1)

These are the R^2 values of the train and test models using a linear regression with the square footage of the living area as the independent variable and listing price as the dependent variable. We get similar R^2 values for each, around 0.5, which is a good sign that the model generalizes to the test data rather than overfitting the training data. However, this is a fairly low R^2 for an inferential model: it currently explains only around 50% of the variation in house price.

In [None]:
# Baseline: one-feature linear regression scored with 10 random
# 75/25 shuffle splits of the training data.

splitter = ShuffleSplit(random_state=0, test_size=0.25, n_splits=10)

# Double brackets keep the single feature as a 2-D DataFrame.
X_base = X_train[[most_correlated_feature]]

# fit() returns the estimator itself, so instantiate and fit in one step.
base_model = LinearRegression().fit(X_base, y_train)

baseline_scores = cross_validate(
    base_model,
    X_base,
    y_train,
    cv=splitter,
    return_train_score=True,
)

# Average the per-split scores.
mean_train_score = baseline_scores["train_score"].mean()
print("Train score:     ", mean_train_score)
mean_validation_score = baseline_scores["test_score"].mean()
print("Validation score:", mean_validation_score)

In [None]:
# Baseline scores from the single train/test split (no cross validation).

# Refit on the whole training partition so this cell does not rely on the
# fitted state left behind by another cell.
base_model.fit(X_base, y_train)

train_r2 = base_model.score(X_base, y_train)
print("Train R^2: ", train_r2)

# Score on the held-out rows, using the same single feature.
test_r2 = base_model.score(X_test[[most_correlated_feature]], y_test)
print("Test R^2: ", test_r2)

In [None]:
# Baseline fitted and scored on the full (unsplit) data.
# Note: this rebinds X_base and base_model, replacing the training-partition
# versions created by the earlier baseline cells.
X_base = X[[most_correlated_feature]]
base_model = LinearRegression().fit(X_base, y)

full_data_r2 = base_model.score(X_base, y)
print("BASE R^2: ", full_data_r2)

In [None]:
# Pairwise scatter plots of the continuous features of the full feature
# frame X, to eyeball relationships between them.  The trailing `;`
# suppresses the textual repr of the returned axes array.
pd.plotting.scatter_matrix(X[continuous], figsize=(20,20));

In [None]:
# Normality check: fit price ~ sqft_living with the statsmodels formula API
# on data_ols (expected to exist already).
f = 'price~sqft_living'
model = smf.ols(formula=f, data=data_ols).fit()

# Q-Q plot of the model residuals against a normal distribution.
# line='45' draws the 45-degree reference line; fit=True asks qqplot to fit
# the distribution's parameters to the residuals before comparing quantiles.
resid1 = model.resid
fig = sm.graphics.qqplot(resid1, dist=stats.norm, line='45', fit=True)

In [None]:
# Linearity
# Scatter predicted against actual values; points close to the dashed
# identity line (y = x) indicate a good fit.
# NOTE(review): second_model is fitted in another cell -- confirm its target
# is on the same scale as y_final, otherwise the comparison is meaningless.

X_final = X_preprocessed # add when finished
y_final = y

preds = second_model.predict(X_final)
fig, ax = plt.subplots()

# Endpoints of the identity line.  Passing a single array to ax.plot would
# draw it against its positional index (0, 1, 2, ...), which is not y = x,
# so both coordinates are given explicitly.  Two endpoints are enough for a
# straight line.  nanmin/nanmax skip missing values.
actual_values = np.asarray(y_final, dtype=float)
perfect_line = np.array([np.nanmin(actual_values), np.nanmax(actual_values)])
ax.plot(perfect_line, perfect_line, linestyle="--", color="orange", label="Perfect Fit")
ax.scatter(y_final, preds, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.legend();

In [None]:
# log transform continuous variables where it makes sense

# log cols
cols_log = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15']

# Natural log of all selected columns in one vectorized call.  The result
# keeps X's index and the original column names, the same frame the
# column-by-column map(lambda ...) loop produced, without the per-element
# Python calls.
df_transf = np.log(X[cols_log])
    
def normalize(feature):
    """Standardize `feature` using its own mean and standard deviation.

    `feature` is anything exposing ``.mean()`` and ``.std()`` and supporting
    arithmetic, e.g. the pandas Series that ``DataFrame.apply`` passes in for
    each column.  Returns ``(feature - feature.mean()) / feature.std()``.
    """
    centre = feature.mean()
    spread = feature.std()
    return (feature - centre) / spread

# Standardize every log-transformed column independently (apply calls
# normalize once per column).
features_final = df_transf.apply(normalize)

# Histogram of each standardized feature to inspect its distribution;
# bins='auto' lets the binning be chosen automatically.
features_final.hist(figsize  = [20, 20], bins='auto')

In [None]:
### Normalize log

# normalize (subtract mean and divide by std)
def normalize(feature):
    """Return `feature` centred on its mean and scaled by its std.

    `feature` must expose ``.mean()`` and ``.std()`` and support arithmetic
    (for example a pandas Series handed over by ``DataFrame.apply``).  The
    result is ``(feature - feature.mean()) / feature.std()``.
    """
    centre = feature.mean()
    spread = feature.std()
    return (feature - centre) / spread

# apply normalization function
# DataFrame.apply calls normalize once per column, so each column of df_log
# is standardized with its own mean and std.  df_log is read from notebook
# state and must already exist when this cell runs.
df_log_norm = df_log.apply(normalize)

In [None]:
#def calculate_residuals(model, features, label):
#    """
#    Creates predictions on the features with the model and calculates residuals
#    """
#    predictions = model.predict(features)
#    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
#    df_results['Residuals'] = abs(df_results['Actual']) - abs(df_results['Predicted'])
    
#    return df_results

# Predicted-vs-actual scatter for `model`, with the identity line (y = x) as
# the reference for a perfect fit.  model, X_model and y_model are read from
# notebook state and must already exist.
preds = model.predict(X_model)
fig, ax = plt.subplots()

# Endpoints of the identity line.  A single array passed to plot() is drawn
# against its positional index rather than against itself, so both
# coordinates are supplied.  The line carries a label so that ax.legend()
# has an entry to show.  nanmin/nanmax skip missing values.
actual_values = np.asarray(y_model, dtype=float)
perfect_line = np.array([np.nanmin(actual_values), np.nanmax(actual_values)])
ax.plot(perfect_line, perfect_line, linestyle="--", color="orange", label="Perfect Fit")
ax.scatter(y_model, preds, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.legend();

In [None]:
##### EXTRA ######

# Build a model matrix from the full feature frame X: drop the raw
# categorical columns, then append the dummy-encoded frames column-wise.
# NOTE(review): pd.concat(axis=1) aligns rows on the index.  Another cell in
# this notebook assigns df_ohe and zipcode_dummies from X_test; confirm the
# versions used here were built from X, otherwise rows will not line up.
# NOTE(review): `categoricals` is read from notebook state -- confirm the
# cell that defines it has been run before this one.
X_preprocessed = X.drop(categoricals, axis=1)
X_preprocessed = pd.concat([X_preprocessed, df_ohe, zipcode_dummies], axis=1) #df_cont?

# Last expression of the cell (only comments follow), so the frame is
# displayed as the cell output.
X_preprocessed

# Alternative composition kept for reference (not executed):
#X_preprocessed = pd.concat([df_log_norm, df_ohe, condition_dummies, #floors_dummies, bedrooms_dummies, 
#                                  zipcode_dummies, X[other_continuous]], axis=1)

In [None]:
# Second model: linear regression on the preprocessed feature matrix with
# the normalized target.  X_preprocessed and y_norm are read from notebook
# state, so the cells that create them must have been run first.
second_model = LinearRegression().fit(X_preprocessed, y_norm)

# R^2 on the same data the model was fitted on (not a held-out score).
second_model_r2 = second_model.score(X_preprocessed, y_norm)
print("Second Model R^2: ", second_model_r2)

In [None]:
# normalize (subtract mean and divide by std)

def normalize(feature):
    """Standardize `feature` with its own mean and standard deviation.

    Accepts anything exposing ``.mean()`` and ``.std()`` that supports
    arithmetic, such as a pandas Series supplied column-by-column by
    ``DataFrame.apply``.  Returns
    ``(feature - feature.mean()) / feature.std()``.
    """
    centre = feature.mean()
    spread = feature.std()
    return (feature - centre) / spread

# Standardize each log-transformed feature column (apply calls normalize
# once per column).  df_log is read from notebook state.
df_log_norm = df_log.apply(normalize)

# Log-transform the target, wrap it in a DataFrame so the same column-wise
# .apply(normalize) can be used, then standardize it.
y_log = pd.DataFrame(np.log(y))
y_norm = y_log.apply(normalize)

In [None]:
# Column groups used by the preprocessing cells.
log_cols = ['sqft_living', 'sqft_lot']          # columns that get log-transformed below
other_continuous = ['floors', 'bedrooms']
# Note: 'floors' and 'bedrooms' appear in both other_continuous and categoricals.
categoricals = ['grade', 'condition', 'bedrooms', 'bathrooms', 'floors', 
                'waterfront', 'mth_sold', 'view', 'zipcode']
dummy_cat = ['condition', 'waterfront', 'view']

# log transform
# Take the selected columns from the full feature frame X and name each
# output column "<original>_log".
df_cont = X[log_cols]
log_names = [f'{column}_log' for column in df_cont.columns]

# NOTE(review): np.log gives -inf / NaN for zero or negative inputs --
# confirm these columns are strictly positive.
df_log = np.log(df_cont)
df_log.columns = log_names

In [None]:
# Independence (would be train)
# Variance inflation factor of every column of the training design matrix,
# as a multicollinearity check.

X_final = X_train_preprocessed

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Convert to a plain float matrix once, outside the loop.  If the frame
# mixes float columns with boolean dummy columns (the get_dummies default in
# pandas >= 2.0), `.values` on its own would be an object array, which the
# regression inside variance_inflation_factor cannot work with.
vif_matrix = X_final.astype(float).values
vif = [variance_inflation_factor(vif_matrix, i) for i in range(vif_matrix.shape[1])]
pd.Series(vif, index=X_final.columns, name="Variance Inflation Factor") # looking for below 5