## Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import sklearn
import sklearn.metrics as metrics

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

## Read Data

In [2]:
# Load the cleaned train and test tables; both are indexed by the 'Id' column.
train_cleaned, test_cleaned = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('../data/train_cleaned.csv', '../data/test_cleaned.csv')
)

## Baseline Regression Model

In [3]:
# Training features: everything except the target ('SalePrice') and 'PID'.
X_train = train_cleaned.drop(columns=['SalePrice', 'PID']).copy()
# Training target, kept as a single-column DataFrame.
y_train = train_cleaned.loc[:, ['SalePrice']].copy()

# Test features: the test table has no target column, so only 'PID' is removed.
X_test = test_cleaned.drop(columns=['PID']).copy()

In [4]:
X_train.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Gara

In [5]:
X_test.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Gara

In [6]:
nominal_features = ['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
                    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
                    'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating',
                    'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type']

In [7]:
# Take all nominal features except PID. Filtering by name (rather than slicing
# off the first element) keeps this correct even if the list is reordered.
reduced_nominal_features = [feature for feature in nominal_features if feature != 'PID']

**Get dummies of train dataset**

In [8]:
X_train = pd.get_dummies(X_train, columns=reduced_nominal_features, drop_first=True)

**Get dummies of test dataset**

In [9]:
# NOTE(review): unlike the training frame (drop_first=True), no level is dropped
# here. handle_unequal_train_test_columns later discards every test column that is
# absent from the training frame, so the extra dummy columns do not reach the model
# - presumably this asymmetry is intentional; confirm.
X_test = pd.get_dummies(X_test, columns=reduced_nominal_features, drop_first=False)

**Ensure that the train and test datasets have identical columns, in the same order**

In [10]:
def handle_unequal_train_test_columns(X_train, X_test):
    """Align the test feature frame to the training feature frame.

    Columns that exist in ``X_train`` but not in ``X_test`` are added to the
    test frame filled with 0; columns that exist only in ``X_test`` are
    discarded. Both returned frames have their columns in sorted order, so
    they match position by position.

    Neither input is modified: new frames are returned, which keeps the
    calling cell safe to re-run.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; defines the set of columns to keep.
    X_test : pandas.DataFrame
        Test features to be aligned to ``X_train``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_train, X_test)`` with identical, sorted columns.
    """
    ordered_columns = sorted(X_train.columns)
    # reindex adds the missing columns (filled with 0) and drops the extra ones
    # in a single step, without touching the caller's frame.
    aligned_test = X_test.reindex(columns=ordered_columns, fill_value=0)
    return X_train[ordered_columns], aligned_test

In [11]:
X_train, X_test = handle_unequal_train_test_columns(X_train, X_test)

In [14]:
np.array_equal(X_test.columns, X_train.columns)

True

**Linear Regression**

In [None]:
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, y_train)

In [None]:
y_train_pred = linreg.predict(X_train)

In [None]:
def r2_adj(y_true, y_preds, p):
    """Return the adjusted R^2 of a set of predictions.

    Computed as ``1 - (SS_res / (n - p - 1)) / (SS_tot / (n - 1))`` where ``n``
    is the number of observations.

    Both inputs are flattened to 1-D float arrays first, so a single-column
    DataFrame, a Series, a list, or an ndarray of shape ``(n,)`` or ``(n, 1)``
    are all accepted and may be mixed.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_preds : array-like
        Predicted target values, one per observation.
    p : int
        Number of predictors used by the model.

    Returns
    -------
    float
        The adjusted R^2.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_preds, dtype=float).ravel()
    n = len(actual)
    # Residual and total sums of squares, each scaled by its degrees of freedom.
    numerator = np.sum(np.square(actual - predicted)) / (n - p - 1)
    denominator = np.sum(np.square(actual - actual.mean())) / (n - 1)
    return float(1 - (numerator / denominator))

In [None]:
def get_regression_metrics(y_true, y_pred, p):
    """Compute, print and return a set of regression metrics.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    p : int
        Number of predictors, forwarded to ``r2_adj`` for the adjusted R^2.

    Returns
    -------
    dict
        Keys ``'mse'``, ``'rmse'``, ``'msle'``, ``'mae'``, ``'r2'`` and
        ``'r2_adjusted'``. Note that ``'mae'`` holds the *median* absolute
        error, not the mean absolute error. ``'msle'`` is ``nan`` when the
        mean squared log error is undefined for the inputs.
    """
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    try:
        msle = metrics.mean_squared_log_error(y_true, y_pred)
    except ValueError:
        # scikit-learn rejects values outside the domain of the log transform
        # (e.g. a negative predicted price). Report the metric as undefined
        # instead of aborting the whole metrics report.
        msle = np.nan
    mae = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    r2a = r2_adj(y_true, y_pred, p)

    print('Mean squared error      = ', mse)
    print('Root mean squared error = ', rmse)
    print('Mean squared log error  = ', msle)
    print('Median absolute error   = ', mae)
    print('R^2                     = ', r2)
    print('Adjusted R^2            = ', r2a)

    return {
        'mse': mse,
        'rmse': rmse,
        'msle': msle,
        'mae': mae,
        'r2': r2,
        'r2_adjusted': r2a
    }

In [None]:
regression_metrics = get_regression_metrics(y_train, y_train_pred, X_train.shape[1])

In [None]:
regression_metrics['rmse']

In [None]:
scores = cross_val_score(linreg, X_train, y_train, cv=5)
scores.mean()

In [None]:
X_test_indices = X_test.index

In [None]:
y_test_pred = linreg.predict(X_test)

In [None]:
linreg.coef_

In [None]:
# Flatten the predictions to a plain 1-D list. np.ravel handles both the 2-D
# column-shaped output this cell was written for and an already-flat list, so
# re-running the cell no longer fails.
y_test_pred = np.ravel(y_test_pred).tolist()

In [None]:
def make_and_get_submissions_file(X_test_indices, y_test_pred, file_name, output_dir='../data'):
    """Build the submission table, write it to CSV and return it.

    Parameters
    ----------
    X_test_indices : array-like
        Identifiers of the test rows; becomes the ``Id`` index.
    y_test_pred : array-like
        Predicted sale prices, one per identifier. Shapes ``(n,)`` and
        ``(n, 1)`` are both accepted; the values are flattened to 1-D.
    file_name : str
        Name of the CSV file, without the ``.csv`` extension.
    output_dir : str, optional
        Directory the CSV is written to. Defaults to ``'../data'``, the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One ``SalePrice`` column indexed by ``Id``, sorted by ``Id``.
    """
    flat_predictions = np.asarray(y_test_pred).ravel()
    submissions = (
        pd.DataFrame({'Id': X_test_indices, 'SalePrice': flat_predictions})
        .set_index('Id')
        .sort_index()
    )
    submissions.to_csv('{}/{}.csv'.format(output_dir, file_name))

    return submissions

In [None]:
submissions = make_and_get_submissions_file(X_test_indices, y_test_pred, 'submissions_baseline_regression')

## Scaling and Regularization

In [None]:
# Train data

X_train = train_cleaned.drop(['SalePrice', 'PID'], axis=1).copy()
y_train = train_cleaned[['SalePrice']].copy()

# Test data
X_test = test_cleaned.drop(['PID'], axis=1).copy()

In [None]:
nominal_features = ['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config',
                    'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
                    'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating',
                    'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type']

In [None]:
# Take all nominal features except PID. Filtering by name (rather than slicing
# off the first element) keeps this correct even if the list is reordered.
reduced_nominal_features = [feature for feature in nominal_features if feature != 'PID']

**Get dummies for nominal features**

In [None]:
X_train = pd.get_dummies(X_train, columns=reduced_nominal_features, drop_first=True)

In [None]:
# NOTE(review): unlike the training frame (drop_first=True), no level is dropped
# here. handle_unequal_train_test_columns later discards every test column that is
# absent from the training frame, so the extra dummy columns do not reach the model
# - presumably this asymmetry is intentional; confirm.
X_test = pd.get_dummies(X_test, columns=reduced_nominal_features, drop_first=False)

In [None]:
X_train, X_test = handle_unequal_train_test_columns(X_train, X_test)
X_train_columns = X_train.columns
X_test_columns = X_test.columns

**Standardization of features**

In [None]:
# Learn the scaling parameters from the training features only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

**Lasso Regression with All Features** 

In [None]:
# Cross Validation
optimal_lasso = LassoCV(n_alphas=500, cv=10) # uses 10-fold

In [None]:
# Model Fitting and Evaluation
# Run the cross-validated alpha search configured in the previous cell.
optimal_lasso.fit(X_train, y_train)

# Refit a plain Lasso at the alpha selected by the search (optimal_lasso.alpha_).
lasso = Lasso(alpha=optimal_lasso.alpha_)
lasso.fit(X_train, y_train)

# 10-fold cross-validated score with the estimator's default scorer
# (presumably R^2 for a regressor - confirm against the scikit-learn docs).
lasso_scores = cross_val_score(lasso, X_train, y_train, cv=10)
lasso_scores.mean()

In [None]:
(-cross_val_score(lasso, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')).mean()

In [None]:
y_test_pred = lasso.predict(X_test)

In [None]:
submissions = make_and_get_submissions_file(X_test_indices, y_test_pred, 'submissions_lasso_regression_all_features')

In [None]:
plt.figure(figsize=(10, 8))
plt.stem(lasso.coef_)

In [None]:
(lasso.coef_ != 0).nonzero()[0]

In [None]:
X_test_columns[(lasso.coef_ != 0).nonzero()[0]]

In [None]:
lasso.coef_[(lasso.coef_ != 0).nonzero()[0]]

In [None]:
def plot_cv():
    """Placeholder for a cross-validation plot; currently does nothing."""
    return None

**Ridge Regression**

In [None]:
# # Cross Validation
# ridge = RidgeCV(alphas=np.logspace(0, 5, 200))
# ridge_scores = cross_val_score(ridge, X_train, y_train, cv=10)
# ridge_scores.mean()

In [None]:
# Cross Validation
optimal_ridge = RidgeCV(alphas=np.logspace(0, 5, 200), cv=10)

In [None]:
# Model Fitting and Evaluation
# Run the cross-validated alpha search configured in the previous cell.
optimal_ridge.fit(X_train, y_train)

# Refit a plain Ridge at the alpha selected by the search (optimal_ridge.alpha_).
ridge = Ridge(alpha=optimal_ridge.alpha_)
ridge.fit(X_train, y_train)

# 10-fold cross-validated score with the estimator's default scorer
# (presumably R^2 for a regressor - confirm against the scikit-learn docs).
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=10)
ridge_scores.mean()

In [None]:
(-cross_val_score(ridge, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')).mean()

In [None]:
y_test_pred = ridge.predict(X_test)

In [None]:
y_test_pred = y_test_pred.reshape(-1)

In [None]:
submissions = make_and_get_submissions_file(X_test_indices, y_test_pred, 'submissions_ridge_regression_all_features')

## Feature Engineering

In [None]:
nominal_features

In [None]:
# Correlation is only defined for numeric columns. Select them explicitly:
# recent pandas versions no longer drop non-numeric columns silently and raise
# on the string-valued nominal features instead.
train_cleaned.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Number of nominal features that are not numeric columns. Reading the numeric
# column labels directly gives the same labels the correlation matrix would have,
# without computing the matrix (or failing on string columns in recent pandas).
len(set(nominal_features).difference(train_cleaned.select_dtypes(include=['number', 'bool']).columns))

In [None]:
# Nominal features that are not numeric columns. Reading the numeric column
# labels directly gives the same labels the correlation matrix would have,
# without computing the matrix (or failing on string columns in recent pandas).
set(nominal_features).difference(train_cleaned.select_dtypes(include=['number', 'bool']).columns)

In [None]:
train_cleaned[sorted(nominal_features)].info()

In [None]:
# Heatmap of pairwise correlations between the numeric columns. Non-numeric
# columns are excluded explicitly because recent pandas versions raise on them
# instead of dropping them silently.
plt.figure(figsize=(15,15))
sns.heatmap(train_cleaned.select_dtypes(include=['number', 'bool']).corr(), annot=False, cmap='coolwarm')
plt.title('Correlation between numeric features');

In [None]:
submissions.shape