# Predicting House Prices
## Objectives:
* Clean data
* Perform Exploratory Data Analysis
* Prepare data and engineer features
* Split, scale, and standardize data
* Find the best hyperparameters for Linear Regression, Ridge Regression and Lasso Regression
* Model selection for test data and submit predictions 

# Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import warnings

# Use the fully-qualified option names. The bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options in recent pandas
# releases, which makes set_option raise OptionError (multiple keys matched).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# NOTE: this silences every warning for the rest of the notebook, including
# pandas' chained-assignment warnings.
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training set, test set and sample submission from the
# notebook's input directory.
# NOTE(review): `submision_data` is not used anywhere in the visible notebook.
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
submision_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Column dtypes and non-null counts, to spot incomplete columns.
train_data.info()

# Data Cleaning

In [None]:
# Number of missing values per column before any cleaning.
train_data.isna().sum()

In [None]:
# The Id column is removed from the training frame before analysis.
train_data = train_data.drop(columns=['Id'])

In [None]:
# Impute missing LotFrontage values with the column mean.
lot_frontage = train_data['LotFrontage']
meanLot = lot_frontage.mean()
train_data['LotFrontage'] = lot_frontage.fillna(meanLot)

In [None]:
# Missing GarageYrBlt / MasVnrArea are treated as "no garage" / "no masonry
# veneer" and filled with the constant 0.
for column in ('GarageYrBlt', 'MasVnrArea'):
    train_data[column] = train_data[column].fillna(0)

In [None]:
# MSSubClass is stored as a number but is used as a categorical code,
# so it is cast to string.
train_data = train_data.astype({'MSSubClass': str})

In [None]:
# Re-check the per-column missing counts after the fills above.
train_data.isna().sum()

### Fill Missing Values for Categorical Data
In some cases, NaN actually represents a category (e.g., for the column Alley, a house may not have alley access, so NaN is a valid category). For the rest of the categorical columns, we can fill missing values with the mode.

In [None]:
# Names of the categorical (object) columns that still contain missing values.
object_columns = train_data.select_dtypes('object')
object_columns.loc[:, object_columns.isna().any()].columns

In [None]:
# NaN in these columns means the house does not have the feature (no alley
# access, basement, fireplace, garage, pool, fence or miscellaneous feature),
# so it is kept as its own 'none' category instead of being imputed.
# MiscFeature belongs here too: imputing its mode would invent a feature for
# houses that have none.
absent_feature_columns = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
for column in absent_feature_columns:
    train_data[column] = train_data[column].fillna('none')

# Remaining categorical gaps are imputed with the most frequent level.
for column in ['MasVnrType', 'Electrical']:
    mode = train_data[column].mode()
    train_data[column] = train_data[column].fillna(mode[0])

In [None]:
# Total number of missing cells left in the frame.
train_data.isna().sum().sum()

# Exploratory Data Analysis

## Exploring Numeric Features

In [None]:
# Select the numeric columns explicitly: since pandas 2.0 DataFrame.corr() no
# longer drops object columns silently and fails on string data.
numeric_corr = train_data.select_dtypes(include=np.number).corr()
# Flatten the matrix into a Series indexed by (column, column) pairs.
corr_pairs = numeric_corr.unstack()

In [None]:
#Find the numeric features which have the highest correlation with Sale Price
# Take the absolute value BEFORE sorting so the result is ordered by strength;
# sorting first would leave strongly negative correlations at the bottom.
sorted_corr = corr_pairs['SalePrice'].abs().sort_values(ascending=False)
highNumeric = sorted_corr[sorted_corr > 0.5]
highNumeric

In [None]:
# Regression plot: overall material quality against sale price.
ax = sns.regplot(data=train_data, x='OverallQual', y='SalePrice',
                 scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
ax.set_title('Overall Quality of Materials vs Sale Price')
ax.set_xlabel('Overall Quality of Materials')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Regression plot: above-grade living area against sale price.
ax = sns.regplot(data=train_data, x='GrLivArea', y='SalePrice',
                 scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
ax.set_title('Above grade (ground) Living Area  vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Regression plot: garage capacity (in cars) against sale price.
ax = sns.regplot(data=train_data, x='GarageCars', y='SalePrice',
                 scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
ax.set_title('Size of Garage vs Sale Price')
ax.set_xlabel('Size of Garage (car capacity)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Regression plot: garage area against sale price.
ax = sns.regplot(data=train_data, x='GarageArea', y='SalePrice',
                 scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
ax.set_title('Garage Area vs Sale Price')
ax.set_xlabel('Garage Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Regression plot: total basement area against sale price.
sns.regplot(x='TotalBsmtSF', y='SalePrice', data=train_data, scatter_kws={'color':'blue'}, line_kws={'color':'red'})
plt.title('Basement Area vs Sale Price')
# Axis label previously read 'Basment'.
plt.xlabel('Basement Area (sqft)')
plt.ylabel('Sale Price')
plt.show()

## Exploring Categorical Features

In [None]:
# Work on an independent frame that holds only the categorical (object) columns.
df_cat = train_data.select_dtypes('object').copy()
df_cat.head()

In [None]:
# Replace every categorical column by its integer category codes so that a
# correlation can be computed on it.
df_cat = df_cat.apply(lambda values: values.astype('category').cat.codes)

In [None]:
# Append the target as the last column, aligned on the row index.
df_cat = df_cat.assign(SalePrice=train_data['SalePrice'])
df_cat.head()

In [None]:
# Correlation matrix of the integer-coded categorical columns and SalePrice,
# flattened into a Series indexed by (column, column) pairs.
# NOTE(review): category codes follow the sorted level names, not a quality
# order, so these correlations are only a rough signal.
cat_corr = df_cat.corr()
catcorr_pairs = cat_corr.unstack()

In [None]:
#Find the categorical features which have the highest correlation with Sale Price
# Take the absolute value BEFORE sorting so the result is ordered by strength;
# sorting first would leave strongly negative correlations at the bottom.
sorted_Catcorr = catcorr_pairs['SalePrice'].abs().sort_values(ascending=False)
highCat = sorted_Catcorr[sorted_Catcorr >= 0.5]
highCat

In [None]:
# Living area vs sale price, coloured by exterior quality.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='ExterQual')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

* Ex = Excellent
* Gd = Good
* TA = Average
* Fa = Fair
* Po = Poor

In [None]:
# Living area vs sale price, coloured by basement quality.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='BsmtQual')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Living area vs sale price, coloured by kitchen quality.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='KitchenQual')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Living area vs sale price, coloured by garage finish.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='GarageFinish')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Living area vs sale price, coloured by fireplace quality.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='FireplaceQu')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

In [None]:
# Living area vs sale price, coloured by garage type.
ax = sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice', hue='GarageType')
ax.set_title('Above Ground Living Area vs Sale Price')
ax.set_xlabel('Above grade (ground) Living Area (sqft)')
ax.set_ylabel('Sale Price')
plt.show()

# Feature Transformation

In [None]:
import scipy.stats

# One row per numeric column: its skewness, the magnitude of that skewness and
# a flag marking columns whose |skew| is at least 0.5.
# NOTE(review): SalePrice is numeric, so the target is part of this table too.
numeric_columns = train_data.select_dtypes(np.number).columns
df_skew = pd.DataFrame({'Features': numeric_columns})
df_skew['Skew'] = [scipy.stats.skew(train_data[name]) for name in numeric_columns]
df_skew['Abs Skew'] = df_skew['Skew'].abs()
df_skew['Skewed'] = df_skew['Abs Skew'] >= 0.5
df_skew

In [None]:
# Names of the columns flagged as skewed.
df_skew.query('Skewed == True')['Features']

In [None]:
# Apply log(1 + x) to every column flagged as skewed.
skewed_columns = df_skew.loc[df_skew['Skewed'], 'Features']
for column in skewed_columns:
    train_data[column] = np.log1p(train_data[column])

# Feature Engineering
Now that we have some preliminary insights into how each important variable affects the Sale Price, we will select the features that will be used for prediction in our models.

In [None]:
# Columns kept for modelling: the numeric predictors, four categorical quality
# columns and the SalePrice target (last).
selected_columns = [
    'OverallQual', 'GrLivArea',
    'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF',
    'FullBath', 'TotRmsAbvGrd',
    'YearBuilt', 'YearRemodAdd',
    'MasVnrArea', 'Fireplaces',
    'GarageFinish', 'KitchenQual',
    'BsmtQual', 'ExterQual',
    'SalePrice',
]
features = train_data[selected_columns]
features.head()

In [None]:
# (rows, columns) of the selected feature frame.
features.shape

In [None]:
# The selected columns that are still categorical and need encoding.
features.select_dtypes('object').head()

In [None]:
# Categorical columns encoded as integers; paired by position with the
# entries of `ordinal_orderings`.
ordinal_features = ['KitchenQual',
                    'BsmtQual',
                    'ExterQual']

# Column that is one-hot encoded. This is a single column NAME (a string), not
# a list: onehot_encode wraps it as df[[nominal_features]].
nominal_features = 'GarageFinish'

In [None]:
# One ordering per entry of `ordinal_features`, in the same order
# (KitchenQual, BsmtQual, ExterQual). ordinal_encode replaces each level by its
# position in the list, so 'Ex' becomes 0 and later levels get larger codes.
# Only the BsmtQual ordering has 'none', the placeholder written by the
# missing-value fill for that column.
ordinal_orderings = [
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'none'],
    ['Ex', 'Gd', 'TA', 'Fa', 'Po']
]

In [None]:
#Ordinal Encoding
def ordinal_encode(df, columns, orderings):
    """Return a copy of ``df`` with ordinal columns replaced by integer ranks.

    ``columns`` and ``orderings`` are paired by position; each value is
    replaced by its index in the matching ordering list. A value that is not
    in the ordering raises ``ValueError`` (from ``list.index``). The input
    frame is left untouched.
    """
    encoded = df.copy()
    for name, levels in zip(columns, orderings):
        encoded[name] = encoded[name].map(levels.index)
    return encoded

#One-hot Encode Categorical Variables
def onehot_encode(df, columns=None):
    """Return a copy of ``df`` with nominal columns replaced by dummy columns.

    Parameters
    ----------
    df : DataFrame to encode; it is not modified.
    columns : a column name or a list of column names to encode. When omitted,
        the module-level ``nominal_features`` is used, so existing
        ``onehot_encode(df)`` calls keep working.

    Each encoded column is dropped and its dummy columns (named after the
    category levels, no prefix) are appended at the end of the frame.
    """
    if columns is None:
        columns = nominal_features
    # Accept a single name as well as a list; the original only worked when
    # the global held one string.
    if isinstance(columns, str):
        columns = [columns]

    df = df.copy()
    for column in columns:
        features_one_hot = pd.get_dummies(df[column])
        df = pd.concat([df.drop(column, axis=1), features_one_hot], axis=1)
    return df

# Ordinal-encode the quality columns first, then one-hot encode the nominal one.
data = onehot_encode(
    ordinal_encode(features, ordinal_features, ordinal_orderings)
)
data.head()

In [None]:
# (rows, columns) after encoding; the one-hot step adds columns.
data.shape

In [None]:
# Cast every column (integer codes and dummy indicators included) to float64
# so the model input has one uniform dtype.
data = data.astype('float64')
data.head()

# Data Preprocessing

In [None]:
def preprocessing_inputs(df):
    """Split ``df`` into standardized train/hold-out features and targets.

    ``df`` must contain a ``SalePrice`` column, which is used as the target;
    every other column is treated as a feature. 30% of the rows are held out
    (``random_state=1``).

    Returns
    -------
    X_train, X_test : standardized feature arrays.
    y_train, y_test : the matching ``SalePrice`` Series.
    """
    df = df.copy()

    #Split DataFrame
    y = df['SalePrice']
    X = df.drop('SalePrice', axis=1)

    #Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    #Scale X
    # Fit the scaler on the training split ONLY and reuse its mean/std for the
    # hold-out split. Re-fitting on X_test would scale the two splits
    # differently and leak hold-out statistics into the evaluation.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train / hold-out splits from the encoded training frame.
X_train, X_test, y_train, y_test = preprocessing_inputs(data)

In [None]:
# Report the shapes of both splits.
for label, X_part, y_part in (('Train set:', X_train, y_train),
                              ('Test set:', X_test, y_test)):
    print(label, X_part.shape, y_part.shape)

# Model Training

## Linear Regression

In [None]:
# Ordinary least squares baseline; fit() returns the fitted estimator itself.
linreg = LinearRegression().fit(X_train, y_train)
print('Linear Regression Model Trained')

## Ridge Regression

In [None]:
# Grid searched for Ridge: regularization strength and solver.
# NOTE: the name `parameters` is reassigned later for the Lasso grid.
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}

In [None]:
# 10-fold cross-validated grid search over the Ridge parameter grid.
ridge = Ridge()
ridge_cv = GridSearchCV(estimator=ridge, param_grid=parameters, cv=10).fit(X_train, y_train)
print('Ridge Regression Model Trained')

In [None]:
print('Best parameters:', ridge_cv.best_params_)
# No `scoring` was passed to GridSearchCV, so best_score_ is the regressor's
# default score (R^2) averaged over the folds. It is not an accuracy, so label
# it as R2 and print it as a plain number instead of a percentage.
print('Best CV R2 score:', '{:.4f}'.format(ridge_cv.best_score_))

## Lasso

In [None]:
# Grid searched for Lasso: regularization strength and coordinate-selection
# strategy. This overwrites the Ridge grid stored under the same name.
parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'selection': ['cyclic', 'random']}

In [None]:
# 10-fold cross-validated grid search over the Lasso parameter grid.
lasso = Lasso()
lasso_cv = GridSearchCV(estimator=lasso, param_grid=parameters, cv=10).fit(X_train, y_train)
print('Lasso Regression Model Trained')

In [None]:
print('Best parameters:', lasso_cv.best_params_)
# No `scoring` was passed to GridSearchCV, so best_score_ is the regressor's
# default score (R^2) averaged over the folds. It is not an accuracy, so label
# it as R2 and print it as a plain number instead of a percentage.
print('Best CV R2 score:', '{:.4f}'.format(lasso_cv.best_score_))

# Training Results

In [None]:
# Hold-out predictions of the three fitted models, in the order
# linear / ridge / lasso.
yhat_lin, yhat_ridge, yhat_lasso = (
    model.predict(X_test) for model in (linreg, ridge_cv, lasso_cv)
)

In [None]:
# R^2 of each model on the hold-out split.
lin_r2, ridge_r2, lasso_r2 = (
    r2_score(y_test, predicted)
    for predicted in (yhat_lin, yhat_ridge, yhat_lasso)
)

In [None]:
# Print each R^2 with five significant digits.
for label, score in (('Linear Regression R2-score: ', lin_r2),
                     ('Ridge Regression R2-score:  ', ridge_r2),
                     ('Lasso Regression R2-score:  ', lasso_r2)):
    print(label, '{:.5}'.format(score))

In [None]:
# Mean squared error of each model on the hold-out split, computed directly
# from the residuals.
lin_mse, ridge_mse, lasso_mse = (
    np.mean(np.square(predicted - y_test))
    for predicted in (yhat_lin, yhat_ridge, yhat_lasso)
)

In [None]:
# Print each MSE with three significant digits.
for label, error in (('Linear Regression MSE:', lin_mse),
                     ('Ridge Regression MSE: ', ridge_mse),
                     ('Lasso Regression MSE: ', lasso_mse)):
    print(label, '{:.3}'.format(error))

In [None]:
# Collect both metrics into one table, one row per model.
model_dict = {
    'model': ['Linear', 'Ridge', 'Lasso'],
    'R2_score': [lin_r2, ridge_r2, lasso_r2],
    'MSE': [lin_mse, ridge_mse, lasso_mse],
}
model_results = pd.DataFrame(model_dict)
model_results

In [None]:
# One bar chart per metric: (column to plot, bar colour, y-axis label).
for metric, bar_color, axis_label in (('R2_score', 'skyblue', 'R2 Score'),
                                      ('MSE', 'violet', 'Mean Squared Error')):
    model_results.plot(kind='bar', x='model', y=metric, color=bar_color, figsize=(8,6))
    plt.title('Model Performance')
    plt.xlabel('Model')
    plt.xticks(rotation=25)
    plt.ylabel(axis_label)
    plt.show()

As we can see, all of our models performed similarly. However, judging by the R2 score and MSE, the model that performs best is the Linear Regression model. Therefore, we will use the Linear Regression model on our test set.

# Model Evaluation using Test set

### Load Test set for evaluation

In [None]:
# Preview the first rows of the competition test set.
test_data.head()

Since we built our model on only a select few features, we will perform our preprocessing on these features only.

### Clean Data

In [None]:
# Same predictor columns as used for training (no SalePrice in the test set).
# Take an explicit copy: later cells assign into this frame, and writing to a
# column slice of test_data triggers pandas' chained-assignment warning.
test_features = test_data[['OverallQual', 'GrLivArea',
                           'GarageCars', 'GarageArea',
                           'TotalBsmtSF', '1stFlrSF',
                           'FullBath', 'TotRmsAbvGrd',
                           'YearBuilt', 'YearRemodAdd',
                           'MasVnrArea', 'Fireplaces',
                           'GarageFinish', 'KitchenQual',
                           'BsmtQual', 'ExterQual']].copy()
test_features.head()

In [None]:
# Missing values per selected column in the test set.
test_features.isna().sum()

In [None]:
# Replacement value per incomplete column: 0 for the numeric measurements,
# the placeholder 'none' for BsmtQual / GarageFinish, and the most frequent
# level for KitchenQual.
mode = test_features['KitchenQual'].mode()
fill_values = {
    'GarageCars': 0,
    'GarageArea': 0,
    'TotalBsmtSF': 0,
    'MasVnrArea': 0,
    'BsmtQual': 'none',
    'GarageFinish': 'none',
    'KitchenQual': mode[0],
}
test_features = test_features.fillna(fill_values)

In [None]:
# Total number of missing cells left in the test features.
test_features.isna().sum().sum()

### Preprocessing

In [None]:
#Check for skewed columns
# One row per numeric test column: skewness, its magnitude, and a flag for
# |skew| >= 0.5.
numeric_test_columns = test_features.select_dtypes(np.number).columns
feature_skew = pd.DataFrame({'Features': numeric_test_columns})
feature_skew['Skew'] = [scipy.stats.skew(test_features[name]) for name in numeric_test_columns]
feature_skew['Abs Skew'] = feature_skew['Skew'].abs()
feature_skew['Skewed'] = feature_skew['Abs Skew'] >= 0.5
feature_skew

In [None]:
#Correct skewed columns
# Apply log1p to exactly the columns that were transformed in the TRAINING
# data (recorded in df_skew). Deciding again from the test set's own skewness
# can pick a different set of columns, which would feed the model features on
# a different scale than it was trained on.
train_skewed = set(df_skew.loc[df_skew['Skewed'], 'Features'])
for column in test_features.columns:
    if column in train_skewed:
        test_features[column] = np.log1p(test_features[column])

### Feature Engineering

In [None]:
# Encode the test features with the same ordinal and one-hot steps as training.
X_test_ = onehot_encode(
    ordinal_encode(test_features, ordinal_features, ordinal_orderings)
)
X_test_.head()

# Training & Submission

In [None]:
#Scale X
# The model was trained on features standardized with the TRAINING split's
# mean/std, so the submission data must be scaled with those same statistics.
# Fitting a new scaler on the test set would shift every feature.
# Calling train_test_split with the same arguments as preprocessing_inputs
# reproduces the training split the model saw.
train_matrix = data.drop('SalePrice', axis=1)
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    train_matrix, data['SalePrice'], test_size=0.3, random_state=1)
scaler = StandardScaler().fit(X_fit)

# Align names and order with the training matrix; a one-hot level that never
# occurs in the test set becomes an all-zero column.
X_test_ = X_test_.reindex(columns=X_fit.columns, fill_value=0)
X_test_ = scaler.transform(X_test_)

In [None]:
predictions = linreg.predict(X_test_)
# The target was transformed with np.log1p (it is one of the numeric columns
# covered by the skew correction), so the inverse is expm1, not exp: exp would
# return price + 1. Only undo the transform if SalePrice was actually flagged.
target_was_logged = 'SalePrice' in set(df_skew.loc[df_skew['Skewed'], 'Features'])
if target_was_logged:
    predictions = np.expm1(predictions)

In [None]:
# Use the dataset's own column names ('Id' and 'SalePrice', as in train.csv)
# rather than the ad-hoc 'ID' / 'Sale_Price'.
# NOTE(review): confirm against the header of sample_submission.csv.
submission = pd.DataFrame({'Id': test_data['Id'],
                           'SalePrice': predictions})
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('Submission.csv',index=False)