In [18]:
# importing required libraries
import numpy as np
import pandas as pd
from pathlib import Path
# importing matplotlib
import matplotlib.pyplot as plt

# display plots in the notebook itself
%matplotlib inline

#import the necessary libraries
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from scipy.special import jv

from sklearn.model_selection import train_test_split #train data set sampler

from sklearn.preprocessing import MinMaxScaler  #minmaxScaler

import statsmodels.api as sm  #statsModel

from sklearn.feature_selection import RFE #RFE
from sklearn.linear_model import LinearRegression #LineaerRegression

from statsmodels.stats.outliers_influence import variance_inflation_factor #VIF

# Linear Regression
## Housing Sale Case Study


A US-based housing company named Surprise Housing has decided to enter the Australian market. The company uses data analytics to purchase houses at a price below their actual values and flip them on at a higher price. For the same purpose, the company has collected a data set from the sale of houses in Australia.

#### Problem Statement:
- Build a regression model using regularisation in order to predict the actual value of the prospective properties

Essentially, the company wants to know —
- Whether or not to invest in a given house,
- Which variables are significant in predicting the price of a house, and
- How well those variables describe the Sale Price of a house.



In [19]:
# Read the training data and trim surrounding whitespace from every column label
df = pd.read_csv('train.csv').rename(columns=str.strip)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Identifying shape of the data

In [20]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(1460, 81)

## Data Cleaning

Cleaning the dataset using the following steps:
 - Drop columns with all null values
 - Drop columns having only one non-null value
 - Drop columns that are not going to affect the driving factors
 - Validate internal rules
 
 #### Using SalePrice as target column

### Checking Null count

In [21]:
# Missing-value count per column, largest first
df.isna().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64

<font size="4">Most of the variables above that have null values are categorical, and according to the data dictionary their null values can be assigned the category NA.</font>

In [24]:
# These categorical columns use a null to mean the category 'NA' (per the data
# dictionary), so make that category explicit.
# The filled columns are assigned back instead of calling
# df[col].fillna(..., inplace=True): that form mutates an intermediate Series
# and is not guaranteed to update df (it does not under pandas copy-on-write).
absent_feature_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
df[absent_feature_cols] = df[absent_feature_cols].fillna('NA')

               

In [25]:
# Re-check the remaining missing values per column, largest first
df.isna().sum().sort_values(ascending=False)

LotFrontage     259
GarageFinish     81
GarageCond       81
GarageYrBlt      81
GarageQual       81
               ... 
Exterior2nd       0
Exterior1st       0
RoofMatl          0
RoofStyle         0
SalePrice         0
Length: 81, dtype: int64

In [33]:
# Frequency of each LotFrontage value. dropna=False keeps missing entries in
# the tally, so they are visible even when this cell runs before the
# imputation step (value_counts drops NaN by default).
df['LotFrontage'].value_counts(dropna=False)

0.0      259
60.0     143
70.0      70
80.0      69
50.0      57
        ... 
137.0      1
38.0       1
33.0       1
150.0      1
46.0       1
Name: LotFrontage, Length: 111, dtype: int64

In [29]:
# LotFrontage is numeric, so its missing values are replaced with zero.
# The filled column is assigned back instead of calling
# df['LotFrontage'].fillna(0, inplace=True): that form mutates an intermediate
# Series and is not guaranteed to update df (it does not under pandas
# copy-on-write).
df['LotFrontage'] = df['LotFrontage'].fillna(0)

In [35]:
# Categorical garage columns: fill missing values with the category 'NA'.
# Assigned back rather than using fillna(inplace=True) on a column selection,
# which mutates an intermediate Series and is not guaranteed to update df
# (it does not under pandas copy-on-write).
garage_quality_cols = ['GarageFinish', 'GarageCond', 'GarageQual']
df[garage_quality_cols] = df[garage_quality_cols].fillna('NA')

In [36]:
# Remaining missing values per column after the garage fills, largest first
df.isna().sum().sort_values(ascending=False)

GarageYrBlt     81
GarageType      81
BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
                ..
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
SalePrice        0
Length: 81, dtype: int64

In [40]:
# More categorical columns: fill missing values with the category 'NA'.
# Assigned back rather than using fillna(inplace=True) on a column selection,
# which mutates an intermediate Series and is not guaranteed to update df
# (it does not under pandas copy-on-write).
garage_basement_cols = ['GarageType', 'BsmtExposure', 'BsmtFinType2', 'BsmtQual']
df[garage_basement_cols] = df[garage_basement_cols].fillna('NA')

In [41]:
# Remaining missing values per column, largest first
df.isna().sum().sort_values(ascending=False)

GarageYrBlt     81
BsmtFinType1    37
BsmtCond        37
MasVnrType       8
MasVnrArea       8
                ..
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
SalePrice        0
Length: 81, dtype: int64

In [44]:
# Every fill below is assigned back rather than using fillna(inplace=True) on
# a column selection, which mutates an intermediate Series and is not
# guaranteed to update df (it does not under pandas copy-on-write).

# Categorical basement columns: fill missing values with the category 'NA'
basement_cols = ['BsmtFinType1', 'BsmtCond']
df[basement_cols] = df[basement_cols].fillna('NA')

# For MasVnrType the fill category is 'None', not 'NA'
df['MasVnrType'] = df['MasVnrType'].fillna('None')

# MasVnrArea is numeric, so its missing values are replaced with zero
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [45]:
# Final check of missing values per column, largest first
df.isna().sum().sort_values(ascending=False)

GarageYrBlt    81
Electrical      1
CentralAir      0
GarageType      0
FireplaceQu     0
               ..
MasVnrArea      0
MasVnrType      0
Exterior2nd     0
Exterior1st     0
SalePrice       0
Length: 81, dtype: int64

In [54]:
# Discard the row(s) whose Electrical value is missing; the original index labels are kept
df = df.dropna(subset=['Electrical'])

In [78]:
# Frequency of each GarageType category. The number of rows in the 'NA'
# category (no garage type) can be compared with the number of rows where
# GarageYrBlt is null.
df['GarageType'].value_counts()

Attchd     870
Detchd     387
BuiltIn     87
NA          81
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64

In [67]:
# Summarise each column's non-null count and dtype after the cleaning steps
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1459 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          1459 non-null   object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1459 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

The variables below are categorical and should have the object data type rather than int or float:
- MSSubClass    
- OverallQual    
- OverallCond    
- BsmtFullBath   
- BsmtHalfBath
- FullBath
- HalfBath
- BedroomAbvGr
- KitchenAbvGr
- TotRmsAbvGrd
- Fireplaces
- GarageCars

In [79]:
# Recast the integer-coded columns listed above to object dtype so that they
# are handled as categorical variables
to_object_cols = [
    'MSSubClass',
    'OverallQual',
    'OverallCond',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
]
df[to_object_cols] = df[to_object_cols].astype('object')

In [80]:
# Report the dtype of each column that is meant to be categorical
recast_cols = [
    'MSSubClass',
    'OverallQual',
    'OverallCond',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
]
df.dtypes.loc[recast_cols]


MSSubClass      object
OverallQual     object
OverallCond     object
BsmtFullBath    object
BsmtHalfBath    object
FullBath        object
HalfBath        object
BedroomAbvGr    object
KitchenAbvGr    object
TotRmsAbvGrd    object
Fireplaces      object
GarageCars      object
dtype: object

In [82]:
# (rows, columns) of the frame at this point in the cleaning
df.shape

(1459, 81)