In [None]:
# Import necessary Python libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import os
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Step 1: Inspecting the DataFrame to understand the provided data

In [None]:
# Read the training data.
# The location can be overridden with the HOUSE_PRICE_TRAIN_CSV environment
# variable so the notebook runs on other machines; when the variable is unset
# it falls back to the original absolute path.
DATA_PATH = os.environ.get(
    'HOUSE_PRICE_TRAIN_CSV',
    'E:/IIITB_Upgrad_AI_ML_Course/AdvancedLinearRegression/HousepricingAssignment/train.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
#Checking size of the data

df.shape

In [None]:
#Checking data types of columns and null value analysis if any

df.info()

In [None]:
#display basic statistical information for the data
df.describe()

### Step 2: Data Cleansing Process -> Handling null & missing values

In [None]:
# check if any missing values are present in the data
df.isnull().sum().sum()

In [None]:
# display the column names that have missing values
df.columns[df.isnull().any()]

In [None]:
# Print the number of missing entries for every column that has at least one
null_counts = df.isnull().sum()
columns_nan = df.columns[null_counts > 0]

for col in columns_nan:
    print(col, null_counts[col])

In [None]:
# Share of missing values per column (in percent), largest first,
# used to decide how each column's gaps should be handled
missing_share = df.isnull().sum() / len(df.index) * 100
nullval = pd.DataFrame(
    round(missing_share, 2).sort_values(ascending=False),
    columns=["Null values in %"],
)
nullval.index.name = 'Feature names'
nullval.head(10)

#### Observations: we will drop the 'PoolQC', 'MiscFeature', 'Alley', 'Fence' and 'FireplaceQu' columns because they have too many missing values, and the 'Id' column because it is not required.


In [None]:
# Remove the mostly-empty columns together with the row identifier
sparse_or_id_columns = ['PoolQC', 'MiscFeature', 'Id', 'Alley', 'Fence', 'FireplaceQu']
df = df.drop(columns=sparse_or_id_columns)

In [None]:
# Recompute the missing-value percentages after the drop and keep only
# the columns that still contain gaps
remaining_pct = round(df.isnull().sum() / len(df.index) * 100, 2)
null = pd.DataFrame(remaining_pct.sort_values(ascending=False), columns=["Null %"])
null.index.name = 'Features'
null_df = null.loc[null["Null %"] > 0]
null_df

In [None]:
df.shape

In [None]:
df.columns

In [None]:
#Categorical columns
df.select_dtypes(include='object').columns

In [None]:
# Numeric columns
df.select_dtypes(exclude='object').columns

In [None]:
# Show the value distribution of every column that still has missing values
print('The unique values in columsn with highest number if nan or missing values')
columns_with_gaps = ['LotFrontage', 'GarageCond', 'GarageType', 'GarageYrBlt', 'GarageFinish',
                     'GarageQual', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond',
                     'BsmtQual', 'MasVnrArea', 'MasVnrType', 'Electrical']
for column_name in columns_with_gaps:
    # Blank separator first, then "<column>:  <value counts>"
    print('\n')
    print(column_name + ': ', df[column_name].value_counts())

In [None]:
#Checking for outlier in the numerical columns
df.describe(percentiles=[.25,.5,.75,.90,.95,.99])

In [None]:

# LotFrontage and GarageYrBlt contain outliers, so their gaps are filled
# with the median rather than the mean
for num_col in ('LotFrontage', 'GarageYrBlt'):
    df[num_col] = df[num_col].fillna(df[num_col].median())

# Every other column with gaps (garage, basement, masonry veneer, electrical)
# is filled with its most frequent value
mode_filled_columns = (
    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrArea', 'MasVnrType', 'Electrical',
)
for col in mode_filled_columns:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

    

In [None]:
# Re-check the share of missing values per column after imputation.
# null_df keeps only the columns that still have gaps; it should be empty
# if the imputation covered every affected column.
null = pd.DataFrame(round(df.isnull().sum()/len(df.index)*100,2).sort_values(ascending=False),columns=["Null %"])
null.index.name = 'Features'
null_df = null[null["Null %"] > 0]
null_df

In [None]:
# checking for the presence of any more null values
df.isnull().values.any()

In [None]:
# check Null value
df.isnull().sum()

#### Observation: Data is clean and filled with related values

In [None]:
# Check the shape
df.shape

In [None]:
df.describe()

In [None]:
# Inspect how the values of three candidate columns are distributed
for candidate in ('PoolArea', 'MiscVal', '3SsnPorch'):
    print(df[candidate].value_counts())

In [None]:
# These columns are dominated by a single value and add no information
# to the model, so they are removed
single_value_columns = ['PoolArea', 'MiscVal', '3SsnPorch']
df = df.drop(columns=single_value_columns)

In [None]:
df.shape

In [None]:
df.describe()

### Step 3: EDA - Data visualization

In [None]:
# Correlation map to see how the numeric features are correlated with SalePrice.
# Only the non-object columns are correlated: the frame still holds
# string-valued categorical columns at this point, and DataFrame.corr()
# raises on them in pandas >= 2.0 instead of silently skipping them.
corrmat = df.select_dtypes(exclude='object').corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)

#### From the above we can see that some of the numerical columns are highly correlated with the sale price
- LotFrontage
- OverallQual
- YearBuilt
- YearRemodAdd
- MasVnrArea
- TotalBsmtSF
- 1stFlrSF
- GrLivArea
- FullBath
- Fireplaces
- GarageArea


#### We will retain them in our consideration during model evaluation

In [None]:
# Remove outlier rows using an IQR-style fence on the numeric columns.
# NOTE: despite their names, Q1 and Q3 below are the 5th and 95th percentiles
# (not the 25th/75th), so "IQR" is the 5%-95% spread and the fences are much
# wider than a classic 1.5*IQR rule.
cols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
         'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch','ScreenPorch', 'MoSold', 'YrSold', 'SalePrice'] # columns checked for outliers

Q1 = df[cols].quantile(0.05)  # lower reference quantile (5th percentile)
Q3 = df[cols].quantile(0.95)  # upper reference quantile (95th percentile)
IQR = Q3 - Q1

# Drop every row that falls outside the fence in at least one of the columns
df = df[~((df[cols] < (Q1 - 1.5 * IQR)) |(df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

In [None]:
df.info()

#### Let us also check which of the provided features are most correlated with SalePrice.

In [None]:
# Annotated heatmap of the k columns most correlated with SalePrice.
# The ranking is taken from corrmat (computed in an earlier cell), while the
# coefficients drawn are recomputed with np.corrcoef from the current df.
k = 10
# nlargest ranks by the SalePrice column of corrmat, so SalePrice itself is part of the selection
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

#### We can see from the above chart that the top 10 variables are similar to what we had initially identified
- OverallQual - a house of better overall quality fetches a higher price
- GrLivArea - the larger the above-ground living area, the higher the price
- GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF - their positive relationship with price also makes sense

#### Plot graphs between some of these important numerical variables and see if we see any pattern

In [None]:
# Pairwise scatter plots of SalePrice against a few important numeric columns.
# `height` is the current name of the facet-size argument (the older `size`
# spelling is deprecated); it is also what the FacetGrid call in this notebook uses.
sns.set()
cols = ['SalePrice', 'GrLivArea', 'GarageCars', 'BsmtUnfSF', 'BsmtFinSF1', 'GarageArea', 'TotalBsmtSF', 'YearBuilt', 'TotRmsAbvGrd', 'GarageYrBlt']
sns.pairplot(df[cols], height=2.5)
plt.show()

##### Drop columns that are correlated and not contributing to 'SalePrice'

In [None]:
# Remove the four columns in one call, then preview the result
redundant_columns = ['GarageCars', 'BsmtUnfSF', 'TotRmsAbvGrd', 'GarageYrBlt']
df = df.drop(columns=redundant_columns)

df.head()

In [None]:
df.shape

In [None]:
# Distribution of the target variable SalePrice
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn
# releases (sns.histplot(..., kde=True) is the suggested successor) — confirm
# against the installed version; warnings are silenced at the top of the notebook.
plt.figure()
sns.distplot(df['SalePrice'],color='b')
plt.show()

#### Target variable 'sale Price' vs a few select columns

In [None]:
# Analyse some important numeric columns
sns.jointplot(x='GrLivArea', y='SalePrice', data=df)
plt.show()

In [None]:
# Lot frontage vs SalePrice 
sns.jointplot(x = df['LotFrontage'], y = df['SalePrice'])
plt.show()

In [None]:
# LotArea vs SalePrice
sns.jointplot(x = df['LotArea'], y = df['SalePrice'])
plt.show()

In [None]:
# 1stFlrSF vs SalePrice
sns.jointplot(x = df['1stFlrSF'], y = df['SalePrice'])
plt.show()

In [None]:
# 2ndFlrSF vs SalePrice
sns.jointplot(x = df['2ndFlrSF'], y = df['SalePrice'])
plt.show()

In [None]:
# OverallQual vs SalePrice
sns.jointplot(x = df['OverallQual'], y = df['SalePrice'])
plt.show()

In [None]:
# OverallCond vs SalePrice
sns.jointplot(x=df['OverallCond'], y = df['SalePrice'])
plt.show()

#### Observations:
 - An increase in the overall quality has a direct positive effect on the sale price.
 - Single-storey houses (those with 0 sq. ft. on the second floor) also show a steady increase in sale price.

In [None]:
df.shape

#### We can derive a column for 'Age of the property' when it was sold: Name it as 'PropAge'

In [None]:
# PropAge: age of the property at the time of sale, in years (YrSold - YearBuilt).
# The source columns YrSold and YearBuilt are kept in df by this cell.
df['PropAge'] = (df['YrSold'] - df['YearBuilt'])
df.head()

In [None]:
# PropAge vs SalePrice
sns.jointplot(x = df['PropAge'], y = df['SalePrice'])
plt.show()

#### Observations:
 - Sale price shows a decreasing trend as property age increases, i.e. the newer the property, the higher its value.
 - The columns MoSold, YrSold, YearBuilt and YearRemodAdd could be dropped now that PropAge captures the age of the property; note that the cells below keep them in the feature set.

In [None]:
df.shape

#### Handling of Categorical Columns

In [None]:
#Categorical columns
df.select_dtypes(include='object').columns

#### Analyzing Categorical Data for presence of Outliers

In [None]:
# Names of all string-typed (categorical) columns, in column order
qualitative = df.select_dtypes(include='object').columns.tolist()

In [None]:
# Convert every categorical column to the pandas 'category' dtype; any
# remaining gaps become an explicit 'MISSING' level
for c in qualitative:
    as_category = df[c].astype('category')
    if as_category.isnull().any():
        as_category = as_category.cat.add_categories(['MISSING']).fillna('MISSING')
    df[c] = as_category


def boxplot(x, y, **kwargs):
    """Draw a box plot of y grouped by x with vertical tick labels (FacetGrid callback)."""
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: one row per (SalePrice, categorical column, level) so that
# FacetGrid can draw one panel per categorical column
f = pd.melt(df, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5).map(
    boxplot, "value", "SalePrice"
)

#### Observation: Most of these variables have a varied relationship with the sale price; we quantify it for each of them below.

In [None]:
def anova(frame, features=None, target='SalePrice'):
    """Run a one-way ANOVA of `target` against each categorical feature.

    For every feature the rows are grouped by the feature's levels and
    scipy.stats.f_oneway is applied to the groups' `target` values.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the feature columns and the target column.
    features : iterable of str, optional
        Columns to test. Defaults to the notebook-level `qualitative` list,
        which preserves the original call `anova(df)`.
    target : str, default 'SalePrice'
        Name of the numeric column whose group means are compared.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'pval', sorted by ascending p-value.
    """
    if features is None:
        features = qualitative
    features = list(features)

    pvals = []
    for feature in features:
        # One sample of target values per observed level; missing levels are
        # skipped because `== NaN` would only produce an empty sample.
        samples = [
            frame.loc[frame[feature] == level, target].values
            for level in frame[feature].dropna().unique()
        ]
        pvals.append(stats.f_oneway(*samples)[1])

    anv = pd.DataFrame({'feature': features, 'pval': pvals})
    return anv.sort_values('pval')

# Rank the categorical features by how strongly SalePrice differs between their levels.
a = anova(df)
# 'disparity' = log(1/p): the smaller the p-value, the taller the bar
a['disparity'] = np.log(1./a['pval'].values)
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)

#### Observation: From the above chart we can see that some of the most influential variables are:
- Neighborhood
- ExterQual
- BsmtQual
- KitchenQual

### Step 4: Data preparation for Model generation

In [None]:
# Convert the categorical columns into dummy (indicator) variables.
# drop_first=True omits the first level of each column.
df=pd.get_dummies(df,drop_first=True)
df.head()

In [None]:
#removing response variable from the set
y = df.pop('SalePrice')
y.head()

In [None]:
X = df
X.shape

In [None]:
#import train_test_split to split the data
from sklearn.model_selection import train_test_split

In [None]:
# Split into train (70%) and test (30%); random_state is fixed so the
# split is the same on every run
X_train, X_test, y_train,  y_test = train_test_split(X, y, 
                                                    train_size=0.7,
                                                    test_size = 0.3, random_state=100)

In [None]:
X_train.head()

In [None]:
# Check for no. of rows and columns in Train and Test data
print('X_train shape',X_train.shape)
print('X_test shape',X_test.shape)
print('y_train shape',y_train.shape)
print('y_test shape',y_test.shape)

##### Scaling of numeric variables

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
# columns to be scaled
X_train.select_dtypes(include=['int64','int32','float64','float32']).columns

In [None]:

# Columns to scale: every numeric (non-dummy) column still present in X_train.
# The list is derived with the same dtype filter as the "columns to be scaled"
# cell above instead of being typed by hand. The hand-written list had left
# out YearBuilt, YearRemodAdd, MoSold, YrSold and the derived PropAge, so those
# stayed on their original scale next to [0, 1]-scaled features.
num_vars = X_train.select_dtypes(include=['int64','int32','float64','float32']).columns.tolist()
X_train[num_vars].head()

In [None]:
X_train.describe()

In [None]:
X_train.head()

In [None]:
#Scaling using MinMax
from sklearn.preprocessing import StandardScaler,MinMaxScaler
#scaler = StandardScaler()
scaler=MinMaxScaler()

In [None]:
# Fit the scaler on the training split only and reuse the fitted scaler on
# the test split, so no test-set statistics enter the scaling
X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test[num_vars] = scaler.transform(X_test[num_vars])

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
X_train.describe()

In [None]:
X_train.shape

### Step 5: Model Building and Evaluation Phase Starts

##### The objectives are to build models with and without the RFE-selected feature set and to evaluate model performance.

 - Create linear regression model.
 - Use Ridge and Lasso
 - Compare to represent final model

##### 5.1.  Linear Regression model on Train data set without RFE 

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [None]:
# Baseline: plain linear regression fitted on ALL features (no RFE)
lm = LinearRegression()
lm.fit(X_train, y_train)

# Separately, run RFE with the same estimator to pick a reduced feature set
# for the later "with RFE" models.
# Since there are more than 200 variables for analysis, we will run RFE to select some that have high predictive power
# running RFE for top 100 variables
rfe = RFE(lm, n_features_to_select=100)   
rfe = rfe.fit(X_train, y_train)

In [None]:
# Place holder for columns with good score of RFE
col = X_train.columns[rfe.support_]
col

In [None]:
# Columns may not be considered for model evaluation using RFE
X_train.columns[~rfe.support_]

In [None]:
# Creating related rfe dataframe(s) with RFE selected variables
X_train_rfe=X_train[col]
X_test_rfe=X_test[col]
print(X_train_rfe.shape)
print(X_test_rfe.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Rebuilds X_train_rfe from the same RFE-selected columns `col` of X_train
X_train_rfe = pd.DataFrame(X_train[col])

In [None]:
# Print the coefficients and intercept
print(lm.intercept_)
print(lm.coef_)

In [None]:
# Check the ranks
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
#import libraries for model evalution
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# R2, RSS and RMSE of the linear regression fitted on all features (no RFE)
y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

# Goodness of fit
r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
# Residual sum of squares
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
# Mean squared error (printed as-is; its square root is what gets stored)
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
          mse_train_lr**0.5, mse_test_lr**0.5]

#### Observations:
 - The linear model without RFE overfits the training data.
 - The R2 score it produces on the test data is negative.
 
##### In the following steps, Ridge and Lasso are applied to the full training set (without RFE column selection) to compare the results

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

##### 5.2.  Using  Ridge Regression

In [None]:
# list of hyperparameter values or alphas to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


ridge = Ridge()

# cross validation: 5 folds, candidates ranked by negative mean absolute error
# (`folds` is reused by the Lasso search in a later cell)
folds = 5
ridge_model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
ridge_model_cv.fit(X_train, y_train) 

In [None]:
# Check for the best hyperparameter alpha
print(ridge_model_cv.best_params_)
print(ridge_model_cv.best_score_)

In [None]:
# Refit Ridge on the whole training split with the alpha selected by the
# grid search above. The value is read from the search result rather than
# typed by hand, so the model cannot drift out of sync with the
# cross-validation when the data preparation changes.
alpha = ridge_model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
ridge.coef_

In [None]:
# R2, RSS and RMSE of the Ridge model fitted on all features
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric2 = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
           mse_train_lr**0.5, mse_test_lr**0.5]

##### Observations: 
 - The Ridge model above scores better than the non-regularised linear model seen earlier. 

#### 5.2.  Using  Lasso Regression

In [None]:
# list of hyperparameter values (alphas) to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

# (A pasted GridSearchCV(...) repr that only built an unused object was removed here.)
lasso = Lasso()

## Lasso regression auto selects important features.
# cross validation: `folds` is the fold count set in the Ridge search cell;
# candidates are ranked by negative mean absolute error
lasso_model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

lasso_model_cv.fit(X_train, y_train)

In [None]:
# Check for best hyperparameter alpha value.
print(lasso_model_cv.best_params_)
print(lasso_model_cv.best_score_)

##### The optimum value of alpha is 100

In [None]:
# Refit Lasso on the whole training split with the alpha selected by the
# grid search above, read from the search result instead of a hand-typed
# literal so it always matches the cross-validation outcome.
alpha = lasso_model_cv.best_params_['alpha']

lasso = Lasso(alpha=alpha)

lasso.fit(X_train, y_train)

In [None]:
lasso.coef_

In [None]:
# R2, RSS and RMSE of the Lasso model fitted on all features
y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric3 = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
           mse_train_lr**0.5, mse_test_lr**0.5]

In [None]:
# Table with all the metrics, one column per model.
# The last two entries of each metric list are square roots of the MSE
# (mse**0.5), so those rows are labelled RMSE rather than MSE.

lr_table = {'Metric': ['R2 Score (Train)','R2 Score (Test)','RSS (Train)','RSS (Test)',
                       'RMSE (Train)','RMSE (Test)'], 
        'Linear Regression': metric
        }

pd.options.display.float_format = '{:.2f}'.format
lr_metric = pd.DataFrame(lr_table ,columns = ['Metric', 'Linear Regression'] )

rg_metric = pd.Series(metric2, name = 'Ridge Regression')
ls_metric = pd.Series(metric3, name = 'Lasso Regression')

# Column-wise concat aligns the three models on the shared 0..5 row index
final_metric = pd.concat([lr_metric, rg_metric, ls_metric], axis = 1)

final_metric

#### Observations:
  - Both Ridge and Lasso produce a better performing model, with Lasso outperforming Ridge slightly.

#### 5.3 Predictions and Normality of Residuals with columns not having RFE

In [None]:
# Using predictions generated by ridge method.
ridge_pred = ridge.predict(X_test)

In [None]:
# Actual vs. predicted sale price on the test split for the Ridge model
fig, ax = plt.subplots(dpi=100)
ax.scatter(y_test, ridge_pred)
fig.suptitle('y_test vs ridge_pred', fontsize=20)   # plot heading
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('ridge_pred', fontsize=16)
plt.show()

##### Observation(s): 
 - The points lie close to a straight line, i.e. the Ridge predictions follow the actual test values in a roughly linear relationship.

In [None]:
# Residuals of the Ridge model on the test split (actual - predicted)
y_res=y_test-ridge_pred
# Distribution of errors
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn releases — confirm against the installed version
sns.distplot(y_res,kde=True)
plt.title('Normality of error terms/residuals Ridge')
plt.xlabel("Residuals")
plt.show()

In [None]:
# Using predictions generated by lasso method.
lasso_pred = lasso.predict(X_test)

In [None]:
# Actual vs. predicted sale price on the test split for the Lasso model
fig, ax = plt.subplots(dpi=100)
ax.scatter(y_test, lasso_pred)
fig.suptitle('y_test vs lasso_pred', fontsize=20)   # plot heading
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('lasso_pred', fontsize=16)
plt.show()

##### Observation(s): 
 - The Lasso predictions also follow the actual test values in a roughly linear relationship, similar to Ridge with slight variation.

In [None]:
# Residuals of the Lasso model on the test split (actual - predicted)
y_res=y_test-lasso_pred
# Distribution of errors
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn releases — confirm against the installed version
sns.distplot(y_res,kde=True)
plt.title('Normality of error terms/residuals Lasso')
plt.xlabel("Residuals")
plt.show()

####  5.4 Check for Changes in Coefficients 

In [None]:
betas = pd.DataFrame(index=X_train.columns)

In [None]:
# NOTE(review): `rows` does not appear to be a DataFrame property, so this
# presumably only attaches a plain attribute; betas was already created with
# X_train.columns as its index — confirm this line is needed at all.
betas.rows = X_train.columns

In [None]:
betas['Linear'] = lm.coef_
betas['Ridge'] = ridge.coef_
betas['Lasso'] = lasso.coef_

In [None]:
pd.set_option('display.max_rows', None)
betas.head(70)

In [None]:
# Lasso coefficients on their own, indexed by feature name.
# The index is set in the constructor; the former `betas.rows = ...`
# assignment was dropped because it only attached an unused attribute.
betas = pd.DataFrame(index=X_train.columns)
betas['Lasso'] = lasso.coef_
betas.head(70)

#### 6. Model Evaluation

In [None]:
# Plain linear regression restricted to the RFE-selected columns
lm_rfe = LinearRegression().fit(X_train_rfe, y_train)

# Predictions on both splits for the evaluation that follows
y_pred_train, y_pred_test = lm_rfe.predict(X_train_rfe), lm_rfe.predict(X_test_rfe)

In [None]:
# R2, RSS and RMSE of the linear regression fitted on the RFE-selected columns
y_pred_train = lm_rfe.predict(X_train_rfe)
y_pred_test = lm_rfe.predict(X_test_rfe)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
          mse_train_lr**0.5, mse_test_lr**0.5]

#### Observations:
 - The linear model produces a negative R2 score on the test data, which indicates severe overfitting. 
  - R2 is negative only when the chosen model does not follow the trend of the data it is evaluated on.
  - The fit is worse than a horizontal line (predicting the mean), so the generated model is highly problematic.
  - It shows a high score on the training data, but this is due to the model memorising the training data and overfitting it. 
##### In the following step, Ridge and Lasso are applied to compare the result.

In [None]:
# list of alphas to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


# NOTE: this rebinds `ridge` and `ridge_model_cv`; from here on they refer to
# the models built on the RFE-selected columns
ridge = Ridge()

# cross validation: 5 folds, candidates ranked by negative mean absolute error
folds = 5
ridge_model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
ridge_model_cv.fit(X_train_rfe, y_train) 

In [None]:
# Printing the best hyperparameter alpha
print(ridge_model_cv.best_params_)
print(ridge_model_cv.best_score_)

In [None]:
# Refit Ridge on the RFE-selected columns with the alpha chosen by the grid
# search above, read from the search result instead of a hand-typed literal.
alpha = ridge_model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)
ridge.fit(X_train_rfe, y_train)
ridge.coef_

In [None]:
# R2, RSS and RMSE of the Ridge model fitted on the RFE-selected columns.
# The results are stored in `metric2` (the Ridge slot of the comparison
# table). They were previously written to `metric`, which overwrote the
# linear-regression RFE metrics and left stale non-RFE Ridge numbers in
# `metric2`, so the comparison table below showed the wrong models.
y_pred_train = ridge.predict(X_train_rfe)
y_pred_test = ridge.predict(X_test_rfe)

metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print('r2 train: ', r2_train_lr)
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
print('r2 test: ',r2_test_lr)
metric2.append(r2_test_lr)

rss1_lr = np.sum(np.square(y_train - y_pred_train))
print('rss1: ',rss1_lr)
metric2.append(rss1_lr)

rss2_lr = np.sum(np.square(y_test - y_pred_test))
print('rss2: ',rss2_lr)
metric2.append(rss2_lr)

# The MSE is printed; its square root (RMSE) is what gets stored
mse_train_lr = mean_squared_error(y_train, y_pred_train)
print('MSE train: ',mse_train_lr)
metric2.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)
print('MSE test: ',mse_test_lr)
metric2.append(mse_test_lr**0.5)

In [None]:
# list of alphas to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

# (A pasted GridSearchCV(...) repr that only built an unused object was removed here.)
lasso = Lasso()

## Lasso regression auto selects important features.
# cross validation on the RFE-selected columns; `folds` is the fold count
# set in the Ridge search cell
lasso_model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

lasso_model_cv.fit(X_train_rfe, y_train)

In [None]:
print(lasso_model_cv.best_params_)
print(lasso_model_cv.best_score_)

In [None]:
# Refit Lasso on the RFE-selected columns with the alpha chosen by the grid
# search above, read from the search result instead of a hand-typed literal.
alpha = lasso_model_cv.best_params_['alpha']

lasso = Lasso(alpha=alpha)

lasso.fit(X_train_rfe, y_train)

In [None]:
# R2, RSS and RMSE of the Lasso model fitted on the RFE-selected columns
y_pred_train = lasso.predict(X_train_rfe)
y_pred_test = lasso.predict(X_test_rfe)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric3 = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
           mse_train_lr**0.5, mse_test_lr**0.5]

In [None]:
# Table with all the metrics for the models built on the RFE-selected columns.
# The last two entries of each metric list are square roots of the MSE
# (mse**0.5), so those rows are labelled RMSE rather than MSE.

lr_table = {'Metric': ['R2 Score (Train)','R2 Score (Test)','RSS (Train)','RSS (Test)',
                       'RMSE (Train)','RMSE (Test)'], 
        'Linear Regression': metric
        }

pd.options.display.float_format = '{:.4f}'.format
lr_metric = pd.DataFrame(lr_table ,columns = ['Metric', 'Linear Regression'] )

rg_metric = pd.Series(metric2, name = 'Ridge Regression')
ls_metric = pd.Series(metric3, name = 'Lasso Regression')

# Column-wise concat aligns the three models on the shared 0..5 row index
final_metric = pd.concat([lr_metric, rg_metric, ls_metric], axis = 1)

final_metric

#### 6.1 Predictions and Normality of Residuals with columns generated using RFE

In [None]:
ridge_pred_rfe = ridge.predict(X_test_rfe)

In [None]:
# Actual vs. predicted sale price on the test split for the Ridge model (RFE columns)
fig, ax = plt.subplots(dpi=100)
ax.scatter(y_test, ridge_pred_rfe)
fig.suptitle('y_test vs ridge_pred', fontsize=20)   # plot heading
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('ridge_pred', fontsize=16)
plt.show()

##### Observation(s): 
 - The points lie close to a straight line, i.e. the Ridge predictions on the RFE columns follow the actual test values in a roughly linear relationship.

In [None]:
# Residuals of the Ridge model (RFE columns) on the test split (actual - predicted)
y_res=y_test-ridge_pred_rfe
# Distribution of errors
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn releases — confirm against the installed version
sns.distplot(y_res,kde=True)
plt.title('Normality of error terms/residuals')
plt.xlabel("Residuals")
plt.show()

In [None]:
lasso_pred_rfe = lasso.predict(X_test_rfe)

In [None]:
# Actual vs. predicted sale price on the test split for the Lasso model (RFE columns)
fig, ax = plt.subplots(dpi=100)
ax.scatter(y_test, lasso_pred_rfe)
fig.suptitle('y_test vs lasso_pred', fontsize=20)   # plot heading
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('lasso_pred', fontsize=16)
plt.show()

In [None]:
# Residuals of the Lasso model (RFE columns) on the test split (actual - predicted)
y_res=y_test-lasso_pred_rfe
# Distribution of errors
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn releases — confirm against the installed version
sns.distplot(y_res,kde=True)
plt.title('Normality of error terms/residuals')
plt.xlabel("Residuals")
plt.show()

####  6.2 Check for Changes in Coefficients 

In [None]:
# Coefficients of the three models fitted on the RFE-selected columns, side
# by side and indexed by feature name. The frame used to be created with an
# index only and never filled, so the comparison display was empty.
# `ridge` and `lasso` are the models most recently fitted on X_train_rfe.
betas_rfe = pd.DataFrame(
    {'Linear': lm_rfe.coef_, 'Ridge': ridge.coef_, 'Lasso': lasso.coef_},
    index=X_train_rfe.columns,
)

In [None]:
betas_rfe.rows = X_train_rfe.columns

In [None]:
pd.set_option('display.max_rows', None)
betas_rfe

#### 7. Analysis related to the subjective questions
 

- Question 1: What happens if you double the ideal alpha value?

#### Calculations for Ridge related :

In [None]:
# Best optimal hyper parameter considered during ridge regression model generation {'alpha': 3.0} without RFE
# Doubling the value from 3 to 6
# (the doubled value is typed by hand; keep it in sync with the alpha used for the non-RFE Ridge model)
alpha = 6
ridge_Double = Ridge(alpha=alpha)
ridge_Double.fit(X_train, y_train)

In [None]:
# R2, RSS and RMSE of the Ridge model with doubled alpha (all features)
y_pred_train = ridge_Double.predict(X_train)
y_pred_test = ridge_Double.predict(X_test)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric2 = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
           mse_train_lr**0.5, mse_test_lr**0.5]


In [None]:
# Best optimal hyper parameter considered during ridge regression model generation {'alpha': 1.0} with RFE

In [None]:
# Doubling the value from 1 to 2.
# (the doubled value is typed by hand; keep it in sync with the alpha used for the RFE Ridge model)
alpha = 2
ridge_Double_rfe = Ridge(alpha=alpha)
ridge_Double_rfe.fit(X_train_rfe, y_train)

In [None]:
# R2, RSS and RMSE of the Ridge model with doubled alpha (RFE columns)
y_pred_train = ridge_Double_rfe.predict(X_train_rfe)
y_pred_test = ridge_Double_rfe.predict(X_test_rfe)

r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.square(y_train - y_pred_train).sum()
rss2_lr = np.square(y_test - y_pred_test).sum()
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

for label, value in [('r2 train: ', r2_train_lr), ('r2 test: ', r2_test_lr),
                     ('rss1: ', rss1_lr), ('rss2: ', rss2_lr),
                     ('MSE train: ', mse_train_lr), ('MSE test: ', mse_test_lr)]:
    print(label, value)

# Order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric2 = [r2_train_lr, r2_test_lr, rss1_lr, rss2_lr,
           mse_train_lr**0.5, mse_test_lr**0.5]


#### Observations: after doubling the Ridge hyperparameter (alpha) obtained with and without RFE

  - In both cases, doubling alpha slightly decreased the R2 score on the training data and slightly increased it on the test data.

#### Lasso-related calculations:

In [None]:
# Lasso section: the model without RFE used {'alpha': 100} as its best value.
# Double it (100 -> 200) for the refit that follows.
alpha = 2 * 100

In [None]:

# Refit lasso on the full training features using the doubled alpha set above.
lasso_Double = Lasso(alpha=alpha).fit(X_train, y_train)
lasso_Double

In [None]:
# Evaluate the doubled-alpha lasso model (no RFE) on the train and test splits.
y_pred_train = lasso_Double.predict(X_train)
y_pred_test = lasso_Double.predict(X_test)

# R2, residual sum of squares and mean squared error for each split.
r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.sum(np.square(y_train - y_pred_train))
rss2_lr = np.sum(np.square(y_test - y_pred_test))
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

print('r2 train: ', r2_train_lr)
print('r2 test: ', r2_test_lr)
print('rss1: ', rss1_lr)
print('rss2: ', rss2_lr)
print('MSE train: ', mse_train_lr)
print('MSE test: ', mse_test_lr)

# metric2 keeps the square root of each MSE (RMSE), not the MSE that is printed.
metric2 = [
    r2_train_lr,
    r2_test_lr,
    rss1_lr,
    rss2_lr,
    mse_train_lr**0.5,
    mse_test_lr**0.5,
]


In [None]:
# Lasso with RFE was tuned to {'alpha': 8.0}; refit on the RFE feature set
# with twice that penalty (8 -> 16).
alpha = 16
lasso_Double_rfe = Lasso(alpha=alpha).fit(X_train_rfe, y_train)
lasso_Double_rfe

In [None]:
# Evaluate the alpha=16 lasso model (RFE features) on the train and test splits.
y_pred_train = lasso_Double_rfe.predict(X_train_rfe)
y_pred_test = lasso_Double_rfe.predict(X_test_rfe)

# R2, residual sum of squares and mean squared error for each split.
r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.sum(np.square(y_train - y_pred_train))
rss2_lr = np.sum(np.square(y_test - y_pred_test))
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

print('r2 train: ', r2_train_lr)
print('r2 test: ', r2_test_lr)
print('rss1: ', rss1_lr)
print('rss2: ', rss2_lr)
print('MSE train: ', mse_train_lr)
print('MSE test: ', mse_test_lr)

# metric2 keeps the square root of each MSE (RMSE), not the MSE that is printed.
metric2 = [
    r2_train_lr,
    r2_test_lr,
    rss1_lr,
    rss2_lr,
    mse_train_lr**0.5,
    mse_test_lr**0.5,
]


#### Observations: after doubling the Lasso hyperparameter (alpha) obtained with and without RFE

  - In both cases, doubling alpha slightly decreased the R2 score on the training data and increased it on the test data.

In [None]:
# Side-by-side coefficient table for tracking the important predictors:
# one row per X_train_rfe column, one column per fitted model.
# Row labels come from `index=`; pandas has no `.rows` attribute to assign to.
# NOTE(review): assumes `ridge` and `lasso` (fitted in earlier cells) were trained on
# the same X_train_rfe columns; otherwise the coefficient lengths will not match.
beta_vals = pd.DataFrame(
    {
        'ridge_Double_rfe': ridge_Double_rfe.coef_,
        'Ridge': ridge.coef_,
        'Lasso': lasso.coef_,
        'lasso_Double_rfe': lasso_Double_rfe.coef_,
    },
    index=X_train_rfe.columns,
)
# Global option: remove the row cap so all requested rows are rendered.
pd.set_option('display.max_rows', None)
beta_vals.head(70)

##### Question 3
- After building the model, you realised that the five most important predictor variables in the lasso model are not available in the incoming data. You will now have to create another model excluding the five most important predictor variables. Which are the five most important predictor variables now?



In [None]:
# Preview only the first rows: the notebook sets display.max_rows to None,
# so a bare `X_train_rfe` would render every row of the frame.
X_train_rfe.head()

In [None]:
# Preview the first five targets instead of rendering the whole of y_train
# (display.max_rows is None in this notebook, so nothing would be truncated).
y_train[:5]

In [None]:
# List the column labels of X_train_rfe.
X_train_rfe.columns

##### LotArea, OverallQual, YearBuilt, BsmtFinSF1 and TotalBsmtSF are the top 5 most important predictors.

In [None]:
# Remove the five top predictors from both splits so the next model
# has to be built from the remaining features only.
top5_predictors = ['LotArea', 'OverallQual', 'YearBuilt', 'BsmtFinSF1', 'TotalBsmtSF']
X_train2, X_test2 = (
    split.drop(columns=top5_predictors) for split in (X_train_rfe, X_test_rfe)
)

In [None]:
# Preview the first rows of the reduced training features.
X_train2.head()

In [None]:
# Preview the first rows of the reduced test features.
X_test2.head()

In [None]:
# Refit lasso on the reduced feature set, with alpha fixed at 8.
alpha = 8
lasso2_rfe = Lasso(alpha=alpha).fit(X_train2, y_train)
lasso2_rfe

In [None]:
# Evaluate the reduced-feature lasso model (alpha=8) on the train and test splits.
y_pred_train = lasso2_rfe.predict(X_train2)
y_pred_test = lasso2_rfe.predict(X_test2)

# R2, residual sum of squares and mean squared error for each split.
r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
rss1_lr = np.sum(np.square(y_train - y_pred_train))
rss2_lr = np.sum(np.square(y_test - y_pred_test))
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

print('r2 train: ', r2_train_lr)
print('r2 test: ', r2_test_lr)
print('rss1: ', rss1_lr)
print('rss2: ', rss2_lr)
print('MSE train: ', mse_train_lr)
print('MSE test: ', mse_test_lr)

# metric2 keeps the square root of each MSE (RMSE), not the MSE that is printed.
metric2 = [
    r2_train_lr,
    r2_test_lr,
    rss1_lr,
    rss2_lr,
    mse_train_lr**0.5,
    mse_test_lr**0.5,
]


In [None]:
# Important predictor variables of the reduced lasso model:
# one coefficient per remaining feature, labelled by the X_train2 columns.
# (Row labels are set through `index=`; pandas has no `.rows` attribute.)
betas = pd.DataFrame({'lasso2_rfe': lasso2_rfe.coef_}, index=X_train2.columns)
# Global option: remove the row cap so all requested rows are rendered.
pd.set_option('display.max_rows', None)
# Display the rows ordered by absolute coefficient, largest first, so the
# largest-magnitude predictors are at the top; `betas` itself keeps its original order.
by_magnitude = np.argsort(-np.abs(lasso2_rfe.coef_), kind='stable')
betas.iloc[by_magnitude].head(68)


 - 1stFlrSF
 - GrLivArea
 - Street_Pave
 - RoofStyle_Shed
 - BsmtExposure_Gd