# Import the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

# Change Working Directory to where the data file resides

In [2]:
os.chdir("C:\\Users\\Manish Kumar\\Documents\\Git-Repository\\Data-Science\\Height-Weight-Regression-Problem")
df=pd.read_csv("data.csv")

# Inspect the data in general and do some pre-processing activities

This includes:
1. checking the shape i.e. Number of rows and columns for the data
2. Checking for Null Values

In [3]:
df.shape

(25000, 3)

In [4]:
df.head(10)

Unnamed: 0,Index,Height(Inches),Weight(Pounds)
0,1,65.78331,112.9925
1,2,71.51521,136.4873
2,3,69.39874,153.0269
3,4,68.2166,142.3354
4,5,67.78781,144.2971
5,6,68.69784,123.3024
6,7,69.80204,141.4947
7,8,70.01472,136.4623
8,9,67.90265,112.3723
9,10,66.78236,120.6672


In [5]:
print("Null Values")
for cols in df.columns:
    null_count=df[cols].isnull().sum()
    print(cols + " : " + str(null_count))

Null Values
Index : 0
Height(Inches) : 0
Weight(Pounds) : 0


# PTRATIO field needs to be treated for NULL values as there exist 54 such records.
Replacing them with Mean value of the same column

In [13]:
from sklearn.preprocessing import Imputer
imputer=Imputer(missing_values='NaN',strategy='mean',axis=0)
imputer=imputer.fit(df.iloc[:,10:11])
df.iloc[:,10:11]=imputer.transform(df.iloc[:,10:11])



ValueError: Found array with 0 feature(s) (shape=(25000, 0)) while a minimum of 1 is required.

In [None]:
print("Null Values")
for cols in df.columns:
    null_count=df[cols].isnull().sum()
    print(cols + " : " + str(null_count))

# Selecting the independent and Dependent variables
All fields strating from CRIM to LSTAT are independent variables, and the one on the right most i.e. MEDV is the dependent variable.

In [None]:
x=df.iloc[:,0:13]
y=df.iloc[:,13]

In [None]:
x.head()

In [None]:
y.head()

# Creating the Training Set and the Test Set

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=0 )

In [None]:
x_train.head()

In [None]:
y_train.head()

# Building a Linear Regression Model and predicting the values of MEDV field

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)

In [None]:
y_pred=regressor.predict(x_test)

In [None]:
regressor.coef_

In [None]:
regressor.intercept_

In [None]:
from sklearn.metrics import r2_score
r2=r2_score(y_test,y_pred)
print("R-Squared = " + str(r2))

R-Squared for the linear regression model comes out to be 0.53 which means that all the independent variables put together only explain ~53% of the dependent variable MEDV.

To improve the predictability of this linear model we will need to apply Feature Selection technique to only select the features that explain the dependent variable considerably and leave out the rest.

First of all trying to implement Feature Selection method using Filtering method (which inherently uses Pearson Corelation co-efficient)

In [None]:
corr1=df.corr()
print(corr1)

In [None]:
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
cols = abs(cor["MEDV"])
selected_features = cols[cols>0.5]
print(selected_features)

What I found here is that the feature LSTAT has a very high co-relation co-efficient for MEDV. This could be the feature that explains the dependent variable the most. I will need to find out by creating another linear regression model using just LSTAT as the independent variable and MEDV as the dependent variable.

In [None]:
x=df.iloc[:,12:13]
y=df.iloc[:,13]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=0 )

In [None]:
regressor.fit(x_train,y_train)

In [None]:
y_pred1=regressor.predict(x_test)

In [None]:
r2=r2_score(y_test,y_pred1)
print(r2)

What we observe here is that feature selection using the filtering method resulted in even lower R-squared values. So, this model is worse in comparison to the one we created without applying feature selection technique.

Feature selection using Backwars Elimination method: Using R-Squared, Adj R-Squared values and P-Value.

In [None]:
x=df.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12]]
y=df.iloc[:,13]

In [None]:
import statsmodels.formula.api as sm
x=np.append(arr=np.ones((506,1)).astype(int),values=x,axis=1)
x_opt=x[:,[0,2,3,4,5,6,8,12]]
regressor_ols=sm.OLS(endog=y,exog=x_opt).fit()

In [None]:
regressor_ols.summary()

In [None]:
regressor_ols.pvalues

In [None]:
x=df.iloc[:,[0,2,3,4,5,6,8,12]]
y=df.iloc[:,13]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=1/3,random_state=0 )
regressor.fit(x_train,y_train)
y_pred=regressor.predict(x_test)

In [None]:
r2=r2_score(y_test,y_pred)
print(r2)

check for multicollinearity

In [None]:
x.corr()

Checking for Multicollinearilty

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif["features"] = x.columns

In [None]:
vif

# Applying Support Vector Regression Technique
This technique is not so commonly used and so it does not automatically auto-scale.
Scaling needs to be specifically applied to the data set for applying SVR technique.

# Applying Random Forest Regression Technique

In [None]:
x=df.iloc[:,[0,2,3,6,8,12]]
y=df.iloc[:,13]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=1/3,random_state=0 )
regressor.fit(x_train,y_train)
y_pred=regressor.predict(x_test)

In [None]:
r2=r2_score(y_test,y_pred)
print(r2)

What we see above is that after treating for multicollinearity, even the accuracy of multiple linear regression model increased.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif["features"] = x.columns

In [None]:
vif

In [None]:
from sklearn.ensemble import RandomForestRegressor

for i in range(1, 26):
    regressor = RandomForestRegressor(n_estimators = i, random_state = 0)
    regressor.fit(x, y)
    y_pred=regressor.predict(x_test)
    r2=r2_score(y_test,y_pred)
    print('Estimators = ' + str(i) + ', R2 = ' + str(r2))

In [None]:
res=pd.DataFrame()

In [None]:
res['Actual Values']=y_test
res['Predicted Values']=y_pred
print(res)