In [84]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [85]:
data = pd.read_csv('1.02.Multiple-linear-regression.csv')

In [86]:
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


#### We declare the dependent and independent variables

In [87]:
x = data[['SAT','Rand 1,2,3']]
y = data['GPA']

#### Now we standardize the data set(feature scaling)

In [88]:
from sklearn.preprocessing import StandardScaler

In [89]:
scaler = StandardScaler()

In [90]:
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [91]:
x_scaled = scaler.transform(x)

In [92]:
x_scaled

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

#### Scaling the dataset prevents us from being misled by the large numbers in the data set that could intensify the magnitude of any of the features of the data

### Create the Regression

In [93]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [94]:
reg.score(x,y)

-1383037.8990380405

In [95]:
reg.coef_

array([ 0.17181389, -0.00703007])

In [96]:
reg.intercept_

3.330238095238095

### Create the Summary Table

In [97]:
reg_summary = pd.DataFrame(data = [['Bias'],['SAT'],['Rand 1,2,3']], columns = ['Features'])
reg_summary['Weights'] = reg.intercept_, reg.coef_[0], reg.coef_[1]
reg_summary

Unnamed: 0,Features,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


#### Now we run a prediction

In [98]:
new_data = pd.DataFrame(data = [[1700,2],[1800,1]], columns = ['SAT','Rand 1,2,3'])
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [99]:
reg.predict(new_data)
#prediction

array([295.39979563, 312.58821497])

#### The data generated doesn't look like an actual GPA because we did not scale the data before prediction. Now we standardize the data

In [100]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [101]:
prediction = reg.predict(new_data_scaled)
prediction

array([3.09051403, 3.26413803])

#### What if we removed the Rand 1,2,3 variable from the regression model

In [102]:
reg_simple = LinearRegression()
x_matrix = x_scaled[0:,0].reshape(-1,1)
x_matrix.shape

(84, 1)

In [103]:
reg_simple.fit(x_matrix,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [104]:
reg_simple.predict(new_data_scaled[0:,0].reshape(-1,1))

array([3.08970998, 3.25527879])

#### We can see the feature scaling already reduces the weight of variables with less predictive power as shown in this prediction. The values predicted with the Rand 1,2,3 variable are not significantly different from the values predicted without the Rand 1,2,3 variable. This is one of the reasons why p-values are not natively supported by sklearn.