In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
#override matplot styles to use seaboran
sns.set()

from sklearn.linear_model import LinearRegression

In [12]:
#using pandas method read the data file
#dat will be auto converted to dataframe
data = pd.read_csv('data/1.02. Multiple linear regression.csv')

In [5]:
data.head()

Unnamed: 0,SAT,GPA
0,1714,2.4
1,1664,2.52
2,1760,2.54
3,1685,2.74
4,1693,2.83


In [6]:
X = data['SAT']
y = data['GPA']

In [7]:
X_matrix = X.values.reshape(-1,1)

In [8]:
reg = LinearRegression()
reg.fit(x,y)
print(reg.coef_)
print(reg.intercept)

NameError: name 'x' is not defined

In [10]:
data.columns

Index(['SAT', 'GPA'], dtype='object')

In [13]:
X = data[['SAT','Rand 1,2,3']]
y = data['GPA']

In [16]:
#multiple regression
reg = LinearRegression()
reg.fit(X,y)
print(reg.coef_)
print(reg.intercept_)

[ 0.00165354 -0.00826982]
0.29603261264909353


In [18]:
#Calculated R-squared.  Measuring goodness of fit
#R-squared of linear is the same for simple and multiple regression
reg.score(X,y)

0.4066811952814282

### Formula for Ajusted R^2

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [19]:
#adjusted R-Score is more appropriate for the multiple regression
#adjust for the number of variables included in the model
#if we are using variable that dont add value, adjusted Rsquare 
#would increase
X.shape

(84, 2)

### n = 84 (number of observations - rows)
### p = 2 (number of predictors- SAT and Rand - columns)

In [23]:
r2 = reg.score(X,y)
n = X.shape[0]
p = X.shape[1]
adjusted_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
adjusted_r2

0.39203134825134

In [27]:
#can confirm value using the stats model
x1=X
x = sm.add_constant(x1)
results = sm.OLS(y,x).fit()
#results.summary()

#### The Adj R squared (.392) is less than R2 (.406) so one or more of my predictors have little to no explanatory power

If a P value has a value above 0.05, you can disregard it

In [33]:
#feature_selection.f_regression
#F-regression - creates a simple linear regression of each 
#feature and the dependent variable

In [36]:
#How to detect variables that are unneeded in a model?
#Feature selection using pvalues
#results.summary()

#1- we predict GPA with SAT
#2- we predict GP with Rand 1,2,3

#Note that for a simple linear regression, 
#p-value of F-stat = the pvalue of the only independe variable

from sklearn.feature_selection import f_regression

In [37]:
f_regression(X,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [39]:
#### 2 output arrays
#### 1 - F-statistics
#### 2 - p values
p_values = f_regression(X,y)[1]
p_values

array([7.19951844e-11, 6.76291372e-01])

In [40]:
#convert from scientific notation
p_values.round(3)

array([0.   , 0.676])

In [41]:
feature_cols = ['SAT','Rand 1,2,3']
print(list(zip(feature_cols,p_values.round(3))))

[('SAT', 0.0), ('Rand 1,2,3', 0.676)]


#### SAT is a useful variable while Rand 1,2,3 is useless

    These are the univariate pvalues reached from simple linear regression.  They do not reflect the interconnection of the feature in our multiple linear regression
    
    f_regression should be used with caution because its too simplistic for complicated problems. 
    

In [44]:
#creating  a summary table
reg_summary = pd.DataFrame(X.columns.values, columns=['Features'])
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [45]:
reg_summary['coefficients'] = reg.coef_
reg_summary['p-values'] = p_values.round(3)

In [46]:
reg_summary

Unnamed: 0,Features,coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


#### conclusion:  Rand does not contribute to our model

In [48]:
#Pvalues are one of the best ways to determine if a variable
#is redundant but does not provide information about how 
#a variable is

#2 values may both have 0.000 but that does not make the 
#variables equally important