In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
#override matplotlib styles to use seaborn
sns.set()

from sklearn.linear_model import LinearRegression

In [3]:
#load the CSV with pandas; read_csv returns the file contents as a DataFrame
csv_path = 'data/1.02. Multiple linear regression.csv'
data = pd.read_csv(csv_path)

In [4]:
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [5]:
#single feature (SAT) and the target (GPA) for the simple regression
X, y = data['SAT'], data['GPA']

In [6]:
#sklearn expects a 2-D input, so turn the 1-D SAT values into a single column
X_matrix = np.reshape(X.values, (-1, 1))

In [7]:
#simple (one-feature) regression: GPA explained by SAT alone
reg = LinearRegression().fit(X_matrix, y)   #fit() returns the fitted estimator
print(reg.coef_)        #slope for SAT
print(reg.intercept_)   #constant term

[0.00165569]
0.2750402996602803


In [8]:
data.columns

Index(['SAT', 'Rand 1,2,3', 'GPA'], dtype='object')

In [9]:
#two features this time (SAT and the random 1,2,3 column); the target is still GPA
feature_names = ['SAT','Rand 1,2,3']
X = data[feature_names]
y = data['GPA']

In [10]:
#multiple regression: GPA explained by SAT and Rand 1,2,3 together
reg = LinearRegression().fit(X, y)   #fit() returns the fitted estimator
print(reg.coef_)        #one coefficient per feature, in column order
print(reg.intercept_)   #constant term

[ 0.00165354 -0.00826982]
0.29603261264909486


In [11]:
#Calculate R-squared, which measures goodness of fit.
#score() returns R-squared, and it is obtained the same way
#for simple and multiple linear regression
reg.score(X,y)

0.40668119528142843

### Formula for Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [12]:
#adjusted R-squared is more appropriate for multiple regression:
#it adjusts for the number of variables included in the model.
#if we add a variable that adds little or no value, adjusted R-squared
#goes down, because the (n-1)/(n-p-1) factor grows with p
X.shape

(84, 2)

### n = 84 (number of observations - rows)
### p = 2 (number of predictors- SAT and Rand - columns)

In [13]:
#plug R-squared, n (rows) and p (columns) into the adjusted R-squared formula
r2 = reg.score(X,y)
n, p = X.shape                          #observations, predictors
shrink = (1-r2)*(n-1)/(n-p-1)           #unexplained share, penalized for p
adjusted_r2 = 1 - shrink
adjusted_r2

0.39203134825134023

In [14]:
#confirm the hand-computed adjusted R-squared using statsmodels
x1=X
x = sm.add_constant(x1)        #add a constant column so OLS also fits an intercept
results = sm.OLS(y,x).fit()
#results.summary() prints the full table; the one number we want to compare is:
results.rsquared_adj

#### The adjusted R-squared (.392) is lower than R-squared (.407), so one or more of the predictors has little to no explanatory power

If a predictor's p-value is above 0.05, that predictor is not statistically significant and can be disregarded.

In [15]:
#feature_selection.f_regression
#F-regression - creates a simple linear regression of each 
#feature and the dependent variable

In [16]:
#How to detect variables that are unneeded in a model?
#Feature selection using p-values
#results.summary()

#1- we predict GPA with SAT
#2- we predict GPA with Rand 1,2,3

#Note that for a simple linear regression,
#p-value of F-stat = the p-value of the only independent variable

from sklearn.feature_selection import f_regression

In [17]:
f_regression(X,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [18]:
#f_regression returns 2 arrays:
#  1 - F-statistics
#  2 - p-values
f_stats, p_values = f_regression(X,y)
p_values

array([7.19951844e-11, 6.76291372e-01])

In [19]:
#convert from scientific notation
p_values.round(3)

array([0.   , 0.676])

In [20]:
#pair each feature name with its rounded p-value
feature_cols = ['SAT','Rand 1,2,3']
rounded_p = p_values.round(3)
name_p_pairs = [(name, p) for name, p in zip(feature_cols, rounded_p)]
print(name_p_pairs)

[('SAT', 0.0), ('Rand 1,2,3', 0.676)]


#### SAT is a useful variable while Rand 1,2,3 is useless

    These are the univariate p-values obtained from simple linear regressions. They do not reflect the interconnection of the features in our multiple linear regression.
    
    f_regression should be used with caution because it is too simplistic for complicated problems.
    

In [21]:
#creating  a summary table
reg_summary = pd.DataFrame(X.columns.values, columns=['Features'])
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [22]:
#coefficients come from the multiple regression stored in reg
reg_summary['coefficients'] = reg.coef_
#p-values come from f_regression (one simple regression per feature),
#so they are univariate and are not the p-values of the multiple regression
reg_summary['p-values'] = p_values.round(3)

In [23]:
reg_summary

Unnamed: 0,Features,coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


#### conclusion:  Rand does not contribute to our model

P-values are one of the best ways to determine if a variable
is redundant, but they do not provide information about how
useful a variable is.

Two variables may both have a p-value of 0.000, but that does not make the
variables equally important.



#### standardization

In [24]:
#import module
from sklearn.preprocessing import StandardScaler

In [25]:
scaler = StandardScaler()

#the scaler will be used to subtract the mean from each point and
#divide by the standard deviation, feature by feature.
#next we are going to fit it on our data
#then we are going to fit our data



In [26]:
#fit() calculates the mean and standard deviation of each feature;
#transform() then subtracts that mean and divides by that standard deviation
x_scaled = scaler.fit(X).transform(X)

#peek at the first rows to see that the data has been standardized
x_scaled[:3]


#most commonly used afterwards
#new_data_scaled = scaler.transform(new_data)

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974]])

In [27]:
#Whenever you get new data, remember that all the standardization
#information is stored in the scaler

#Before standardization, when we calculated the coefficients
#of each variable, we could not compare the effect of each
#variable on the output

#because SAT has a range between 1634 - 2050
#and Rand has a range between 1 - 3
data.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


In [28]:
#the respective coefficients were
reg_summary

Unnamed: 0,Features,coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


#### Notes:
It seems like Rand has a bigger impact because its coefficient is bigger in absolute value;
however, this conclusion could be wrong.
SAT values are of a much bigger magnitude (the numbers are a lot bigger), so even a small SAT coefficient can have a large effect on the prediction.

Feature scaling brings all features to the same magnitude so that their impact can be compared.

Now create new Regression with scaled inputs


#### Regression with Scaled features

In [29]:
#reg is rebound here: from now on it is the model trained on the
#standardized inputs, not the earlier model trained on the raw X
reg = LinearRegression()
reg.fit(x_scaled,y)      #here we are training the model using 
                         #the standardized inputs
                         #reg is our model

LinearRegression()

In [30]:
x_scaled[:10]


array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974]])

In [31]:
reg.coef_

array([ 0.17181389, -0.00703007])

In [32]:
reg.intercept_

3.330238095238095

In [33]:
reg_summary = pd.DataFrame([['Intercept'],['SAT'],['Rand 1,2,3']], columns=['Features'])

In [34]:
#Add a 'Weights' column to the dataframe:
#the intercept first, followed by the 2 coefficients
weights = [reg.intercept_, reg.coef_[0], reg.coef_[1]]
reg_summary['Weights'] = weights
reg_summary
 

Unnamed: 0,Features,Weights
0,Intercept,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


### The bigger the weight, the bigger the impact on the regression
It carries weight on the result.

The ML word for intercept is bias. It is nothing but a number that adjusts our prediction
by some constant.

If we adjust the prediction with a constant, then the prediction is biased.

In [35]:
#rebuild the summary table, this time using the ML name 'Bias' for the intercept
reg_summary = pd.DataFrame({'Features': ['Bias','SAT','Rand 1,2,3']})
#Weights column: the intercept followed by the 2 coefficients
reg_summary['Weights'] = [reg.intercept_, reg.coef_[0], reg.coef_[1]]
reg_summary

Unnamed: 0,Features,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


In [36]:
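#a standardized weight very close to 0 (like Rand's -0.007) means the feature barely affects the prediction
#(do not confuse this with p-values, where a value ABOVE 0.05 flags an insignificant feature)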

#### Making predictions and the standardized coefficients (weights)

In [37]:
#Say one student scored 1700 on the SAT and was randomly assigned a 2,
#and another student scored 1800 and was assigned a 1.

#the new data must have the same columns as X
new_data = pd.DataFrame({'SAT': [1700, 1800], 'Rand 1,2,3': [2, 1]})
new_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


In [38]:
#Let's predict the new values (GPA)
#We can simply call the predict method on the regression
#and then specify the new inputs as an argument.
reg.predict(new_data)           #we are using the reg model
                             #reg model is expecting standardized inputs
                             #but new_data is still on the raw scale, so its magnitude is too large



array([295.39979563, 312.58821497])

#### This is not a GPA.  
This happened because our regression model was trained on standardized inputs --> the x_scaled data. It expects values that are of the same magnitude as the ones used in the training process.

The input data must be formatted like X, and it must be standardized in the same way, with the same mean and standard deviation, which we have stored in the scaler object.

In [39]:
#lets transform this data
new_data_scaled = scaler.transform(new_data)
new_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [40]:
reg.predict(new_data_scaled)

array([3.09051403, 3.26413803])

##### Our first student is predicted to have a GPA of 3.09, while the second is predicted to have a GPA of 3.26

### What happens if we remove the Rand 1 2 3 feature?

In [41]:
#theory suggest nothing will happen
#1. Create a new regression
reg_simple = LinearRegression()

In [42]:
#Create a new variable called x_simple_matrix to contain all observations from
#x_scaled but only include the SAT column (column 0)
x_simple_matrix = x_scaled[:,0]

In [43]:
x_simple_matrix.shape

(84,)

In [44]:
#make it a matrix with the reshape method
x_simple_matrix = x_scaled[:,0].reshape(-1,1)

In [45]:
x_simple_matrix.shape

(84, 1)

In [46]:
#fit the regression with inputs
reg_simple.fit(x_simple_matrix,y)

LinearRegression()

In [50]:
#The simple model is fitted, so we can predict for the new students.

#reg_simple was trained on the SAT column only, so feed it only
#the first (SAT) column of the scaled new data.
#Indexing with a list keeps the result 2-D (one column), as predict requires
sat_only_scaled = new_data_scaled[:, [0]]
reg_simple.predict(sat_only_scaled)

array([3.08970998, 3.25527879])

    When compared the previous linear regression:
        prev:array([3.09051403, 3.26413803])
        now: array([3.08970998, 3.25527879])


Let's compare it to what we got from our multiple linear regression: the predicted GPA is slightly different, but if we round to two digits after the decimal point, we get the exact same results, 3.09 and 3.26. This finding shows us why the developers of sklearn have decided that p-values are not needed. When we apply feature scaling, it often does not matter whether we keep or leave out insignificant features: their weights will be so close to zero that they will barely influence the predictions.

## Overfitting and Underfitting


Broadly speaking, overfitting means our regression has focused on the particular data set so much it has missed the point.

Underfitting, on the other hand, means the model has not captured the underlying logic of the data.

There is one popular solution to overfitting, though: we can split the initial data set into two parts,
a training split and a test split; 90 percent training / 10 percent test or 80 / 20 are common.
It works like this: we create the regression on the training data and, after we have the coefficients,
we test the model on the test data by assessing the accuracy.
			
The whole point is that the model has never seen the test data set, therefore it cannot overfit on			
it.			
			
			
			
			
			
			
			
			
			