In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
seaborn.set()
from sklearn.linear_model import LinearRegression

In [50]:
# Load the SAT / GPA dataset from disk and preview its first rows.
csv_path = '1.02.Multiple-linear-regression.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [51]:
# Predictors: the 'Rand 1,2,3' column and the SAT score; target: GPA.
feature_columns = ['Rand 1,2,3', 'SAT']
target_column = 'GPA'

x = data[feature_columns]
y = data[target_column]

In [52]:
# Dimensions of the feature matrix: (number of observations, number of predictors).
x.shape

(84, 2)

In [53]:
# Dimensions of the target: one GPA value per observation.
y.shape

(84,)

In [54]:
# Fit a linear regression of GPA on both predictors.
# `fit` returns the estimator itself, so ending the cell with `reg`
# displays the same fitted-model repr as calling `reg.fit(x, y)` last.
reg = LinearRegression().fit(x, y)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [55]:
# R-squared of the fitted model, evaluated on the same data it was trained on.
reg.score(x,y)

0.4066811952814283

In [56]:
# One coefficient per predictor, in the column order of x ('Rand 1,2,3', then 'SAT').
reg.coef_

array([-0.00826982,  0.00165354])

In [57]:
# Intercept (constant term) of the fitted regression.
reg.intercept_

0.2960326126490922

#### Formula for Adjusted R-squared

$R^2_{adj.} = 1 - (1-R^2)\cdot\frac{n-1}{n-p-1}$

where $n$ is the number of observations and $p$ is the number of predictors.

In [58]:
# Adjusted R-squared: scale the unexplained share (1 - R^2) by (n-1)/(n-p-1).
n, p = x.shape  # n = observations (rows), p = predictors (columns)

plain_r2 = reg.score(x, y)
dof_ratio = (n - 1) / (n - p - 1)

# NOTE: despite its name, `r_squared` holds the *adjusted* R-squared; the name
# is kept because the summary-table cell reads it.
r_squared = 1 - (1 - plain_r2) * dof_ratio
r_squared

0.3920313482513401

#### To identify insignificant variables in the regression model, we use the feature selection module in sklearn

In [59]:
from sklearn.feature_selection import f_regression

#### `f_regression` returns two arrays: the F-statistic of each variable (first array) and the corresponding p-value of each variable (second array)

In [80]:
# f_regression returns a pair of arrays with one entry per column of x:
# index 0 holds the F-statistics, index 1 holds the p-values.
f_data = f_regression(x,y)
f_data

(array([ 0.17558437, 56.04804786]), array([6.76291372e-01, 7.19951844e-11]))

In [61]:
# Keep only the p-values (second element); the F-statistics are not needed here.
_f_statistics, p_values = f_regression(x, y)
p_values

array([6.76291372e-01, 7.19951844e-11])

In [82]:
# Round the p-values to three decimals for display in the summary table.
# They are derived from the raw `f_data` result rather than re-assigning
# `p_values` from itself, so the cell yields the same 1-D array however often
# (or in whatever order) it is run, independent of earlier kernel state.
p_values = f_data[1].round(3)
p_values

array([[0.676],
       [0.   ]])

#### Now we aggregate the results into a summary table

In [87]:
# Build a one-row-per-feature summary table. R-squared and adjusted R-squared
# are single model-level values, so pandas repeats them on every row.
summary_columns = {
    'coef': reg.coef_,
    'F-statistic': f_regression(x, y)[0],
    'p-values': p_values,
    'r-squared': reg.score(x, y),
    'adj r-squared': r_squared,
}
reg_summary = pd.DataFrame({'Features': x.columns.values}).assign(**summary_columns)
reg_summary

Unnamed: 0,Features,coef,F-statistic,p-values,r-squared,adj r-squared
0,"Rand 1,2,3",-0.00827,0.175584,0.676,0.406681,0.392031
1,SAT,0.001654,56.048048,0.0,0.406681,0.392031


#### Note: `f_regression` computes each p-value from a separate simple regression of the target on that one variable, so it does not take into account the relationship between the variables