# Simple Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
# Load the SAT/GPA dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('sat.csv')

In [5]:
data

Unnamed: 0,SAT,GPA
0,1714,2.40
1,1664,2.52
2,1760,2.54
3,1685,2.74
4,1693,2.83
...,...,...
79,1936,3.71
80,1810,3.71
81,1987,3.73
82,1962,3.76


In [12]:
# Single predictor: the SAT column as a 1-D Series (it is reshaped to 2-D when fitting).
X = data['SAT']

In [14]:
X

0     1714
1     1664
2     1760
3     1685
4     1693
      ... 
79    1936
80    1810
81    1987
82    1962
83    2050
Name: SAT, Length: 84, dtype: int64

In [15]:
# Target variable: the GPA column that the regression will predict.
y = data['GPA']

In [16]:
y

0     2.40
1     2.52
2     2.54
3     2.74
4     2.83
      ... 
79    3.71
80    3.71
81    3.73
82    3.76
83    3.81
Name: GPA, Length: 84, dtype: float64

In [17]:
y.shape

(84,)

In [18]:
reg = LinearRegression()

In [20]:
# scikit-learn expects a 2-D (n_samples, n_features) input, so turn the 1-D
# SAT Series into a single-column array before fitting.
reg.fit(X.to_numpy().reshape(-1, 1), y)

LinearRegression()

In [21]:
# Two unseen SAT scores to predict GPA for, held in a one-column frame.
new_data = pd.DataFrame({'SAT': [1740, 1860]})

In [22]:
new_data

Unnamed: 0,SAT
0,1740
1,1860


In [23]:
# The model was fitted on a bare NumPy array (no column names), so pass the
# new observations as an array as well. Handing predict() a DataFrame with
# named columns makes scikit-learn emit a feature-names mismatch UserWarning.
# The predicted values are unchanged.
reg.predict(new_data.to_numpy())



array([3.15593751, 3.35462007])

In [24]:
# Fitted slope: change in predicted GPA per one-point increase in SAT.
reg.coef_

array([0.00165569])

In [25]:
# Fitted intercept: the model's predicted GPA at SAT = 0.
reg.intercept_

0.2750402996602803

In [26]:
# R-squared, evaluated on the same data the model was fitted on.
reg.score(X.to_numpy().reshape(-1, 1), y)

0.40600391479679765

# Multiple Linear Regression

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Load the SAT/GPA data that also carries a 'Rand 1,2,3' column.
# NOTE: this rebinds `data` (previously the sat.csv frame from the section above).
data = pd.read_csv('multisat.csv')

In [30]:
data

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.40,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2
...,...,...,...
79,1936,3.71,3
80,1810,3.71,1
81,1987,3.73,3
82,1962,3.76,1


In [32]:
# Two predictors; the double brackets keep X a 2-D DataFrame, so no reshape
# is needed before fitting. This rebinds X from the simple-regression section.
X = data[['SAT','Rand 1,2,3']]

In [33]:
X

Unnamed: 0,SAT,"Rand 1,2,3"
0,1714,1
1,1664,3
2,1760,3
3,1685,3
4,1693,2
...,...,...
79,1936,3
80,1810,1
81,1987,3
82,1962,1


In [34]:
# Target variable: GPA, taken from the multisat.csv frame.
y = data['GPA']

In [35]:
y

0     2.40
1     2.52
2     2.54
3     2.74
4     2.83
      ... 
79    3.71
80    3.71
81    3.73
82    3.76
83    3.81
Name: GPA, Length: 84, dtype: float64

In [36]:
mul_reg = LinearRegression()

In [37]:
# Fit on the DataFrame directly (both predictor columns at once).
mul_reg.fit(X,y)

LinearRegression()

In [38]:
mul_reg.coef_

array([ 0.00165354, -0.00826982])

In [39]:
# R-squared on the fitting data. It is almost unchanged from the SAT-only
# model (about 0.4067 vs 0.4060), i.e. 'Rand 1,2,3' adds essentially nothing.
mul_reg.score(X,y)

0.40668119528142843

In [40]:
from sklearn.feature_selection import f_regression

In [41]:
f_regression(X,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [42]:
f_regression(X,y)[1]

array([7.19951844e-11, 6.76291372e-01])

In [43]:
# f_regression returns a (F-statistics, p-values) pair; keep the p-values.
# NOTE: per the scikit-learn docs this is a univariate test (each feature is
# tested against y on its own), so these are not the p-values of the
# coefficients in the joint mul_reg model.
p_value = f_regression(X,y)[1]

In [45]:
p_value.round(3)

array([0.   , 0.676])

In [46]:
# Start the summary table with one row per predictor, in the same order as
# the columns of X.
reg_summary = pd.DataFrame({'Features': ['SAT', 'Rand 1,2,3']})

In [47]:
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [48]:
# Attach the fitted coefficients. The assignment is positional, so it relies on
# the 'Features' rows being listed in the same order as the columns of X.
reg_summary['Coefficient'] = mul_reg.coef_

In [49]:
reg_summary

Unnamed: 0,Features,Coefficient
0,SAT,0.001654
1,"Rand 1,2,3",-0.00827


In [50]:
# Attach the p-values rounded to 3 decimals. At this precision the SAT
# p-value (about 7.2e-11) is shown as 0.0, meaning "< 0.001", not exactly zero.
reg_summary['P_value'] = p_value.round(3)

In [51]:
reg_summary

Unnamed: 0,Features,Coefficient,P_value
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676
