# **Introduction to Linear Regression**
> ## *Author* : [Rathachai CHAWUTHAI](https://rathachai.creatier.pro/) , Ph.D
> ### *Affiliation* : Computer Engineering, King Mongkut's Institute of Technology Ladkrabang (KMITL)
> #### *Updated Date* : 2022-04-17
---

> <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.

## Import Libraries

In [137]:
import numpy as np
import pandas as pd
import io

In [138]:
from sklearn.linear_model import LinearRegression

## Dataset

In [139]:
# csv_data = 
# x1	x2	y
# 2	6	1
# 3	7	2
# 5	8	3

In [140]:
csv_data = """
x1	x2	y
1	2	14
2	3	18
3	4	24
4	5	32
"""

In [141]:
csv_data

'\nx1\tx2\ty\n1\t2\t14\n2\t3\t18\n3\t4\t24\n4\t5\t32\n'

In [142]:
df = pd.read_csv(io.StringIO(csv_data), sep="\t")

In [143]:
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


## Input & Output

In [144]:
X = df[["x1", "x2"]]

In [145]:
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [146]:
y = df["y"]

In [147]:
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

## LM Model

In [148]:
lm = LinearRegression()

In [149]:
lm.fit(X,y)

In [150]:
lm.coef_

array([3., 3.])

In [151]:
lm.intercept_

4.000000000000007

## Evaluation

In [152]:
lm.predict([(1,2), (2,3)])



array([13., 19.])

In [153]:
lm.predict(X)

array([13., 19., 25., 31.])

In [154]:
y_pred = lm.predict(X)

In [155]:
y_pred

array([13., 19., 25., 31.])

In [156]:
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

In [157]:
y - y_pred

0    1.0
1   -1.0
2   -1.0
3    1.0
Name: y, dtype: float64

In [158]:
rmse = np.sqrt(((y - y_pred)**2).mean())

In [159]:
rmse

1.0

In [160]:
# y = 4 + 3*x1 + 3*x2



---

## Train-Test Split


In [161]:
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


In [162]:
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [163]:
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

In [164]:
X_train = X.loc[0:2]
y_train = y.loc[0:2]

X_test = X.loc[[3]]
y_test = y[3]

In [165]:
from sklearn.model_selection import train_test_split

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [167]:
lm = LinearRegression()
lm.fit(X_train, y_train)

In [168]:
print(lm.coef_)
print(lm.intercept_)

[3.5 3.5]
0.16666666666666785


In [169]:
y_pred = lm.predict(X_test)

print(y_pred)

[10.66666667]


In [170]:
y_test

0    14
Name: y, dtype: int64

In [171]:
np.sqrt(((y_test - y_pred)**2).mean())

3.333333333333332



---



In [185]:
# NOTE: train_test_split is random, not always the same result
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
np.sqrt(((y_test - y_pred)**2).mean())

1.4285714285714306



---

## K-Fold Cross-Validation

In [173]:
from sklearn.model_selection import KFold

In [174]:
kf = KFold(n_splits=4, shuffle=True)

In [175]:
kf.get_n_splits(X)

4

In [176]:
for train_index, test_index in kf.split(X):
  print("train_index =", train_index)
  print("test_index =", test_index)
  print(" -------------- ")

train_index = [1 2 3]
test_index = [0]
 -------------- 
train_index = [0 2 3]
test_index = [1]
 -------------- 
train_index = [0 1 3]
test_index = [2]
 -------------- 
train_index = [0 1 2]
test_index = [3]
 -------------- 


In [177]:
X.loc[train_index]

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4


In [178]:
X.loc[test_index]

Unnamed: 0,x1,x2
3,4,5


In [179]:
rmse_ = []

for train_index, test_index in kf.split(X):
  X_train = X.loc[train_index]
  X_test = X.loc[test_index]
  y_train = y.loc[train_index]
  y_test = y.loc[test_index]

  lm = LinearRegression()
  lm.fit(X_train, y_train)
  
  y_pred = lm.predict(X_test)
  rmse_i = np.sqrt(((y_test - y_pred)**2).mean())
  print('rmse_i = ', rmse_i)
  rmse_.append(rmse_i)

rmse_i =  1.428571428571427
rmse_i =  1.4285714285714306
rmse_i =  3.3333333333333357
rmse_i =  3.333333333333327


In [180]:
rmse_

[1.428571428571427, 1.4285714285714306, 3.3333333333333357, 3.333333333333327]

In [181]:
print("RMSE for 4-fold = ", np.mean(rmse_) )

RMSE for 4-fold =  2.38095238095238
