# **Introduction to Linear Regression**
> ## *Author* : [Rathachai CHAWUTHAI](https://rathachai.creatier.pro/) , Ph.D
> ### *Affiliation* : Computer Engineering, King Mongkut's Institute of Technology Ladkrabang (KMITL)
> #### *Updated Date* : 2022-04-17
---

> <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import io

In [2]:
from sklearn.linear_model import LinearRegression

## Dataset

In [3]:
# Tab-separated sample table: two feature columns (x1, x2) and a target (y).
# Written row by row with explicit \t escapes so the separators are visible.
csv_data = (
    "\n"
    "x1\tx2\ty\n"
    "1\t2\t14\n"
    "2\t3\t18\n"
    "3\t4\t24\n"
    "4\t5\t32\n"
)

In [4]:
# Show the raw string; its repr makes the leading newline and the tab separators visible.
csv_data

'\nx1\tx2\ty\n1\t2\t14\n2\t3\t18\n3\t4\t24\n4\t5\t32\n'

In [5]:
# Parse the in-memory tab-separated text into a DataFrame.
df = pd.read_csv(
    io.StringIO(csv_data),
    delimiter="\t",
)

In [6]:
# Display the parsed table to confirm the x1, x2 and y columns were read.
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


## Input & Output

In [7]:
# Feature matrix: all rows, only the two input columns.
X = df.loc[:, ["x1", "x2"]]

In [8]:
# Display the feature matrix (columns x1 and x2).
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [9]:
# Target vector: the y column as a Series.
y = df.loc[:, "y"]

In [10]:
# Display the target Series.
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

## Linear Model (LM)

In [11]:
# Create an unfitted linear regression estimator with its default settings.
lm = LinearRegression()

In [12]:
# Fit the model on the full table (all rows of X against y).
lm.fit(X=X, y=y)

In [13]:
# Fitted coefficients, one per feature column of X (x1, then x2).
lm.coef_

array([3., 3.])

In [14]:
# Fitted intercept (the constant term of the model).
lm.intercept_

4.000000000000007

## Evaluation

In [15]:
# Predict two hand-written samples. The model was fitted on a DataFrame, so the
# samples are wrapped in a DataFrame that reuses X's column names; passing a
# bare list makes scikit-learn warn that X does not have valid feature names.
lm.predict(pd.DataFrame([(1, 2), (2, 3)], columns=X.columns))



array([13., 19.])

In [16]:
# Predict on the same rows the model was fitted on (in-sample predictions).
lm.predict(X)

array([13., 19., 25., 31.])

In [17]:
# Keep the in-sample predictions so they can be compared with y below.
y_pred = lm.predict(X)

In [18]:
# Display the predicted values.
y_pred

array([13., 19., 25., 31.])

In [19]:
# Display the true targets for side-by-side comparison with y_pred.
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

In [20]:
# Residuals: true value minus predicted value, per row.
y - y_pred

0    1.0
1   -1.0
2   -1.0
3    1.0
Name: y, dtype: float64

In [21]:
# Root-mean-square error: square the residuals, average them, take the root.
residuals = y - y_pred
rmse = np.sqrt(np.mean(residuals ** 2))

In [22]:
# Display the in-sample RMSE.
rmse

1.0

In [23]:
# Fitted equation, read off lm.intercept_ (about 4) and lm.coef_ ([3, 3]):
#   y_hat = 4 + 3*x1 + 3*x2



---

## Train-Test Split


In [24]:
# Recap of the full table before splitting it.
df

Unnamed: 0,x1,x2,y
0,1,2,14
1,2,3,18
2,3,4,24
3,4,5,32


In [25]:
# Recap of the feature matrix.
X

Unnamed: 0,x1,x2
0,1,2
1,2,3
2,3,4
3,4,5


In [26]:
# Recap of the target Series.
y

0    14
1    18
2    24
3    32
Name: y, dtype: int64

In [27]:
# Manual hold-out split: rows labelled 0-2 for training, row 3 for testing.
# .loc slicing is label-based and includes both endpoints.
X_train = X.loc[0:2]
y_train = y.loc[0:2]

# List selectors keep the test parts as a one-row DataFrame and a one-element
# Series; y[3] would return a bare scalar and no longer match X_test's shape.
X_test = X.loc[[3]]
y_test = y.loc[[3]]

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Randomly hold out 25% of the rows as the test set.
# random_state pins the shuffle so a fresh Restart & Run All gives the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [30]:
# Fit a fresh model on the training rows only; fit() returns the estimator itself.
lm = LinearRegression().fit(X_train, y_train)
lm

In [31]:
# Print the coefficients and the intercept on separate lines.
print(lm.coef_, lm.intercept_, sep="\n")

[3.5 3.5]
0.16666666666667496


In [32]:
# Predict the held-out rows with the model fitted on the training rows.
y_pred = lm.predict(X_test)

print(y_pred)

[10.66666667]


In [33]:
# Display the true target(s) of the held-out rows.
y_test

0    14
Name: y, dtype: int64

In [34]:
# RMSE on the held-out rows.
np.sqrt(np.mean(np.square(y_test - y_pred)))

3.333333333333327



---



In [35]:
# End-to-end run in a single cell: split, fit, predict, then score with RMSE.
# NOTE(review): no random_state is passed, so each execution may hold out a
# different row; presumably intentional to show score variance. Confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)
np.sqrt(np.mean(np.square(y_test - y_pred)))

3.3333333333333357



---

## K-Fold Cross-Validation

In [36]:
from sklearn.model_selection import KFold

In [37]:
# Shuffled 4-fold splitter. random_state fixes the shuffle so every call to
# kf.split(...) yields the same folds; without it each call reshuffles, and
# folds printed in one cell need not match the folds used in another.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

In [38]:
# Number of train/test splits the splitter will produce.
kf.get_n_splits(X)

4

In [39]:
# List the row positions assigned to the training and test side of each fold.
for train_index, test_index in kf.split(X):
    print(f"train_index = {train_index}")
    print(f"test_index = {test_index}")
    print(" -------------- ")

train_index = [0 1 2]
test_index = [3]
 -------------- 
train_index = [0 1 3]
test_index = [2]
 -------------- 
train_index = [1 2 3]
test_index = [0]
 -------------- 
train_index = [0 2 3]
test_index = [1]
 -------------- 


In [40]:
# train_index still holds the value left over from the last loop iteration above.
# KFold yields positional indices, so .iloc is the matching selector; .loc is
# label-based and only agrees while the index labels are exactly 0..n-1.
X.iloc[train_index]

Unnamed: 0,x1,x2
0,1,2
2,3,4
3,4,5


In [41]:
# test_index still holds the value left over from the last loop iteration above.
# KFold yields positional indices, so .iloc is the matching selector; .loc is
# label-based and only agrees while the index labels are exactly 0..n-1.
X.iloc[test_index]

Unnamed: 0,x1,x2
1,2,3


In [42]:
# Collect one RMSE per fold.
rmse_ = []

for train_index, test_index in kf.split(X):
  # KFold.split yields positional row indices, so rows are selected with
  # .iloc; .loc is label-based and would pick wrong or missing rows as soon as
  # the index is not the default 0..n-1 range.
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  # Fit a fresh model per fold so no fold sees another fold's fit.
  lm = LinearRegression()
  lm.fit(X_train, y_train)

  # Score the fold on its held-out rows.
  y_pred = lm.predict(X_test)
  rmse_i = np.sqrt(((y_test - y_pred)**2).mean())
  print('rmse_i = ', rmse_i)
  rmse_.append(rmse_i)

rmse_i =  3.3333333333333357
rmse_i =  3.333333333333327
rmse_i =  1.4285714285714306
rmse_i =  1.4285714285714235


In [43]:
# Display the per-fold RMSE values collected by the loop.
rmse_

[3.3333333333333357, 3.333333333333327, 1.4285714285714306, 1.4285714285714235]

In [44]:
# Summarise cross-validation as the average of the per-fold RMSE values.
mean_rmse = np.mean(rmse_)
print("RMSE for 4-fold = ", mean_rmse)

RMSE for 4-fold =  2.380952380952379
