# 多元線性迴歸
### $y = wx + b$

### OLS 公式
<img src='images/regression_formula.png' style='width:400px' />

## 載入資料集

In [3]:
import pandas as pd

# Read the population CSV (path is relative to the working directory) into
# a DataFrame; later cells use its 'year' and 'pop' columns.
df = pd.read_csv(filepath_or_buffer='./data/population.csv')

## 使用矩陣計算

In [4]:
import numpy as np

# Design matrix: the 'year' feature in column 0 followed by a column of
# ones, so the intercept is estimated as the last weight (b = b * 1).
X = df[['year']].values
one = np.ones((len(df), 1))
X = np.concatenate((X, one), axis=1)

# Keep the target as an (n, 1) column so the solution is a (2, 1) column.
y = df[['pop']].values

# Solve the normal equations (X^T X) w = X^T y.
# This is the same OLS formula w = (X^T X)^-1 X^T y, but np.linalg.solve
# factorises the system instead of forming the explicit inverse. Raw year
# values next to a column of ones make X^T X ill-conditioned, and the
# explicit inverse loses extra digits on such a matrix.
w = np.linalg.solve(X.T @ X, X.T @ y)
print(f'w={w[0, 0]}, b={w[1, 0]}')

w=0.06115935866154644, b=-116.35631056115507


## 使用polyfit驗算

In [5]:
# Cross-check with a degree-1 polynomial fit. np.polyfit returns the
# coefficients from the highest power down, i.e. [slope, intercept].
coef = np.polyfit(x=df['year'], y=df['pop'], deg=1)
print(f'w={coef[0]}, b={coef[1]}')

w=0.061159358661554586, b=-116.35631056117121


## 使用Scikit-Learn LinearRegression類別驗算

In [6]:
from sklearn.linear_model import LinearRegression

# Feature matrix is 2-D (double brackets keep the column axis); the target
# is a flat 1-D vector.
X = df[['year']].values
y = df['pop'].values

# fit() returns the estimator itself, so construction and fitting chain.
lr = LinearRegression().fit(X, y)

# Show the slope array and the intercept as the cell output.
(lr.coef_, lr.intercept_)

(array([0.06115936]), -116.35631056117116)

## 以餐廳小費資料集（tips）為例，求解多元線性迴歸

In [7]:
# 載入資料集
import seaborn as sns

# Load seaborn's 'tips' example dataset. Note that this rebinds `df`, which
# previously held the population table.
df = sns.load_dataset(name='tips')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## 文字欄位轉換為數值欄位

In [8]:
# List the distinct labels in the 'day' column before encoding them.
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [9]:
# Integer code assigned to each label, keyed by the text column it applies to.
encodings = {
    'sex': {'Female':0, 'Male':1},
    'smoker': {'No':0, 'Yes':1},
    'day': {'Thur':0, 'Fri':1, 'Sat':2, 'Sun':3},
    'time': {'Lunch':0, 'Dinner':1},
}

# Overwrite each text column with its integer codes.
# NOTE: this mutates `df` in place, so the cell is not idempotent. Running
# it a second time without reloading the data maps the already-encoded
# values (which are not keys of the dicts above) to NaN.
for column, codes in encodings.items():
    df[column] = df[column].map(codes)

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,3,1,2
1,10.34,1.66,1,0,3,1,3
2,21.01,3.5,1,0,3,1,3
3,23.68,3.31,1,0,3,1,2
4,24.59,3.61,0,0,3,1,4


## 使用矩陣計算

In [10]:
# Features: every column except the target 'tip'. Target: 'tip' as a 1-D vector.
X, y = df.drop('tip', axis=1).values, df.tip.values

# Column of ones so the intercept is estimated as the last weight (b = b * 1).
one = np.ones((X.shape[0], 1))

# Append the ones column to the features. X itself stays untouched for the
# scikit-learn check in the next cell.
X2 = np.concatenate((X, one), axis=1)

# Solve the normal equations (X2^T X2) w = X2^T y.
# This is the same OLS formula w = (X^T X)^-1 X^T y, but np.linalg.solve
# avoids forming the explicit inverse, which is less accurate numerically.
w = np.linalg.solve(X2.T @ X2, X2.T @ y)

# One weight per feature column, followed by the intercept.
w

array([ 0.09432509, -0.03464496, -0.07566309,  0.05273982, -0.11247777,
        0.17481962,  0.72400336])

## 以Scikit-Learn的線性迴歸驗證

In [11]:
from sklearn.linear_model import LinearRegression

# Fit on the feature matrix without the ones column; LinearRegression
# reports the intercept separately from the coefficients.
lr = LinearRegression().fit(X, y)

# Show the coefficients and the intercept as the cell output.
(lr.coef_, lr.intercept_)

(array([ 0.09432509, -0.03464496, -0.07566309,  0.05273982, -0.11247777,
         0.17481962]),
 0.7240033611886232)