# 2. Multiple Linear Regression

### One-hot Encoding

In [4]:
import pandas as pd

In [5]:
# Load the data set from CSV. Every column except the last is used as a
# feature (X); the last column is the regression target (y).
dataset = pd.read_csv("MultipleLinearRegressionData.csv")
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [6]:
# Show the raw feature matrix; the last column holds text categories,
# which is why the array has dtype=object.
X

array([[0.5, 3, 'Home'],
       [1.2, 4, 'Library'],
       [1.8, 2, 'Cafe'],
       [2.4, 0, 'Cafe'],
       [2.6, 2, 'Home'],
       [3.2, 0, 'Home'],
       [3.9, 0, 'Library'],
       [4.4, 0, 'Library'],
       [4.5, 5, 'Home'],
       [5.0, 1, 'Cafe'],
       [5.3, 2, 'Cafe'],
       [5.8, 0, 'Cafe'],
       [6.0, 3, 'Library'],
       [6.1, 1, 'Cafe'],
       [6.2, 1, 'Library'],
       [6.9, 4, 'Home'],
       [7.2, 2, 'Cafe'],
       [8.4, 1, 'Home'],
       [8.6, 1, 'Library'],
       [10.0, 0, 'Library']], dtype=object)

In [7]:
from sklearn.compose import  ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical column at index 2 (study place) and pass the
# remaining columns through unchanged. drop='first' omits the first category,
# so the three places are represented by two dummy columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [2])],
    remainder='passthrough',
)
# Encode from the raw features in `dataset` instead of from X itself:
# X = ct.fit_transform(X) is self-referential, so re-running the cell would
# one-hot encode the already-encoded matrix (column 2 would then be study hours).
X = ct.fit_transform(dataset.iloc[:, :-1].values)
X

# The dummy columns come first, followed by the passthrough columns:
# 1 0 : Home
# 0 1 : Library
# 0 0 : Cafe (the dropped first category, i.e. the baseline)

array([[1.0, 0.0, 0.5, 3],
       [0.0, 1.0, 1.2, 4],
       [0.0, 0.0, 1.8, 2],
       [0.0, 0.0, 2.4, 0],
       [1.0, 0.0, 2.6, 2],
       [1.0, 0.0, 3.2, 0],
       [0.0, 1.0, 3.9, 0],
       [0.0, 1.0, 4.4, 0],
       [1.0, 0.0, 4.5, 5],
       [0.0, 0.0, 5.0, 1],
       [0.0, 0.0, 5.3, 2],
       [0.0, 0.0, 5.8, 0],
       [0.0, 1.0, 6.0, 3],
       [0.0, 0.0, 6.1, 1],
       [0.0, 1.0, 6.2, 1],
       [1.0, 0.0, 6.9, 4],
       [0.0, 0.0, 7.2, 2],
       [1.0, 0.0, 8.4, 1],
       [0.0, 1.0, 8.6, 1],
       [0.0, 1.0, 10.0, 0]], dtype=object)

### Data Set Separation

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Learning (Multiple Linear Regression)

In [9]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression model on the training split. The fit() call is left
# as the last expression, so its return value is shown as the cell output.
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()

### Comparison of predicted and actual values (Test Set)

In [10]:
# Predicted values for the held-out test set
y_pred = reg.predict(X_test)
y_pred

array([ 92.15457859,  10.23753043, 108.36245302,  38.14675204])

In [11]:
# Actual values of the test set, to compare against y_pred
y_test

array([ 90,   8, 100,  38])

In [12]:
reg.coef_   # Coefficient order: [Home, Library, study hours, absences]
            # Place dummies are relative to the dropped Cafe category:
            #   Home: about -5.83, Library: about -1.04, Cafe: 0 (baseline)
            # About +10.40 score per additional hour of study
            # About -1.64 score per absence

array([-5.82712824, -1.04450647, 10.40419528, -1.64200104])

In [13]:
# Intercept of the fitted model: the predicted score when every feature is 0
# (Cafe baseline, 0 study hours, 0 absences).
reg.intercept_

5.365006706544783

### Model Evaluation

In [14]:
reg.score(X_train, y_train) # R^2 (coefficient of determination) on the training set
                            # about 0.96

0.9623352565265528

In [15]:
reg.score(X_test, y_test)   # R^2 (coefficient of determination) on the test set
                            # about 0.986

0.9859956178877447

### Various Evaluation Metrics (다양한 평가 지표)

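1. MAE (Mean Absolute Error) : mean of the absolute differences between actual and predicted values (실제 값과 예측 값 차이의 절대값의 평균)
1. MSE (Mean Squared Error) : mean of the squared differences between actual and predicted values (실제 값과 예측 값 차이의 제곱의 평균)
1. RMSE (Root Mean Squared Error) : square root of the MSE (MSE의 제곱근)
1. R^2 : coefficient of determination (결정 계수)
> R^2 is better the closer it is to 1; the other metrics are better the closer they are to 0.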

In [16]:
from sklearn.metrics import mean_absolute_error
# MAE: pass the actual values as y_true and the predictions as y_pred.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.2253285188287957

In [17]:
from sklearn.metrics import mean_squared_error
# MSE: pass the actual values as y_true and the predictions as y_pred.
mean_squared_error(y_true=y_test, y_pred=y_pred)

19.900226981514848

In [18]:
from sklearn.metrics import mean_squared_error
# RMSE: square root of the MSE (actual values first, then predictions).
# The root is taken explicitly instead of passing squared=False, because that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. This form works on every scikit-learn version.
mean_squared_error(y_test, y_pred) ** 0.5

4.460967045553559

In [19]:
from sklearn.metrics import r2_score
# R^2 (coefficient of determination): the closer to 1, the better the fit.
r2_score(y_true=y_test, y_pred=y_pred)

0.9859956178877447