# sklearn Regression - 수치예측

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 실습용 데이터 설정

* pandas DataFrame
 * Insurance.csv

In [None]:
import pandas as pd

# Read the insurance data set straight from the course repository on GitHub.
DF = pd.read_csv(
    'https://raw.githubusercontent.com/rusita-ai/pyData/master/Insurance.csv'
)

# Column names, dtypes and non-null counts.
DF.info()

In [None]:
# Preview the first three rows of the raw frame.
DF.head(n = 3)

> ## 1) 분석 변수 선택

* X : 'age', 'bmi', 'children'
* y : 'expenses'

In [None]:
# Keep only the target ('expenses') and the three numeric predictors.
DF1 = DF.loc[:, ['expenses', 'age', 'bmi', 'children']]

DF1.head(n = 3)

> ## 2) Train & Test Split

* 7:3

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix.
y = DF1['expenses']
X = DF1[['age', 'bmi', 'children']]

# 7:3 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 2045,
    test_size = 0.3,
)

# Report the shapes of both partitions.
print('Train Data : ', X_train.shape, y_train.shape)
print('Test Data : ', X_test.shape, y_test.shape)

# I. Multivariate Regression

> ## 1) 모델 생성

In [None]:
%%time

from sklearn.linear_model import LinearRegression

# Ordinary least squares on the three numeric features.
# The `normalize` argument was removed from LinearRegression in scikit-learn 1.2,
# so passing it raises TypeError on current releases. It is dropped here: for an
# unpenalised fit with an intercept, rescaling the columns does not change the
# predictions, so the test MSE below is unaffected.
MR = LinearRegression(n_jobs = -1)

MR.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = MR.predict(X_test))

# II. Ridge Regression

> ## 1) 모델 생성

* alpha : Regularization strength
 - default : 1.0
 - 값이 커지면 weight 값을 0에 가깝게 학습
 - 값이 작아지면 weight 값을 제한하지 않음
* solver : Optimization Method
 - 'cholesky' : Matrix Decomposition(숄레스키 행렬분해)
 - 'sag' : Stochastic Average Gradient Descent
    * solver = 'sag'
    * random_state = 2045
    * max_iter = 1000

In [None]:
%%time

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `normalize = True` was removed from Ridge in scikit-learn 1.2 (TypeError on
# current releases), so the scaling is done explicitly in a pipeline instead.
#
# The old option centred each column and divided it by its L2 norm, whereas
# StandardScaler divides by the standard deviation; the two differ by a factor
# of sqrt(n_samples). Multiplying alpha by n_samples compensates for that, so
# this pipeline fits the same model as the former
# Ridge(normalize = True, alpha = 0.3).
#
# RG is now a Pipeline: fit/predict work as before, and the fitted Ridge step is
# available as RG[-1].
RG = make_pipeline(StandardScaler(),
                   Ridge(alpha = 0.3 * len(X_train),
                         solver = 'cholesky'))

RG.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the ridge model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = RG.predict(X_test))

# III. Lasso Regression

> ## 1) 모델 생성

* alpha : Regularization strength
 - default : 1.0
 - 값이 커지면 weight 값을 0에 가깝게 학습
 - 값이 작아지면 weight 값을 제한하지 않음

In [None]:
%%time

from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `normalize = True` was removed from Lasso in scikit-learn 1.2 (TypeError on
# current releases), so the scaling is done explicitly in a pipeline instead.
#
# The old option scaled columns by their L2 norm; StandardScaler scales by the
# standard deviation, a factor of sqrt(n_samples) apart. For the L1 penalty
# this is compensated by multiplying alpha by sqrt(n_samples), which reproduces
# the former Lasso(normalize = True, alpha = 0.2).
#
# LS is now a Pipeline: fit/predict work as before, and the fitted Lasso step is
# available as LS[-1].
LS = make_pipeline(StandardScaler(),
                   Lasso(alpha = 0.2 * len(X_train) ** 0.5))

LS.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the lasso model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = LS.predict(X_test))

# IV. ElasticNet Regression

> ## 1) 모델 생성

* l1_ratio : default = 0.5

In [None]:
%%time

from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `normalize = True` was removed from ElasticNet in scikit-learn 1.2 (TypeError
# on current releases), so the scaling is done explicitly in a pipeline instead.
#
# The old option scaled columns by their L2 norm; StandardScaler scales by the
# standard deviation, a factor of sqrt(n_samples) apart. The two penalty terms
# react differently to that change of scale, so they are converted separately:
#   L1 part (alpha * l1_ratio)       -> multiplied by sqrt(n_samples)
#   L2 part (alpha * (1 - l1_ratio)) -> multiplied by n_samples
# and then recombined into the (alpha, l1_ratio) pair ElasticNet expects.
# This reproduces the former
# ElasticNet(normalize = True, alpha = 0.001, l1_ratio = 0.7).
EN_n_train = len(X_train)
EN_l1_penalty = 0.001 * 0.7 * EN_n_train ** 0.5
EN_l2_penalty = 0.001 * (1 - 0.7) * EN_n_train

# EN is now a Pipeline: fit/predict work as before, and the fitted ElasticNet
# step is available as EN[-1].
EN = make_pipeline(StandardScaler(),
                   ElasticNet(alpha = EN_l1_penalty + EN_l2_penalty,
                              l1_ratio = EN_l1_penalty / (EN_l1_penalty + EN_l2_penalty)))

EN.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the elastic-net model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = EN.predict(X_test))

# V. Decision Tree Regressor

> ## 1) 모델 생성

In [None]:
%%time

from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree with a fixed seed for reproducibility.
# The split criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0
# and the old name was removed in 1.2, where it now raises an error.
DTR = DecisionTreeRegressor(max_depth = 5,
                            criterion = 'squared_error',
                            random_state = 2045)

DTR.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the decision tree on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = DTR.predict(X_test))

> ## 3) Feature Importance

In [None]:
# Per-feature importance scores of the fitted decision tree
# (one value each for 'age', 'bmi', 'children').
DTR.feature_importances_

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the decision tree's feature importances.
# seaborn 0.12+ accepts only `data` positionally, so x and y must be passed as
# keywords; the former positional call raises TypeError there.
plt.figure(figsize = (9, 6))
sns.barplot(x = DTR.feature_importances_,
            y = ['age', 'bmi', 'children'])
plt.title('Decision Tree Regressor')
plt.xlabel('Feature importance')
plt.show()

# VI. Random Forest Regressor

> ## 1) 모델 생성

* criterion : default = 'squared_error' (scikit-learn 1.0 이전 이름은 'mse')
 - The function to measure the quality of a split.

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

# Forest of 2000 depth-1 trees, each split considering all three features.
# The split criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0
# and the old name was removed in 1.2, where it now raises an error.
RFR = RandomForestRegressor(n_estimators = 2000,
                            max_features = 3,
                            max_depth = 1,
                            criterion = 'squared_error',
                            n_jobs = -1,
                            random_state = 2045)

RFR.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the random forest on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = RFR.predict(X_test))

> ## 3) Feature Importance

In [None]:
# Per-feature importance scores of the fitted random forest
# (one value each for 'age', 'bmi', 'children').
RFR.feature_importances_

In [None]:
# Horizontal bar chart of the random forest's feature importances.
# seaborn 0.12+ accepts only `data` positionally, so x and y must be passed as
# keywords; the former positional call raises TypeError there.
plt.figure(figsize = (9, 6))
sns.barplot(x = RFR.feature_importances_,
            y = ['age', 'bmi', 'children'])
plt.title('Random Forest Regressor')
plt.xlabel('Feature importance')
plt.show()

# VII. Gradient Boosting Machine(GBM) Regressor

* 이전 트리의 오차를 보완하는 방식으로 순차적으로 트리를 생성

> ## 1) 모델 생성

* loss : Optimization Method
 -  'squared_error' : Least Squares Regression (scikit-learn 1.0 이전 이름은 'ls')
* n_estimators : 생성되는 트리의 수
 - 값이 크면 모델의 복잡도가 증가
 - 오차를 보정할 기회가 증가
* learning_rate : 이전 트리의 오차를 얼마나 강하게 보정할 것인지 제어
 - 값이 크면 강한 보정에 의해 복잡한 트리 생성

In [None]:
%%time

from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 9000 depth-1 trees and a very small learning rate.
# Both loss = 'ls' and criterion = 'mse' were renamed to 'squared_error' in
# scikit-learn 1.0 and the old names were removed in 1.2, where they now raise
# an error.
GBR = GradientBoostingRegressor(loss = 'squared_error',
                                n_estimators = 9000,
                                learning_rate = 0.0001,
                                criterion = 'squared_error',
                                max_features = 3,
                                max_depth = 1,
                                random_state = 2045)

GBR.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the gradient boosting model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = GBR.predict(X_test))

> ## 3) Feature Importance

In [None]:
# Per-feature importance scores of the fitted gradient boosting model
# (one value each for 'age', 'bmi', 'children').
GBR.feature_importances_

In [None]:
# Horizontal bar chart of the gradient boosting model's feature importances.
# seaborn 0.12+ accepts only `data` positionally, so x and y must be passed as
# keywords; the former positional call raises TypeError there.
plt.figure(figsize = (9, 6))
sns.barplot(x = GBR.feature_importances_,
            y = ['age', 'bmi', 'children'])
plt.title('Gradient Boosting Regressor')
plt.xlabel('Feature importance')
plt.show()

# VIII. Adaptive Boosting Regressor

* 이전 트리가 잘못 예측한 샘플에 가중치를 높여서 다음 트리를 훈련
* 훈련된 모델은 성능에 따라 가중치가 부여

> ## 1) 모델 생성

* loss : The loss function to use when updating the weights after each boosting iteration
* base_estimator = None
 - DecisionTreeRegressor
 - max_depth = 3
 - random_state = 2045

In [None]:
%%time

from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with the library's default base learner (none is passed explicitly),
# 500 boosting rounds, squared loss for the weight updates, and a fixed seed.
ABR = AdaBoostRegressor(
    n_estimators = 500,
    learning_rate = 0.0001,
    loss = 'square',
    random_state = 2045,
)

ABR.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the AdaBoost model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = ABR.predict(X_test))

> ## 3) Feature Importance

In [None]:
# Per-feature importance scores of the fitted AdaBoost model
# (one value each for 'age', 'bmi', 'children').
ABR.feature_importances_

In [None]:
# Horizontal bar chart of the AdaBoost model's feature importances.
# seaborn 0.12+ accepts only `data` positionally, so x and y must be passed as
# keywords; the former positional call raises TypeError there.
plt.figure(figsize = (9, 6))
sns.barplot(x = ABR.feature_importances_,
            y = ['age', 'bmi', 'children'])
plt.title('AdaBoost Regressor')
plt.xlabel('Feature importance')
plt.show()

# IX. eXtreme Gradient Boosting(XGBoost) Regressor

> ## 1) 모델 생성

In [None]:
%%time

from xgboost import XGBRegressor

# XGBoost with the linear booster ('gblinear') rather than trees,
# L2 regularisation strength 2.0, 100 boosting rounds and a fixed seed.
XGB = XGBRegressor(
    booster = 'gblinear',
    reg_lambda = 2.0,
    learning_rate = 0.4,
    n_estimators = 100,
    random_state = 2045,
    n_jobs = -1,
)

XGB.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the XGBoost model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = XGB.predict(X_test))

# X. LightGBM Regressor

> ## 1) 모델 생성

In [None]:
%%time

from lightgbm import LGBMRegressor

# LightGBM gradient-boosted trees (boosting_type 'gbdt') with linear_tree
# enabled, 500 rounds of depth-2 trees and a small learning rate.
LGB = LGBMRegressor(
    boosting_type = 'gbdt',
    objective = 'regression',
    linear_tree = True,
    max_depth = 2,
    n_estimators = 500,
    learning_rate = 0.001,
    n_jobs = -1,
)

LGB.fit(X_train, y_train)

> ## 2) 모델 평가

In [None]:
# Mean squared error of the LightGBM model on the held-out test set.
mean_squared_error(y_true = y_test,
                   y_pred = LGB.predict(X_test))

> ## 3) Feature Importance

In [None]:
# Per-feature importance values of the fitted LightGBM model
# (one value each for 'age', 'bmi', 'children').
LGB.feature_importances_

In [None]:
# Horizontal bar chart of the LightGBM model's feature importances.
# seaborn 0.12+ accepts only `data` positionally, so x and y must be passed as
# keywords; the former positional call raises TypeError there.
plt.figure(figsize = (9, 6))
sns.barplot(x = LGB.feature_importances_,
            y = ['age', 'bmi', 'children'])
plt.title('LightGBM Regressor')
plt.xlabel('Feature importance')
plt.show()

# 
# 
# 
# The End
# 
# 
# 