# 앙상블: Boosting

1. XGBoost를 적용해본다.
2. 다른 알고리즘들과 성능을 비교해본다.

---------------------------


## 1.데이터 준비

### (1) 라이브러리 불러오기

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

### (2) 변수중요도 시각화 함수 준비

* 변수중요도 그래프 그리기 함수 만들기

In [None]:
def plot_feature_importance(importance, names):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance scores.
        names: Sequence of feature names, in the same order as ``importance``.

    The chart is drawn on a new 10x8 matplotlib figure. Nothing is returned
    and ``plt.show()`` is not called here.
    """
    # Build the table in one expression and rank it from most to least
    # important; the index is renumbered so row 0 is the top feature.
    ranked = (
        pd.DataFrame({
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        })
        .sort_values(by=['feature_importance'], ascending=False)
        .reset_index(drop=True)
    )

    plt.figure(figsize=(10,8))
    sns.barplot(data=ranked, x='feature_importance', y='feature_names')

    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()

### (1) data loading

In [None]:
# Remote location of the Carseats CSV file.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'

data = pd.read_csv(path)  # reads the whole CSV; no column subset is requested
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


|	변수명	|	설명	|	구분	|
|----|----|----|
|	Sales 	|	 각 지역 판매량(단위 : 1000개)	|	Target	|
|	CompPrice 	|	지역별 경쟁사 판매가격(달러)	|	feature	|
|	Income 	|	가구당 평균 소득액(1000달러)	|	feature	|
|	Advertising 	|	 각 지역, 회사의 광고 예산(1000달러)	|	feature	|
|	Population 	|	 지역 인구수(단위 : 1000명)	|	feature	|
|	Price 	|	 자사 지역별 판매가격(달러)	|	feature	|
|	ShelveLoc 	|	 진열상태(범주 : Bad, Medium, Good)	|	feature	|
|	Age 	|	 지역 인구의 평균 연령	|	feature	|
|	Education 	|	 교육수준(범주 : 10~18)	|	feature	|
|	Urban 	|	 매장이 도심에 있는지 여부(범주 : Yes, No)	|	feature	|
|	US 	|	 매장이 미국에 있는지 여부(범주 : Yes, No)	|	feature	|


### (2) 데이터분할1 : x, y 나누기

In [None]:
# Separate the prediction target (y) from the remaining feature columns (x).
target = 'Sales'
y = data[target]
x = data.drop(columns=target)

### (3) 가변수화

In [None]:
# Columns passed to pd.get_dummies below for dummy (one-hot) encoding.
cat_cols = ['ShelveLoc', 'US','Urban']

In [None]:
# Dummy-encode the categorical columns; drop_first=True drops the first
# level of each column so the remaining dummies are not redundant.
x = pd.get_dummies(x, columns=cat_cols, drop_first=True)
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,US_Yes,Urban_Yes
0,138,73,11,276,120,42,17,False,False,True,True
1,111,48,16,260,83,65,10,True,False,True,True
2,113,35,10,269,80,59,12,False,True,True,True
3,117,100,4,466,97,55,14,False,True,True,True
4,141,64,3,340,128,38,13,False,False,False,True


### (4) 데이터분할2 : train : validation 나누기

In [None]:
# Hold out 30% of the rows as a validation set; random_state pins the
# shuffle so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 20)

### (5) 모델링 : 튜닝
* 성능 튜닝을 수행해 봅시다.
* 하이퍼파라미터 범위
    * cv = 3
    * grid 파라미터
        - max_depth : 3 ~ 8 사이에서 3개 선택
        - n_estimators : 5 ~ 100 사이에서 5개 선택
        - learning_rate : 0.01 ~ 0.2 사이에서 3개 선택
* 튜닝 후 검증셋으로 예측하고, RMSE, MAE, MAPE로 평가해 봅시다.


In [None]:
# Hyperparameter grid for the search: 3 x 3 x 3 = 27 combinations.
# NOTE(review): the exercise text above asks for 5 n_estimators values in the
# 5-100 range, but only 3 (10, 20, 30) are listed here — confirm intent.
grid_params = {'max_depth': [3,5,7], 'n_estimators': [10,20,30], 'learning_rate': [0.01,0.05,0.15]}

In [None]:
# Grid search over grid_params with 3-fold cross-validation.
# No `scoring` argument is passed, so candidates are ranked by the
# estimator's own score method (presumably R^2 for XGBRegressor — confirm).
model_gs = GridSearchCV(XGBRegressor(), grid_params, cv=3)

In [None]:
# Run the grid search on the training split only; the validation split is
# kept aside for the final evaluation.
model_gs.fit(x_train, y_train)

In [None]:
# Put the cross-validation results of every candidate into a DataFrame.
result = pd.DataFrame(model_gs.cv_results_)

In [None]:
# Keep only the three tuned parameters and the mean CV score for a compact view.
temp = result.loc[:, ['param_max_depth','param_n_estimators', 'param_learning_rate','mean_test_score']]
temp.head()

Unnamed: 0,param_max_depth,param_n_estimators,param_learning_rate,mean_test_score
0,3,10,0.01,0.06403
1,3,20,0.01,0.139487
2,3,30,0.01,0.203679
3,5,10,0.01,0.072528
4,5,20,0.01,0.155514


In [None]:
def mape(y_true, y_pred, *, symmetric=True):
    """Return the mean absolute percentage error, in percent.

    By default this computes the *symmetric* variant (sMAPE),
    ``mean(2 * |pred - true| / (|true| + |pred|)) * 100``, which is what this
    function has always returned despite its name. Pass ``symmetric=False``
    to get the classical MAPE, ``mean(|pred - true| / |true|) * 100``.

    Args:
        y_true: Array-like of observed values (list, ndarray, Series, ...).
        y_pred: Array-like of predicted values, same length as ``y_true``.
        symmetric: Keyword-only. ``True`` (default) selects sMAPE,
            ``False`` selects classical MAPE.

    Returns:
        The error as a float percentage.
    """
    # Work on plain float arrays: list inputs then work, and two pandas Series
    # are compared position by position instead of being aligned on their
    # index labels (alignment yields NaN when the indexes differ).
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    eps = 1e-10  # keeps the denominator away from exactly zero
    abs_error = np.abs(predicted - actual)
    if symmetric:
        ratio = 2 * abs_error / (np.abs(actual) + np.abs(predicted) + eps)
    else:
        ratio = abs_error / (np.abs(actual) + eps)
    return np.mean(ratio) * 100

# Take the best estimator found by the grid search and predict on the
# held-out validation split.
best_model = model_gs.best_estimator_
pred = best_model.predict(x_val)

# NOTE: mape() in this notebook, called like this, computes the symmetric
# variant (sMAPE), so the figure printed under "MAPE" is an sMAPE value.
print('RMSE:', np.sqrt(mean_squared_error(y_val, pred)))
print('MAE:', mean_absolute_error(y_val, pred))
print('MAPE:', mape(y_val, pred))

RMSE: 1.7071119952495493
MAE: 1.353685448964437
MAPE: 22.19450375822996
