# 성능 최적화 종합실습 : Car seat sales
유아용 카시트 판매량을 예측해 봅시다.

* 지역 매장별 카시트 판매량(Sales, 단위 : 1000개)을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [5]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 모델링
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import * 

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings(action='ignore')

### (2) 함수 생성

* 선형회귀용 전진선택법

In [6]:
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

def forward_stepwise_linear(x_train, y_train):
    """Pick OLS predictors by AIC-driven forward stepwise selection.

    Each step fits one OLS model (with an added constant) per remaining
    candidate, using the already-selected features plus that candidate, and
    keeps the candidate whose model has the lowest AIC. Selection stops as
    soon as the best AIC of a step is worse than the best AIC seen so far,
    or when no candidates remain.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Candidate predictors, one column per feature.
    y_train : array-like
        Response handed to ``OLS`` as the endogenous variable.

    Returns
    -------
    selected : list
        Column names in the order they were chosen.
    step_df : pandas.DataFrame
        Columns ``step``, ``feature`` (list of columns in the fitted model)
        and ``aic``; one row per model fitted in an accepted step, sorted by
        AIC within each step.
    """
    remaining = list(x_train)
    selected = []
    accepted_steps = []   # one ranked frame per accepted step
    best_aic = None       # lowest AIC among all accepted steps so far

    for step in range(1, len(remaining) + 1):
        trials = {'step': [], 'feature': [], 'aic': []}

        # Try adding each remaining feature to the current selection.
        for candidate in remaining:
            trial_features = selected + [candidate]
            fitted = OLS(y_train, add_constant(x_train[trial_features])).fit(disp=False)
            trials['step'].append(step)
            trials['feature'].append(trial_features)
            trials['aic'].append(fitted.aic)

        # Rank this step's models; the best one ends up in row 0.
        ranked = pd.DataFrame(trials).sort_values('aic').reset_index(drop=True)
        step_best = ranked['aic'].min()

        # Stop when this step cannot match the best AIC reached so far.
        if best_aic is not None and best_aic < step_best:
            break
        accepted_steps.append(ranked)
        if pd.notna(step_best):
            best_aic = step_best

        # The newly added feature is the last entry of the winning model.
        chosen = ranked.loc[0, 'feature'][-1]
        remaining.remove(chosen)
        selected.append(chosen)

    # Concatenate once at the end instead of growing from an empty frame.
    if accepted_steps:
        step_df = pd.concat(accepted_steps, axis=0).reset_index(drop=True)
    else:
        step_df = pd.DataFrame({'step': [], 'feature': [], 'aic': []})

    return selected, step_df

* 변수 중요도 그래프

In [7]:
def plot_feature_importance(importance, names):
    """Draw a horizontal bar chart of feature importances and return the table.

    Parameters
    ----------
    importance : array-like
        Importance value for each feature.
    names : array-like
        Feature names, in the same order as ``importance``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_names`` and ``feature_importance``, sorted by
        importance in descending order with a fresh 0..n-1 index.
    """
    ranking = (
        pd.DataFrame({
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        })
        .sort_values(by=['feature_importance'], ascending=False)
        .reset_index(drop=True)
    )

    # One bar per feature, most important at the top.
    plt.figure(figsize=(10, 8))
    sns.barplot(data=ranking, x='feature_importance', y='feature_names')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()

    return ranking

### (3) Data Loading

In [9]:
# Read the Carseats CSV directly from a raw GitHub URL (needs network access).
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

URLError: <urlopen error [WinError 10054] 현재 연결은 원격 호스트에 의해 강제로 끊겼습니다>

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [None]:
# Preview the first rows of the loaded frame.
data.head()

## 3.데이터 준비

### (1) 데이터 정리

In [None]:
# Replace the competitor price with its gap to our own price, then drop the raw column.
data['Diff_Price'] = data['CompPrice'].sub(data['Price'])
data.drop(columns=['CompPrice'], inplace=True)

### (2) 데이터분할1 : x, y 나누기

In [None]:
# Separate the target column (y) from the predictors (x).
target = 'Sales'
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치

### (4) 가변수화

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int keeps the dummy columns numeric: pandas >= 2.0 returns bool dummies by
# default, and mixing bool with numeric columns yields an object-typed design matrix
# that statsmodels OLS (used by forward_stepwise_linear) refuses to fit.
dummies = ['ShelveLoc','Education','Urban','US']
x = pd.get_dummies(x, columns=dummies, drop_first=True, dtype=int)

### (5) 데이터분할2 : train : validation 나누기

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.3, random_state=2022)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [None]:
# Learn the min-max ranges on the training split only, then apply them to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_val_s = scaler.transform(x_val)

## 4.모델링



### (1) 선형회귀

In [None]:
# Run forward stepwise selection on the unscaled training data; the first result is the
# list of selected columns, the second the per-step AIC log.
# NOTE(review): binding the global name `vars` shadows the Python builtin of the same name.
vars, result = forward_stepwise_linear(x_train, y_train)

### (2) KNN

In [None]:
# KNN search grid: k in {10, 20, 30, 40} crossed with two distance metrics.
params2 = {'n_neighbors': range(10,50,10), 'metric' : ['euclidean', 'manhattan']}

In [None]:
# 5-fold grid search over params2 on the scaled training data, followed by
# predictions for the scaled validation set from the fitted search object.
m2 = KNeighborsRegressor()
m2_gs = GridSearchCV(m2, params2, cv=5, verbose=3)
m2_gs.fit(x_train_s,y_train)
p2 = m2_gs.predict(x_val_s)

In [None]:
# Show the winning KNN parameter combination and its cross-validation score.
m2_gs.best_params_, m2_gs.best_score_

### (3) Decision Tree

In [None]:
# Decision-tree search grid: depth 2..10 crossed with leaf sizes 10, 20, ..., 100.
params3 = {'max_depth':range(2,11), 'min_samples_leaf':range(10,101,10)}

In [None]:
# 5-fold grid search over params3 on the unscaled training data, followed by
# predictions for the validation set from the fitted search object.
m3 = DecisionTreeRegressor()
m3_gs = GridSearchCV(m3, params3, cv=5, verbose=3)
m3_gs.fit(x_train,y_train)
p3 = m3_gs.predict(x_val)

In [None]:
# Print the winning decision-tree parameters and their cross-validation score.
print(m3_gs.best_params_, m3_gs.best_score_)

In [None]:
# Plot the decision tree's feature importances against the training column names.
r = plot_feature_importance(m3_gs.best_estimator_.feature_importances_, list(x_train)) # tuned via grid search, hence best_estimator_

In [None]:
# Display the sorted importance table returned by the plotting helper.
r

### (4) Random Forest

In [None]:
# Random-forest search grid: max_features from 1 through 20.
# NOTE(review): the upper bound is hard-coded; it may exceed the number of columns in
# x_train — confirm against x_train.shape[1].
params4 = {'max_features': range(1,21)}

In [None]:
# 5-fold grid search over params4 on the unscaled training data, followed by
# predictions for the validation set from the fitted search object.
m4 = RandomForestRegressor()
m4_gs = GridSearchCV(m4, params4, cv=5,verbose=3)
m4_gs.fit(x_train, y_train)
p4 = m4_gs.predict(x_val)

In [None]:
# Print the winning random-forest parameters and their cross-validation score.
print(m4_gs.best_params_,m4_gs.best_score_)

In [None]:
# Plot the feature importances of the tuned random forest. This section previously
# re-plotted the decision tree (m3_gs); the grid search for the forest is m4_gs.
r = plot_feature_importance(m4_gs.best_estimator_.feature_importances_, list(x_train))

In [None]:
# Display the sorted importance table returned by the plotting helper.
r

### (5) XGB

In [None]:
 Data path : https://bit.ly/3EZKMUU

 Target : defeat (불량여부)

 삭제대상 : datetime

 Data split : train : val = 7:3, random_state = 2022

 알고리즘 : XGB (트리 개수 = 50, learning rate = 0.1, max depth = 4)

모델에서 가장 중요한 변수는?

In [None]:
from xgboost import XGBClassifier, plot_tree

In [None]:
# Load the exercise data set from the shortened URL (needs network access); this rebinds
# `data`, replacing the Carseats frame.
data = pd.read_csv('https://bit.ly/3EZKMUU')

In [None]:
# Remove the datetime column, as the exercise instructions require.
data.drop(columns=['datetime'], inplace=True)

In [None]:
# Preview the first rows after dropping the datetime column.
data.head()

In [None]:
# Separate the target column (y) from the predictors (x).
target = 'defeat'
y = data[target]
x = data.drop(columns=[target])

In [None]:
# Dummy-encode the target and drop its first level.
# NOTE(review): y is a Series at this point, so `columns=` presumably has no effect
# (pandas documents it for DataFrame input), and the result is a DataFrame rather than
# a 1-D label vector — confirm this is the intended target format.
dumm_cols = ['defeat']
y = pd.get_dummies(y, columns = dumm_cols, drop_first = True)

In [None]:
# 7:3 train/validation split with the seed given in the exercise; this rebinds the
# split variables used by the Carseats models above.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.3, random_state=2022)

In [None]:
# XGBoost classifier configured per the exercise: 50 trees, learning rate 0.1, max depth 4.
model = XGBClassifier(n_estimators= 50, learning_rate=0.1, max_depth = 4 )

In [None]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

In [None]:
# Print the column names and the fitted importances so they can be matched by position.
print(x_train.columns)
print(model.feature_importances_)

In [None]:
def plot_feature_importance(importance, names):
    """Plot feature importances as horizontal bars and return them as a table.

    An equivalent definition appears earlier in this notebook; this one
    rebinds the same name.

    Parameters
    ----------
    importance : array-like
        Importance value for each feature.
    names : array-like
        Feature names, aligned with ``importance``.

    Returns
    -------
    pandas.DataFrame
        ``feature_names`` / ``feature_importance`` rows ordered from most to
        least important, re-indexed from 0.
    """
    table = pd.DataFrame(
        {'feature_names': np.array(names), 'feature_importance': np.array(importance)}
    )
    ordered = table.sort_values(by=['feature_importance'], ascending=False)
    ordered = ordered.reset_index(drop=True)

    # Horizontal bars, largest importance first.
    plt.figure(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=ordered)

    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.grid()

    return ordered

In [None]:
# Plot the XGBoost importances and keep the sorted table; the top row answers the
# exercise question. This rebinds `result`, which earlier held the stepwise AIC log.
result = plot_feature_importance(model.feature_importances_, x_train.columns)