# 데이터 획득 및 설정

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import graphviz
import os
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap
from sklearn.feature_selection import chi2, SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Make the Graphviz executables discoverable on Windows by adding their
# install directory to PATH. The substring guard keeps this idempotent, so
# re-running the cell does not keep appending the same entry.
GRAPHVIZ_BIN = 'C:/Program Files/Graphviz/bin/'
if GRAPHVIZ_BIN not in os.environ["PATH"]:
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN

In [None]:
# Read the SQLAlchemy connection URL from the environment instead of embedding
# the database user and password in the notebook.
# Expected form: mysql://<user>:<password>@<host>/MJTradierDB
# NOTE(review): the password that was previously committed here should be rotated.
db_url = os.environ.get('MJTRADIER_DB_URL')
if not db_url:
    raise RuntimeError(
        'Set the MJTRADIER_DB_URL environment variable to the SQLAlchemy '
        'connection URL before running this cell.'
    )
engine = create_engine(db_url)
conn = engine.connect()

In [None]:
# Load the whole buyReports table into a DataFrame over the open connection.
br = pd.read_sql_table(table_name='buyReports', con=conn)

In [None]:
# Predictor columns taken from buyReports. Every name appears exactly once:
# the original list repeated 'nHogaCnt' and 'nNoMoveCnt', which produced
# duplicate feature columns. Identical copies of a feature distort the
# importance analyses below (the copies compete for the same splits, and
# permuting one copy leaves the other intact).
feature_cols = [
    'nRqTime', 'fPower', 'fStartGap', 'fPowerWithOutGap', 'nHogaCnt', 'nNoMoveCnt',
    'fPowerJar', 'fPlusCnt07', 'fPlusCnt09', 'fMinusCnt07', 'fMinusCnt09',
    'nChegyulCnt', 'nFewSpeedCnt', 'nMissCnt',
    'lTotalTradePrice', 'lTotalBuyPrice', 'lTotalSellPrice', 'lMarketCap',
    'nTotalRank', 'nMinuteTotalRank', 'nFakeBuyCnt', 'nFakeAssistantCnt',
    'nFakeResistCnt', 'nPriceUpCnt', 'nPriceDownCnt', 'nTotalFakeCnt', 'nTotalFakeMinuteCnt',
    'nShootingCnt', 'nDownCntMa20m', 'nDownCntMa1h', 'nDownCntMa2h', 'fTSlope', 'fISlope',
    'fHSlope', 'fRSlope', 'fDAngle',
]
x = br[feature_cols]
y = br[['fMaxPowerAfterBuy']]

# Binary target: 1 when fMaxPowerAfterBuy exceeds 0.02, otherwise 0.
# Vectorized comparison; it also avoids a lambda parameter named `x` that
# shadowed the feature frame above.
y_cat = (y['fMaxPowerAfterBuy'] > 0.02).astype(int)

# Split Train and Test Data

In [None]:
# Hold out 25% of the rows as a test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y_cat,
    random_state=12,
    test_size=0.25,
)

# Filtering Method - 피어슨 상관계수

In [None]:
# Use seaborn's "white" style for the plots that follow.
sns.set(style="white")
# Copy of the training features with the training target attached as 'y_val'.
xy_data = X_train.assign(y_val=y_train)

In [None]:
# Pairwise correlation between the training features (pandas' default method).
cor = X_train.corr()

# Large canvas so every annotated cell of the matrix stays legible.
f, ax = plt.subplots(figsize=(45, 45))
sns.heatmap(cor, annot=True, ax=ax)
plt.title('MJTradier', size=30)

# Label both axes with the feature names.
feature_labels = list(X_train.columns)
ax.set_xticklabels(feature_labels, size=15, rotation=90)
ax.set_yticklabels(feature_labels, size=15, rotation=0);

# SelectKBest

In [None]:
# Score functions that can be passed to SelectKBest:
# For regression: r_regression, f_regression, mutual_info_regression
# For classification: chi2, f_classif, mutual_info_classif

In [None]:
# Univariate selector: keep the 10 features with the highest f_classif score.
selector = SelectKBest(f_classif, k=10)

In [None]:
# Score the features on the training split, then reduce it to the kept columns.
X_train_selected = selector.fit(X_train, y_train).transform(X_train)

In [None]:
# Apply the selection learned on the training split to the test split.
X_test_selected = selector.transform(X_test)
# Show both shapes as the cell output.
(X_train_selected.shape, X_test_selected.shape)

In [None]:
# Boolean mask over the input columns: True where SelectKBest kept the feature.
selected_mask = selector.get_support()
all_names = X_test.columns

# Features that were kept.
selected_names = all_names[selected_mask]
# Features that were dropped.
unselected_names = all_names[np.logical_not(selected_mask)]

print('Selected names: ', selected_names)
print('Unselected names: ', unselected_names)

# SelectFromModel

In [None]:
# will select those features which importance is greater than the mean importance of all the features by default
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_train, y_train)

#sel.get_support() #To see which features are important
# make a list and count the selected features
selected_feat= X_train.columns[(sel.get_support())]
selected_feat

# Decision Tree

In [None]:
# Depth-limited decision tree fitted on the full feature frame `x` and the
# binary target `y_cat` (not on the train split); it is rendered below.
clf = tree.DecisionTreeClassifier(max_depth=7).fit(x, y_cat)

In [None]:
# Export the fitted tree as Graphviz DOT source.
dot_data = tree.export_graphviz(
    clf,                                    # fitted decision-tree model
    out_file=None,                          # return the DOT text instead of writing a file
    feature_names=x.columns,                # feature names shown in the nodes
    class_names=np.array(['fail', 'suc']),  # labels for the two target classes
    filled=True,                            # colour the nodes
    rounded=True,                           # draw node boxes with rounded corners
    special_characters=True,                # allow special characters in labels
)

In [None]:
# Wrap the DOT source so the notebook can render the tree.
graph = graphviz.Source(dot_data)
# Last expression of the cell: displays the rendered graph.
graph

# Random Forest

#### Random Forest Built-in Feature Importance

In [None]:
# Random forest used for the importance analyses below (built-in,
# permutation and SHAP). random_state is fixed (same seed as the train/test
# split) so the reported importances are reproducible.
# NOTE(review): y_train is the 0/1 label, yet a regressor is fitted to it.
# The SHAP cells below index shap_values as a single 2-D array, which matches
# a single-output regressor; confirm before switching to a classifier.
rf = RandomForestRegressor(n_estimators=100, random_state=12)
rf.fit(X_train, y_train)

In [None]:
rf.feature_importances_ # importance of each feature

In [None]:
# Horizontal bar chart of the forest's built-in importances, in column order.
plt.figure(figsize=(20, 20))
plt.barh(y=x.columns, width=rf.feature_importances_)

In [None]:
# Same chart, ordered by ascending importance.
sorted_idx = np.argsort(rf.feature_importances_)
plt.figure(figsize=(20, 20))
plt.barh(
    X_train.columns[sorted_idx],
    rf.feature_importances_[sorted_idx],
)
plt.xlabel("Random Forest Feature Importance")

#### Permutation Based Feature Importance (with scikit-learn)

In [None]:
# Permutation importance of the fitted forest on the held-out split.
# random_state fixes the shuffles (same seed as the train/test split) so the
# ranking is reproducible across runs.
perm_importance = permutation_importance(rf, X_test, y_test, random_state=12)

In [None]:
# Bar chart of the mean permutation importance, ordered ascending.
sorted_idx = np.argsort(perm_importance.importances_mean)
plt.figure(figsize=(20, 20))
plt.barh(
    X_train.columns[sorted_idx],
    perm_importance.importances_mean[sorted_idx],
)
plt.xlabel("Permutation Importance")

#### Feature Importance Computed with SHAP Values

Shap plot에 대해서 
Ref : https://towardsdatascience.com/explain-any-models-with-the-shap-values-use-the-kernelexplainer-79de9464897a
1. 요약플롯 summary_plot()
2. 종속성플롯 : dependence_plot()
3. 개별힘플롯 : force_plot(각각)
4. 집합력플롯 : force_plot()

In [None]:
# Author's note: shap.KernelExplainer's run time grows exponentially with the number of features.
explainer = shap.TreeExplainer(rf) # Author's note: KernelExplainer is the non-tree alternative and is slow; TreeExplainer can also take a long time
shap_values = explainer.shap_values(X_test) # SHAP values computed for X_test (presumably one value per row and feature - TODO confirm)

In [None]:
# Bar-style SHAP summary plot; other plot_type values give other views.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=X_test.columns,
    plot_type="bar",
)

In [None]:
# Default SHAP summary plot, labelled with the feature names.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=x.columns,
)

In [None]:
# SHAP dependence plot for the 'nRqTime' feature.
shap.dependence_plot(
    "nRqTime",
    shap_values,
    X_test,
)

In [None]:
# Load SHAP's JavaScript so the interactive plot renders in the notebook.
shap.initjs()
# Force plot for a single test row (positional index 2).
shap.force_plot(
    explainer.expected_value,
    shap_values[2],
    X_test.iloc[2],
)

In [None]:
# Force plot over every row of X_test; the author notes this is slow.
shap.force_plot(
    explainer.expected_value,
    shap_values,
    X_test,
)