# 데이터 획득 및 설정

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import graphviz
import os
from sklearn import tree, ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from skl2onnx.common.data_types import FloatTensorType, Int16TensorType, DoubleTensorType
from skl2onnx import convert_sklearn
from skl2onnx import to_onnx
import shap
from sklearn.feature_selection import chi2, SelectKBest, f_classif, mutual_info_classif, f_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
import onnxmltools

os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/' # windows에서 graphviz를 사용하기 위해 변수경로 설정 

In [2]:
# Open the database connection used by the rest of the notebook.
# The connection URL can be supplied through the MJTRADIER_DB_URL environment
# variable so credentials need not live in the notebook; when the variable is
# unset, the original hard-coded URL is used so existing runs behave the same.
# NOTE(review): the fallback below embeds a username/password in source —
# consider rotating it and removing the literal once the variable is in use.
db_url = os.environ.get(
    'MJTRADIER_DB_URL',
    'mysql://meancl:1234@221.149.119.60:2023/mjtradierdb',
)
engine = create_engine(db_url)
conn = engine.connect()

In [3]:
# Load the whole 'buyreports' table into a DataFrame over the open connection.
br_full_data = pd.read_sql_table(table_name='buyreports', con=conn)

# Filtering

In [4]:
# Keep only the rows where both the isAllBuyed and isAllSelled flags equal 1.
# Optional extra condition (disabled): & (br_full_data['nBuyStrategyIdx'] == 11)
get_filter = br_full_data['isAllBuyed'].eq(1) & br_full_data['isAllSelled'].eq(1)
br = br_full_data[get_filter]

In [5]:
from mylib.featurenames import *

# f_name_102 is expected to come from the star import above.
feature_names_102 = f_name_102

# The same feature list under the different names used by later cells:
# the plain sequence, a NumPy array (for mask/fancy indexing), and its length.
features = feature_names_102
feat_name = np.array(feature_names_102)
feature_size = len(feature_names_102)

In [6]:
# Feature matrix: the selected feature columns of the filtered frame as a float64 NumPy array.
X = br[features].to_numpy(dtype=np.float64)

In [7]:
# Binary target: 1 where fMaxPowerAfterBuyWhile10 is at least 0.025, otherwise 0.
y_condition = br['fMaxPowerAfterBuyWhile10'].ge(0.025)
y = np.where(y_condition, 1, 0)


In [None]:
# Inspect the dtype of the label array.
y.dtype

In [None]:
# Scale the feature matrix with the project's ModelTester helper.
# ROBUST and ModelTester are presumably provided by this star import — verify in mylib.scaler.
from mylib.scaler import *

scale_method = ROBUST
# In testing, Normalizer performed poorly among the scaling methods.
modelTester = ModelTester(engine, conn)
modelTester.setNpData(X)
modelTester.setScaler(scale_method)
modelTester.fitScale()
# The scaled result is read back from the helper's np_data attribute.
X_scaled = modelTester.np_data

In [None]:
# Re-bind X to a float32 copy of the feature matrix.
# NOTE(review): the ONNX export cell declares a double (float64) input — confirm the intended dtype.
X = X.astype(np.float32)

# VarianceThreshold

In [None]:
# Fit a variance filter (threshold 1) on the scaled data; fit() returns the
# selector itself, so both names refer to the same fitted object.
train_thresh = VarianceThreshold(threshold=1).fit(X_scaled)
selector = train_thresh
features_out = selector.get_feature_names_out(input_features=features)

# Show how many features survive, then their names.
print(features_out.shape, features_out, sep='\n')

# SelectKBest

In [None]:
# Scoring methods
# For classification: chi2, f_classif, mutual_info_classif

In [None]:
# Univariate selector: keep the 20 features with the highest f_classif scores.
selector = SelectKBest(f_classif, k=20)

In [None]:
# Fit the selector on (X, y), then reduce X to the selected columns.
X_selected = selector.fit(X, y).transform(X)

In [None]:

# Boolean mask over the features: True where the fitted selector kept the feature.
selected_mask = selector.get_support()

# Names of the features that were kept ...
selected_names = feat_name[selected_mask]
# ... and of the features that were dropped.
unselected_names = feat_name[np.logical_not(selected_mask)]

print('Selected names: ', selected_names, end='\n\n')
print('Unselected names: ', unselected_names)

# SelectFromModel

In [None]:
# Model-based selection: by default SelectFromModel keeps the features whose
# importance is greater than the mean importance of all features.
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X, y)

# Read the support mask from `sel`, the SelectFromModel fitted just above.
# (Previously this used `selector`, a different, earlier-fitted selector object,
# so the names shown here did not reflect the random-forest-based selection.)
selected_feat = feat_name[sel.get_support()]
selected_feat

# Decision Tree - Classifier

In [None]:
# Train a depth-limited decision tree on the full data set.
# Alternative estimator (disabled): tree.ExtraTreeClassifier(max_depth= 5)
clf = tree.DecisionTreeClassifier(max_depth=5).fit(X, y)

In [None]:
# Render the fitted decision tree as Graphviz DOT source.
dot_data = tree.export_graphviz(
    clf,                                      # fitted decision tree to export
    out_file=None,                            # return the DOT text instead of writing a file
    class_names=np.array(['fail', 'suc']),    # display names for the target classes
    feature_names=features,                   # display names for the input features
    special_characters=True,                  # allow special characters in the output
    rounded=True,                             # draw node boxes with rounded corners
    filled=True,                              # color-fill the nodes
)

In [None]:
# Wrap the DOT text in a graphviz Source object; leaving it as the last
# expression makes the notebook render the tree.
graph = graphviz.Source(source=dot_data)
graph

# Random Forest

#### Random Forest Built-in Feature Importance

In [8]:
# Random forest of 100 trees, each limited to depth 5, trained on the full data set.
rf = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
)
rf.fit(X, y)

In [None]:
# initial_type = [('float_input', FloatTensorType([None, 102]))]
# onx = convert_sklearn(rf, initial_types=initial_type, options={type(rf): {'zipmap':False}})

# with open("rf_test4.onnx", "wb") as f:
#     f.write(onx.SerializeToString())

In [None]:
# initial_type = [('float_input', FloatTensorType([None, 102]))]
# onx = convert_sklearn(rf, initial_types=initial_type)

# onx.graph.output[0].type.tensor_type.elem_type = onnxmltools.utils. get_numpy_type(np.int32)

# # Save the ONNX model to a file
# onnxmltools.utils.save_model(onx, 'rf_test5.onnx')

# # with open("rf_test5.onnx", "wb") as f:
# #     f.write(onx.SerializeToString())

In [9]:
# Convert the fitted random forest to ONNX and save it to disk.
# The input width is taken from the fitted model instead of a hard-coded 102,
# so the exported signature stays correct if the feature list changes.
n_model_features = rf.n_features_in_
onx = to_onnx(
    model=rf,
    options={'zipmap': False},  # emit probabilities as a plain tensor rather than a ZipMap
    initial_types=[('input', DoubleTensorType([None, n_model_features]))],
    final_types=[('output', DoubleTensorType([None])),
                 ('prob', DoubleTensorType([None, 2]))],  # two classes: y is 0 or 1
    target_opset=17,
)

# Serialize the ONNX model to a binary file.
with open("rf_test2.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [None]:
# import onnxruntime as rt

# d = 0.01
# l = [d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d, d, d, d, d, d, d, d, d,
#      d, d
#      ]
# n = np.array([l], dtype=np.float64)



In [None]:

# Load an ONNX model and report the name/shape of its first input and first output.
# NOTE(review): `rt` is not bound anywhere in the visible notebook — the only
# `import onnxruntime as rt` sits in a commented-out cell — so this presumably
# raises NameError unless that import is restored. Also, the visible export cell
# writes "rf_test2.onnx", not "rf_test0.onnx"; confirm which file is intended.
sess = rt.InferenceSession("rf_test0.onnx")

print("input name='{}' and shape={} ".format(sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
print("output name='{}' and shape={}".format(sess.get_outputs()[0].name, sess.get_outputs()[0].shape))

In [None]:
# Run the ONNX model on the array `n` and print its first output.
# NOTE(review): neither `rt` nor `n` is bound in the visible notebook — both are
# only defined in a commented-out cell — so this presumably raises NameError as is.
# The visible export cell writes "rf_test2.onnx", not "rf_test0.onnx"; confirm the file.
sess = rt.InferenceSession("rf_test0.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
# Request only the first output and feed `n` cast to float64.
pred_onx = sess.run(
    [label_name], {input_name: n.astype(np.float64)})[0]
print(pred_onx)

In [None]:
rf.feature_importances_ # importance of each feature

In [None]:
# plt.figure(figsize=(20, 20))
# plt.barh(feat_name, rf.feature_importances_)

In [None]:
# Horizontal bar chart of the forest's built-in feature importances,
# ordered by ascending importance.
sorted_idx = np.argsort(rf.feature_importances_)
plt.figure(figsize=(20, 20))
plt.barh(y=feat_name[sorted_idx], width=rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

#### Permutation Based Feature Importance (with scikit-learn)

In [None]:
# Permutation importance of the fitted forest, evaluated on the same data it was trained on.
perm_importance = permutation_importance(estimator=rf, X=X, y=y)

In [None]:
# Horizontal bar chart of the mean permutation importances,
# ordered by ascending importance.
sorted_idx = np.argsort(perm_importance.importances_mean)
plt.figure(figsize=(20, 20))
plt.barh(y=feat_name[sorted_idx], width=perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

#### Feature Importance Computed with SHAP Values

Shap plot에 대해서  
Ref : https://towardsdatascience.com/explain-any-models-with-the-shap-values-use-the-kernelexplainer-79de9464897a  
Ref( Kernel Shap vs Tree Shap ) : https://towardsdatascience.com/kernelshap-vs-treeshap-e00f3b3a27db
1. 요약플롯 summary_plot()
2. 종속성플롯 : dependency_plot()
3. 개별힘플롯 : force_plot(각각)
4. 집합력플롯 : force_plot()

In [None]:
# shap.KernelExplainer's running time grows exponentially with the number of features.
# Author's note: KernelExplainer (not tree-based; linear-regression-based importance
# analysis) is slow, and TreeExplainer takes a long time as well.
rf2 = RandomForestClassifier(n_estimators=100).fit(X, y)
explainer = shap.TreeExplainer(rf2)

# SHAP values for every row of X (author's open question: per-sample importance values?).
shap_values = explainer.shap_values(X)

In [None]:
# Bar-style SHAP summary; plot_type accepts other values as well.
shap.summary_plot(shap_values, X, feature_names=feat_name, plot_type="bar")

In [None]:
# SHAP summary plot with the default plot type.
shap.summary_plot(shap_values, features=X, feature_names=feat_name)

In [None]:
specific_var = "fPlusCnt07"

# dependence_plot needs a 2-D (samples x features) matrix, but explaining a
# classifier yields one set of SHAP values per class: depending on the shap
# version this is a list of 2-D arrays or a single 3-D array
# (samples x features x classes). Use class index 1 (the y == 1 label,
# assuming both classes occur in y); a plain 2-D array is passed through as is.
if isinstance(shap_values, list):
    dep_shap = shap_values[1]
else:
    dep_shap = np.asarray(shap_values)
    if dep_shap.ndim == 3:
        dep_shap = dep_shap[:, :, 1]

shap.dependence_plot(specific_var, dep_shap, X, feature_names=feat_name)

In [None]:
# Row range used by the force plots.
specific_start_row, specific_end_row = 2, 3000

# Load the JavaScript that SHAP's interactive plots need in the notebook.
shap.initjs()

In [None]:
# Force plot for a single row (specific_start_row).
# Explaining a classifier yields per-class output: expected_value holds one base
# value per class, and shap_values is either a list of 2-D arrays or a 3-D array
# (samples x features x classes) depending on the shap version. force_plot wants
# one base value and one vector of SHAP values, so select class index 1
# (the y == 1 label, assuming both classes occur in y).
base_values = np.ravel(explainer.expected_value)
pos_base = base_values[1] if base_values.size > 1 else base_values[0]

if isinstance(shap_values, list):
    pos_shap = shap_values[1]
else:
    pos_shap = np.asarray(shap_values)
    if pos_shap.ndim == 3:
        pos_shap = pos_shap[:, :, 1]

shap.force_plot(pos_base, pos_shap[specific_start_row, :], X[specific_start_row, :], feature_names=feat_name)

In [None]:
# Stacked force plot over the rows specific_start_row .. specific_end_row - 1.
# The end bound was defined but never applied before, so every row from the
# start index onward was plotted.
# Explaining a classifier yields per-class output: expected_value holds one base
# value per class, and shap_values is either a list of 2-D arrays or a 3-D array
# (samples x features x classes) depending on the shap version. Select class
# index 1 (the y == 1 label, assuming both classes occur in y).
base_values = np.ravel(explainer.expected_value)
pos_base = base_values[1] if base_values.size > 1 else base_values[0]

if isinstance(shap_values, list):
    pos_shap = shap_values[1]
else:
    pos_shap = np.asarray(shap_values)
    if pos_shap.ndim == 3:
        pos_shap = pos_shap[:, :, 1]

row_range = slice(specific_start_row, specific_end_row)
shap.force_plot(pos_base, pos_shap[row_range, :], X[row_range, :], feature_names=feat_name)