# Import Library

In [4]:
import pandas as pd
import numpy as np
import itertools

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import shap

from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
%matplotlib inline

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

# Define Function

In [None]:
# Handcrafted function definition
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        classes: Tick labels used for both axes, in matrix order.
        normalize: If True, scale each row to sum to 1 so a cell shows the
            fraction of its true class; rows without samples stay at 0.
        title: Title drawn above the plot.
        cmap: Matplotlib colormap used to colour the cells.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where a row has samples so empty rows yield 0, not NaN.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum value get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
def confusionMatrix(modelInput, feature, label, classInfo=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]):
    """Predict with a fitted model and plot the resulting confusion matrix.

    Args:
        modelInput: Fitted estimator exposing ``predict``.
        feature: Feature matrix passed to ``modelInput.predict``.
        label: True class labels as 0-based integers.
        classInfo: Display names for the classes, in label order. The
            default list is only read here, never mutated.
    """
    pred = modelInput.predict(feature)

    # Use the integer class ids directly. One-hot encoding them and taking
    # argmax again returns the same ids, and would tie this helper to
    # TensorFlow for no benefit.
    y_true = np.asarray(label).astype(int).ravel()
    y_pred = np.asarray(pred).astype(int).ravel()

    # Pin the label set so that a class missing from this sample still gets
    # its own row/column; otherwise the matrix shrinks and the tick labels
    # in classInfo no longer line up with the cells. Ids beyond classInfo
    # are kept rather than silently dropped.
    highest_seen = int(max(y_true.max(initial=-1), y_pred.max(initial=-1)))
    n_classes = max(len(classInfo), highest_seen + 1)

    cnf_matrix = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes))
    np.set_printoptions(precision=2)
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=classInfo, normalize=False, title='')
    plt.show()

# Load Dataset

In [None]:
# Path to the CSV file
file_path = './dbids_train_labeled.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Display the DataFrame (as the last expression of the cell, the notebook renders it)
df

# Split Train & Test Set

In [None]:
# Separate the features from the labels
X = df.drop(columns='class')  # every column except 'class' is kept as a feature
y = df['class'].sub(1)  # shift the labels down by one so they start at 0

In [None]:
# Split the data 80/20 into a training set and a test set.
# The shuffle is seeded so the split, and therefore every accuracy reported
# below, is reproducible from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Train XGBoost

In [None]:
# Create the XGBoost classifier and train it on the training split
# (fit() hands back the estimator itself, so it can be bound directly)
model = XGBClassifier().fit(X_train, Y_train)

# Predict the classes of the test split
Y_pred = model.predict(X_test)

# Evaluate and report the accuracy
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Confusion matrix for the XGBoost model, with tick labels '1' through '11'
label = [str(class_number) for class_number in range(1, 12)]
confusionMatrix(model, X_test, Y_test, classInfo=label)

# Train Random Forest

In [None]:
# Create the Random Forest classifier and train it on the training split
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=10,
).fit(X_train, Y_train)

# Predict the classes of the test split
Y_pred_rf = rf_model.predict(X_test)

# Evaluate and report the accuracy
accuracy_rf = accuracy_score(y_true=Y_test, y_pred=Y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")

# Train Decision Tree

In [None]:
# Create the Decision Tree classifier and train it on the training split
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=100,
    min_samples_leaf=10,
).fit(X_train, Y_train)

# Predict the classes of the test split
Y_pred_dt = dt_model.predict(X_test)

# Evaluate and report the accuracy
accuracy_dt = accuracy_score(y_true=Y_test, y_pred=Y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt}")

# Train Support Vector Machine

In [None]:
# Create the Support Vector Machine classifier (configured to lower its
# performance) and train it on the training split
svm_model = SVC(kernel='linear').fit(X_train, Y_train)

# Predict the classes of the test split
Y_pred_svm = svm_model.predict(X_test)

# Evaluate and report the accuracy
accuracy_svm = accuracy_score(y_true=Y_test, y_pred=Y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")

In [None]:


# Compute the SHAP values of `model` over the full feature table X
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Draw the SHAP summary plot
shap.summary_plot(shap_values, features=X)

In [None]:
# Load SHAP's JavaScript so interactive plots can render in the notebook
shap.initjs()
# Rebuild the tree explainer for `model` and recompute the SHAP values over X.
# These are the same two calls the earlier SHAP cell makes, so this cell also
# works when run on its own.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [None]:
#data_point = X.iloc[0]

In [None]:
# Force Plot
#shap.force_plot(explainer.expected_value[0], shap_values[0])

In [None]:
# 2. Decision Plot (row 0, all classes)
# multioutput_decision_plot expects plain Python lists for both the base
# values and the per-class SHAP values, so normalise what the explainer gave.
base_values = np.atleast_1d(explainer.expected_value).tolist()
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    # NOTE(review): assumes the (samples, features, classes) layout that
    # newer shap releases return for multi-class models - confirm against
    # the installed version.
    shap_values_per_class = [
        shap_values[:, :, class_idx] for class_idx in range(shap_values.shape[2])
    ]
else:
    # Legacy return type: already a list with one array per class.
    shap_values_per_class = shap_values
shap.multioutput_decision_plot(base_values, shap_values_per_class, 0)

In [None]:
# 3. Waterfall Plot
# shap.waterfall_plot(shap_values[0], 0)

In [None]:
# 4. Dependence Plot of 'whereClauseNum' for class index 2
# Pick the SHAP matrix of class 2 whichever container the explainer returned:
# a list with one (samples, features) array per class, or a single 3-D array.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    # NOTE(review): assumes the (samples, features, classes) layout that
    # newer shap releases return - confirm against the installed version.
    class_shap_values = shap_values[:, :, 2]
else:
    class_shap_values = shap_values[2]
shap.dependence_plot('whereClauseNum', class_shap_values, X)