Decision tree

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

data = pd.read_csv('HW3_creditcard.csv')

# y: "Class" column, X: other columns 
X = data.drop('Class', axis=1)
y = data['Class']

# split dataset: train 70%, test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = DecisionTreeClassifier(class_weight={0: 0.1, 1: 0.9})   # add weight
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# recall, precisioin, and F1-score
print(classification_report(y_test, y_pred))

# AUROC
y_pred_proba = model.predict_proba(X_test)[:, 1]  # 預測每個類別的概率，這裡選擇了類別1的概率
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# class count
print("預測為 Class=0 的數量：", len(y_pred[y_pred==0]))
print("預測為 Class=1 的數量：", len(y_pred[y_pred==1]))

accuracy:  0.9946992394560958
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8523
           1       0.86      0.84      0.85       155

    accuracy                           0.99      8678
   macro avg       0.93      0.92      0.92      8678
weighted avg       0.99      0.99      0.99      8678

AUROC:  0.9181228781324159
預測為 Class=0 的數量： 8527
預測為 Class=1 的數量： 151


SMOTE

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
# Anaconda Prompt：
# conda update scikit-learn
# pip install -U imbalanced-learn

data = pd.read_csv('HW3_creditcard.csv')

# y: "Class" column, X: other columns 
X = data.drop('Class', axis=1)
y = data['Class']

# split dataset: train 70%, test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# add weight, and train
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# recall, precisioin, and F1-score
print(classification_report(y_test, y_pred))

# AUROC
y_pred_proba = model.predict_proba(X_test)[:, 1]  # 預測每個類別的概率，這裡選擇了類別1的概率
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# class count
print("預測為 Class=0 的數量：", len(y_pred[y_pred==0]))
print("預測為 Class=1 的數量：", len(y_pred[y_pred==1]))

accuracy:  0.9943535376814935
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8551
           1       0.80      0.81      0.81       127

    accuracy                           0.99      8678
   macro avg       0.90      0.90      0.90      8678
weighted avg       0.99      0.99      0.99      8678

AUROC:  0.9040499936923158
預測為 Class=0 的數量： 8550
預測為 Class=1 的數量： 128


XGBoost

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
# Anaconda Prompt：
# pip install numpy pandas xgboost scikit-learn

data = pd.read_csv('HW3_creditcard.csv')

# y: "Class" column, X: other columns 
X = data.drop('Class', axis=1)
y = data['Class']

# split dataset: train 70%, test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# XGBoost
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# recall, precisioin, and F1-score
print(classification_report(y_test, y_pred))

# AUROC
y_pred_proba = model.predict_proba(X_test)[:, 1]  # 預測每個類別的概率，這裡選擇了類別1的概率
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# 統計預測的Class數量
print("預測為 Class=0 的數量：", len(y_pred[y_pred==0]))
print("預測為 Class=1 的數量：", len(y_pred[y_pred==1]))

accuracy:  0.9974648536529154
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8523
           1       0.99      0.86      0.92       155

    accuracy                           1.00      8678
   macro avg       1.00      0.93      0.96      8678
weighted avg       1.00      1.00      1.00      8678

AUROC:  0.9782841873791221
預測為 Class=0 的數量： 8543
預測為 Class=1 的數量： 135


Class=0的數量遠多於Class=1的數量，所以使用SMOTE增量資料，準確率較之前佳。