Decision tree

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Train a class-weighted decision tree on HW3_creditcard.csv and report
# accuracy, per-class precision/recall/F1, AUROC and predicted-class counts.
data = pd.read_csv('HW3_creditcard.csv')

# y: "Class" column, X: all other columns
X = data.drop('Class', axis=1)
y = data['Class']

# Split dataset: train 70%, test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Class 1 is weighted 9x heavier than class 0.
# random_state is fixed because the tree permutes features at every split;
# without a seed, ties between equally good splits can be broken differently
# on each run, so the reported metrics would not be reproducible.
model = DecisionTreeClassifier(class_weight={0: 0.1, 1: 0.9}, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# Recall, precision, and F1-score
print(classification_report(y_test, y_pred))

# AUROC: score each test row with the predicted probability of class 1
# (column 1 of predict_proba).
y_pred_proba = model.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# Number of test rows predicted as each class
print("預測為 Class=0 的數量：", int((y_pred == 0).sum()))
print("預測為 Class=1 的數量：", int((y_pred == 1).sum()))

SMOTE

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
# Anaconda Prompt：
# conda update scikit-learn
# pip install -U imbalanced-learn

# Oversample the training split with SMOTE, train a decision tree on the
# resampled data, and report accuracy, per-class precision/recall/F1, AUROC
# and predicted-class counts on the untouched test split.
data = pd.read_csv('HW3_creditcard.csv')

# y: "Class" column, X: all other columns
X = data.drop('Class', axis=1)
y = data['Class']

# Split dataset: train 70%, test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTE is applied to the training split only, so the test split is evaluated
# on original (non-synthetic) rows.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Train on the SMOTE-resampled data. Fitting on X_train/y_train here would
# silently discard the oversampling performed above.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_res, y_res)
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

# Recall, precision, and F1-score
print(classification_report(y_test, y_pred))

# AUROC: score each test row with the predicted probability of class 1
# (column 1 of predict_proba).
y_pred_proba = model.predict_proba(X_test)[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# Number of test rows predicted as each class
print("預測為 Class=0 的數量：", int((y_pred == 0).sum()))
print("預測為 Class=1 的數量：", int((y_pred == 1).sum()))

XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
# Anaconda Prompt：
# pip install numpy pandas xgboost scikit-learn

# Fit a default XGBoost classifier on HW3_creditcard.csv and print accuracy,
# per-class precision/recall/F1, AUROC and predicted-class counts.
data = pd.read_csv('HW3_creditcard.csv')

# Label is the "Class" column; the features are every remaining column.
y = data['Class']
X = data.drop(columns='Class')

# 70% of the rows go to training, 30% to testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# XGBoost with library-default hyperparameters
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy: ", accuracy)

# Per-class recall, precision, and F1-score
report = classification_report(y_test, y_pred)
print(report)

# AUROC, scored with the predicted probability of class 1
# (column 1 of the predict_proba output).
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC: ", auroc)

# Count how many test rows were predicted as each class
predicted_as_0 = int((y_pred == 0).sum())
predicted_as_1 = int((y_pred == 1).sum())
print("預測為 Class=0 的數量：", predicted_as_0)
print("預測為 Class=1 的數量：", predicted_as_1)