# **1. Import Library**

In [266]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# **2. Memuat Dataset dari Hasil Clustering**

Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [267]:
# Location of the CSV exported by the clustering stage.
DATA_URL = "https://raw.githubusercontent.com/mraihanfauzii/Fraud-Detection-Classification-on-Bank-Transaction-Dataset/refs/heads/main/ClusteringResult.csv"

# Read the clustering result into a DataFrame and report its dimensions.
classification_df = pd.read_csv(DATA_URL)
print("Data loaded. Shape:", classification_df.shape)

Data loaded. Shape: (2512, 7)


In [268]:
# Preview the first five rows to sanity-check the columns that were loaded.
classification_df.head(5)

Unnamed: 0,TransactionAmount_log,TransactionType,Channel,LoginAttempts,AccountBalance,AmountBalanceRatio,Cluster
0,2.714032,Debit,ATM,1.0,5112.21,0.000531,0
1,5.932882,Debit,ATM,1.0,13758.91,0.000431,0
2,4.846468,Debit,Online,1.0,1122.35,0.004318,0
3,5.223055,Debit,Online,1.0,8569.06,0.00061,0
4,2.670694,Credit,Online,1.0,7429.4,0.000359,0


# **3. Data Splitting**

Tahap Data Splitting bertujuan untuk memisahkan dataset menjadi dua bagian: data latih (training set) dan data uji (test set).

In [269]:
# Separate the features from the target: 'Cluster' is the label to predict,
# every other column is a feature.
#
# No scaling is done here on purpose. Fitting StandardScaler on the complete
# dataset before the train/test split lets test-set statistics shape the
# preprocessing, and it overwrites the raw values held in classification_df.
# Standardization happens after the split, with a scaler fitted on the
# training data only. That later standardization is invariant to a prior
# positive affine rescaling, so the model inputs are unchanged by removing
# the extra scaling step.
X = classification_df.drop(columns=['Cluster'])
y = classification_df['Cluster']

In [270]:
# Find the text-typed feature columns so they can be converted to numbers.
cat_cols = X.select_dtypes(include='object').columns
print("Kolom Kategorikal:", list(cat_cols))

# Replace each categorical column with integer codes from its own encoder.
for name in cat_cols:
    text_values = X[name].astype(str)
    X[name] = LabelEncoder().fit(text_values).transform(text_values)

Kolom Kategorikal: ['TransactionType', 'Channel']


In [271]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training set:", X_train.shape, y_train.shape)
print("Testing set :", X_test.shape, y_test.shape)

Training set: (2009, 6) (2009,)
Testing set : (503, 6) (503,)


In [272]:
# Standardize the features using statistics learned from the training split
# only, then apply that same transformation to the test split.
# NOTE(review): the categorical columns were label-encoded to integers
# earlier, so this numeric selection picks them up too and their codes get
# standardized as well — confirm that is intended.
num_cols = X_train.select_dtypes(include=np.number).columns

scaler = StandardScaler().fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# **4. Membangun Model Klasifikasi**


## **a. Membangun Model Klasifikasi**

Setelah memilih algoritma klasifikasi yang sesuai, langkah selanjutnya adalah melatih model menggunakan data latih. Di sini saya akan membandingkan 2 model, yaitu Random Forest dan Logistic Regression.

In [273]:
# Baseline estimators with library-default hyperparameters; the seed is
# fixed so the fits are repeatable.
rf_model = RandomForestClassifier(random_state=42)                # Random Forest
log_model = LogisticRegression(random_state=42, max_iter=1000)    # Logistic Regression

# Train both models on the training split.
rf_model.fit(X_train, y_train)
log_model.fit(X_train, y_train)

**Logistic Regression** adalah metode klasifikasi linier yang menggunakan fungsi logistik (sigmoid) untuk memodelkan probabilitas kelas target. Model ini menghitung bobot untuk setiap fitur dan menghasilkan probabilitas suatu sampel termasuk kelas tertentu. **Alasan penggunaannya:** modelnya sederhana, cepat dilatih, dan efektif ketika hubungan antara fitur dan label mendekati linear.

**Random Forest** adalah algoritma ensemble learning yang menggabungkan banyak Decision Tree untuk meningkatkan akurasi dan mengurangi risiko overfitting. **Alasan penggunaannya:** stabil dan akurat, karena dengan menggabungkan banyak pohon, Random Forest sering kali menghasilkan akurasi yang baik dan lebih tahan terhadap overfitting. Selain itu, Random Forest relatif tahan terhadap outlier dan tidak sensitif terhadap perbedaan skala fitur.

Alasan-alasan tersebutlah yang membuat saya tertarik untuk membandingkan hasil kedua algoritma tersebut pada kasus klasifikasi ini.

## **b. Evaluasi Model Klasifikasi**

Berikut adalah tahapan yang dilakukan
1. Melakukan prediksi menggunakan data uji.
2. Menghitung metrik evaluasi seperti Accuracy dan F1-Score.
3. Membuat confusion matrix untuk melihat detail prediksi benar dan salah.

In [274]:
# Score the untuned Random Forest on the held-out test split.
y_pred_rf = rf_model.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf, average='macro')  # macro: classes weighted equally

print("=== Random Forest (Initial) ===")
print(f"Accuracy : {acc_rf*100:.2f}%")
print(f"F1-Score : {f1_rf*100:.2f}%")

print("Confusion Matrix:\n", cm_rf)
print("Classification Report:\n", report_rf)

=== Random Forest (Initial) ===
Accuracy : 99.40%
F1-Score : 97.60%
Confusion Matrix:
 [[468   2]
 [  1  32]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       470
           1       0.94      0.97      0.96        33

    accuracy                           0.99       503
   macro avg       0.97      0.98      0.98       503
weighted avg       0.99      0.99      0.99       503



In [275]:
# Score the untuned Logistic Regression on the held-out test split.
y_pred_log = log_model.predict(X_test)
cm_log = confusion_matrix(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log)
acc_log = accuracy_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log, average='macro')  # macro: classes weighted equally

print("\n=== Logistic Regression (Initial) ===")
print(f"Accuracy : {acc_log*100:.2f}%")
print(f"F1-Score : {f1_log*100:.2f}%")

print("Confusion Matrix:\n", cm_log)
print("Classification Report:\n", report_log)


=== Logistic Regression (Initial) ===
Accuracy : 99.40%
F1-Score : 97.53%
Confusion Matrix:
 [[469   1]
 [  2  31]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       470
           1       0.97      0.94      0.95        33

    accuracy                           0.99       503
   macro avg       0.98      0.97      0.98       503
weighted avg       0.99      0.99      0.99       503



Kedua model sudah sangat tinggi akurasinya (~99.40%) dan F1-Score ( ~97.5–97.6% ).

Perbedaan tipis, di mana Random Forest sedikit unggul dengan F1=0.976 vs. Logistic Regression 0.9753 (selisih sangat kecil).

## **c. Tuning Model Klasifikasi**

Menggunakan GridSearchCV untuk mencari kombinasi hyperparameter terbaik

In [276]:
# Random Forest Tuning
# Random Forest tuning: exhaustive search over 2 * 3 * 2 * 2 = 24 candidates.
param_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Candidates are ranked by macro-F1 under 3-fold cross-validation on the
# training split; n_jobs=-1 uses every available core.
rf_base = RandomForestClassifier(random_state=42)
rf_gs = GridSearchCV(
    rf_base,
    param_rf,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
rf_gs.fit(X_train, y_train)

print("\n=== Best Params for RandomForest ===")
print(rf_gs.best_params_)
print(f"Best CV Score (F1-macro): {rf_gs.best_score_:.4f}")

# Keep the winning estimator for the post-tuning evaluation.
best_rf = rf_gs.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits

=== Best Params for RandomForest ===
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Score (F1-macro): 0.9898


In [277]:
# Logistic Regression Tuning
# Logistic Regression tuning: 4 values of C x 3 solvers x 1 penalty = 12 candidates.
param_log = dict(
    C=[0.01, 0.1, 1.0, 10.0],
    solver=['liblinear', 'lbfgs', 'saga'],
    penalty=['l2'],  # 'l1' if solver supports it
)

# Candidates are ranked by macro-F1 under 3-fold cross-validation on the
# training split; n_jobs=-1 uses every available core.
log_base = LogisticRegression(random_state=42, max_iter=1000)
log_gs = GridSearchCV(
    log_base,
    param_log,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
log_gs.fit(X_train, y_train)

print("\n=== Best Params for LogisticRegression ===")
print(log_gs.best_params_)
print(f"Best CV Score (F1-macro): {log_gs.best_score_:.4f}")

# Keep the winning estimator for the post-tuning evaluation.
best_log = log_gs.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits

=== Best Params for LogisticRegression ===
{'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV Score (F1-macro): 0.9959


## **d. Evaluasi Model Klasifikasi setelah Tuning**

Berikut adalah tahapannya
1. Akan digunakan model dengan hyperparameter terbaik.
2. Akan dihitung ulang metrik evaluasi untuk melihat apakah ada peningkatan performa.

In [278]:
# Score the tuned Random Forest on the held-out test split.
y_pred_rf_best = best_rf.predict(X_test)
cm_rf_best = confusion_matrix(y_test, y_pred_rf_best)
report_rf_best = classification_report(y_test, y_pred_rf_best)
acc_rf_best = accuracy_score(y_test, y_pred_rf_best)
f1_rf_best = f1_score(y_test, y_pred_rf_best, average='macro')  # macro: classes weighted equally

print("\n=== Random Forest (Tuned) ===")
print(f"Accuracy : {acc_rf_best*100:.2f}%")
print(f"F1-Score : {f1_rf_best*100:.2f}%")
print("Confusion Matrix:\n", cm_rf_best)
print("Classification Report:\n", report_rf_best)


=== Random Forest (Tuned) ===
Accuracy : 99.40%
F1-Score : 97.60%
Confusion Matrix:
 [[468   2]
 [  1  32]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       470
           1       0.94      0.97      0.96        33

    accuracy                           0.99       503
   macro avg       0.97      0.98      0.98       503
weighted avg       0.99      0.99      0.99       503



In [279]:
# Score the tuned Logistic Regression on the held-out test split.
y_pred_log_best = best_log.predict(X_test)
cm_log_best = confusion_matrix(y_test, y_pred_log_best)
report_log_best = classification_report(y_test, y_pred_log_best)
acc_log_best = accuracy_score(y_test, y_pred_log_best)
f1_log_best = f1_score(y_test, y_pred_log_best, average='macro')  # macro: classes weighted equally

print("\n=== Logistic Regression (Tuned) ===")
print(f"Accuracy : {acc_log_best*100:.2f}%")
print(f"F1-Score : {f1_log_best*100:.2f}%")
print("Confusion Matrix:\n", cm_log_best)
print("Classification Report:\n", report_log_best)


=== Logistic Regression (Tuned) ===
Accuracy : 99.60%
F1-Score : 98.38%
Confusion Matrix:
 [[469   1]
 [  1  32]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       470
           1       0.97      0.97      0.97        33

    accuracy                           1.00       503
   macro avg       0.98      0.98      0.98       503
weighted avg       1.00      1.00      1.00       503



In [280]:
# Summarize the tuned models side by side and report which of them reach
# the target score on BOTH test-set metrics (accuracy and macro-F1).
print("\n=== Comparison After Tuning ===")
print(f"RandomForest => Accuracy: {acc_rf_best:.4f}, F1: {f1_rf_best:.4f}")
print(f"LogisticReg  => Accuracy: {acc_log_best:.4f}, F1: {f1_log_best:.4f}")

# Minimum value each metric must reach for a model to count as on target.
TARGET_SCORE = 0.92

# Every metric is compared against the target explicitly. An expression of
# the form `a and b >= t` only tests `a` for truthiness, so any non-zero
# score would pass; each metric therefore gets its own comparison.
rf_meets_target = acc_rf_best >= TARGET_SCORE and f1_rf_best >= TARGET_SCORE
log_meets_target = acc_log_best >= TARGET_SCORE and f1_log_best >= TARGET_SCORE

if rf_meets_target and log_meets_target:
    print("Both model has met the 92% target for both Accuracy & F1!")
elif rf_meets_target:
    print("RandomForest has met the 92% target for both Accuracy & F1!")
elif log_meets_target:
    print("LogisticRegression has met the 92% target for both Accuracy & F1!")
else:
    print("Neither model reached 92% for both metrics. Consider further improvements.")

print("\nProses Klasifikasi Selesai.")


=== Comparison After Tuning ===
RandomForest => Accuracy: 0.9940, F1: 0.9760
LogisticReg  => Accuracy: 0.9960, F1: 0.9838
Both model has met the 92% target for both Accuracy & F1!

Proses Klasifikasi Selesai.


## **e. Analisis Hasil Evaluasi Model Klasifikasi**

Berikut adalah **rekomendasi** tahapannya.
1. Perbandingan hasil sebelum dan sesudah tuning.
  - Random Forest:
    - Sebelum Tuning: Accuracy 99.40%, F1 97.60%
    - Sesudah Tuning: Accuracy 99.40%, F1 97.60%
    - Kesimpulan: Tidak ada peningkatan signifikan; parameter default sudah cukup optimal untuk dataset ini.

  - Logistic Regression:
    - Sebelum Tuning: Accuracy 99.40%, F1 97.53%
    - Sesudah Tuning: Accuracy 99.60%, F1 98.38%
    - Kesimpulan: Ada peningkatan cukup baik (sekitar +0.2% akurasi dan +0.85% F1). Tuning hyperparameter C, solver, dan penalty memberikan dampak positif.

2. Identifikasi Kelemahan Model
- Precision atau Recall Rendah?
  - Pada kedua model, kelas 1 (minoritas) memiliki recall dan precision yang sedikit lebih rendah dibanding kelas 0. Selisihnya kecil meskipun dataset cukup imbalance (pada data uji: 470 sampel kelas 0 berbanding 33 sampel kelas 1).
  - Tidak terlihat kelemahan berarti; f1 sudah di atas 0.95 untuk kedua kelas.

- Overfitting / Underfitting?
  - Dengan akurasi ~99% di test set, tampak model fit sangat baik. Tidak tampak tanda underfitting.
  - Tanda overfitting pun minim karena performa pada test set juga tinggi.

- Apakah Masih Bisa Ditingkatkan?
  - Mungkin data set ini memang “mudah” dipisahkan. Jika data real lebih kompleks, model mungkin butuh penyesuaian.

3. Rekomendasi Tindakan Lanjutan
- Mencoba algoritma lain untuk membandingkan performanya agar dapat menganalisisnya.