# **Penting**
- Pastikan Anda melakukan Run All sebelum mengirimkan submission untuk memastikan seluruh cell berjalan dengan baik.
- Hapus simbol pagar (#) jika Anda menerapkan kriteria tambahan
- Biarkan simbol pagar (#) jika Anda tidak menerapkan kriteria tambahan

# **1. Import Library**
Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning.

In [18]:
#Type your code here
# Import library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV
import joblib

# **2. Memuat Dataset dari Hasil Clustering**
Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [19]:
# Gunakan dataset hasil clustering yang memiliki fitur Target
# Silakan gunakan dataset data_clustering jika tidak menerapkan Interpretasi Hasil Clustering [Advanced]
# Silakan gunakan dataset data_clustering_inverse jika menerapkan Interpretasi Hasil Clustering [Advanced]
# Gunakan dataset hasil clustering
df_cluster = pd.read_csv('data_clustering_inverse.csv')  # atau 'data_clustering.csv' jika tidak pakai inverse

In [20]:
# Tampilkan 5 baris pertama dengan function head.
# Tampilkan 5 baris pertama
df_cluster.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,Target
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70.0,Doctor,81.0,1.0,5112.21,2024-11-04 08:08:08,3.0
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68.0,Doctor,141.0,1.0,13758.91,2024-11-04 08:09:35,2.0
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19.0,Student,56.0,1.0,1122.35,2024-11-04 08:07:04,1.0
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26.0,Student,25.0,1.0,8569.06,2024-11-04 08:09:06,0.0
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,,Student,198.0,1.0,7429.4,2024-11-04 08:06:39,1.0


# **3. Data Splitting**
Tahap Data Splitting bertujuan untuk memisahkan dataset menjadi dua bagian: data latih (training set) dan data uji (test set).

In [29]:
# Menggunakan train_test_split() untuk melakukan pembagian dataset.
# Pisahkan fitur (X) dan label (y)
X = df_cluster.drop('Target', axis=1)
y = df_cluster['Target']

# Bagi menjadi data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **4. Membangun Model Klasifikasi**
Setelah memilih algoritma klasifikasi yang sesuai, langkah selanjutnya adalah melatih model menggunakan data latih.

Berikut adalah rekomendasi tahapannya.
1. Menggunakan algoritma klasifikasi yaitu Decision Tree.
2. Latih model menggunakan data yang sudah dipisah.

In [30]:
print('Target' in df_cluster.columns)

True


In [31]:
print(df_cluster.columns.tolist())

['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate', 'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID', 'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate', 'Target']


In [32]:
# Drop kolom ID dan tanggal karena tidak relevan untuk prediksi Target
cols_to_drop = ['TransactionID', 'AccountID', 'TransactionDate', 'PreviousTransactionDate',
               'DeviceID', 'IP Address', 'MerchantID']

df_cluster = df_cluster.drop(columns=cols_to_drop, errors='ignore')

In [33]:
# Cek missing values
print(df_cluster.isnull().sum())

# Imputasi nilai kosong jika ada
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')  # atau 'mean' untuk numerik
df_clean = df_cluster.copy()
df_clean[df_clean.select_dtypes(include=['object']).columns] = imputer.fit_transform(
    df_clean.select_dtypes(include=['object'])
)
df_clean[df_clean.select_dtypes(include=['number']).columns] = imputer.fit_transform(
    df_clean.select_dtypes(include=['number'])
)

TransactionAmount      26
TransactionType        30
Location               30
Channel                27
CustomerAge            18
CustomerOccupation     23
TransactionDuration    26
LoginAttempts          21
AccountBalance         27
Target                 70
dtype: int64


In [34]:
# Encode fitur kategorikal dengan pd.get_dummies
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

# Pastikan Target tidak ikut di-encode
if 'Target' in categorical_cols:
    categorical_cols.remove('Target')

# One-hot encoding
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)

In [35]:
# Pisahkan fitur dan label
X = df_encoded.drop('Target', axis=1)
y = df_encoded['Target']

# Split data menjadi train dan test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Buat model
model_dt = DecisionTreeClassifier(random_state=42)

# Latih model
model_dt.fit(X_train, y_train)

# Prediksi
y_pred = model_dt.predict(X_test)

# Evaluasi
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.33858267716535434
              precision    recall  f1-score   support

         0.0       0.23      0.23      0.23       114
         1.0       0.48      0.49      0.48       220
         2.0       0.15      0.14      0.15        50
         3.0       0.25      0.26      0.25       124

    accuracy                           0.34       508
   macro avg       0.28      0.28      0.28       508
weighted avg       0.34      0.34      0.34       508



In [37]:
import joblib

# Simpan model
joblib.dump(model_dt, 'decision_tree_model.h5')

['decision_tree_model.h5']

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Latih model lain
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Evaluasi
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Simpan model
joblib.dump(model_rf, 'explore_RandomForest_classification.h5')

Random Forest Accuracy: 0.39173228346456695
              precision    recall  f1-score   support

         0.0       0.17      0.06      0.09       114
         1.0       0.43      0.81      0.56       220
         2.0       0.00      0.00      0.00        50
         3.0       0.28      0.10      0.15       124

    accuracy                           0.39       508
   macro avg       0.22      0.24      0.20       508
weighted avg       0.29      0.39      0.30       508



['explore_RandomForest_classification.h5']

In [39]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Tuned Model Accuracy:", accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))

# Simpan model hasil tuning
joblib.dump(best_model, 'tuning_classification.h5')

Best Parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
Tuned Model Accuracy: 0.43700787401574803
              precision    recall  f1-score   support

         0.0       0.43      0.03      0.05       114
         1.0       0.43      0.98      0.60       220
         2.0       1.00      0.02      0.04        50
         3.0       0.67      0.02      0.03       124

    accuracy                           0.44       508
   macro avg       0.63      0.26      0.18       508
weighted avg       0.55      0.44      0.28       508



['tuning_classification.h5']

In [40]:
# Buatlah model klasifikasi menggunakan Decision Tree
# Buat model klasifikasi menggunakan Decision Tree
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)

# Prediksi
y_pred_dt = model_dt.predict(X_test)

In [41]:
# Menyimpan Model
# import joblib
# joblib.dump(model, 'decision_tree_model.h5')
# Simpan model Decision Tree
joblib.dump(model_dt, 'decision_tree_model.h5')

['decision_tree_model.h5']

# **5. Memenuhi Kriteria Skilled dan Advanced dalam Membangun Model Klasifikasi**



**Biarkan kosong jika tidak menerapkan kriteria skilled atau advanced**

In [42]:

# Latih model lain: Random Forest
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

In [43]:
# Menampilkan hasil evaluasi akurasi, presisi, recall, dan F1-Score pada seluruh algoritma yang sudah dibuat.
# Evaluasi kedua model
def evaluate_model(y_true, y_pred, model_name):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted')
    rec = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Model {model_name}")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}\n")

evaluate_model(y_test, y_pred_dt, "Decision Tree")
evaluate_model(y_test, y_pred_rf, "Random Forest")

Model Decision Tree
Accuracy: 0.3386, Precision: 0.3368, Recall: 0.3386, F1-Score: 0.3377

Model Random Forest
Accuracy: 0.3917, Precision: 0.2917, Recall: 0.3917, F1-Score: 0.3008



In [44]:
# Menyimpan Model Selain Decision Tree
# Model ini bisa lebih dari satu
# import joblib
# joblib.dump(___, 'explore_<Nama Algoritma>_classification.h5')
# Simpan model Random Forest
joblib.dump(model_rf, 'explore_RandomForest_classification.h5')

['explore_RandomForest_classification.h5']

Hyperparameter Tuning Model

Pilih salah satu algoritma yang ingin Anda tuning

In [45]:
# Lakukan Hyperparameter Tuning dan Latih ulang.
# Hyperparameter tuning menggunakan GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

In [46]:
# Menampilkan hasil evaluasi akurasi, presisi, recall, dan F1-Score pada algoritma yang sudah dituning.
# Evaluasi model setelah tuning
evaluate_model(y_test, y_pred_best, "Tuned Decision Tree")
print("Best Parameters:", grid_search.best_params_)

Model Tuned Decision Tree
Accuracy: 0.4370, Precision: 0.5455, Recall: 0.4370, F1-Score: 0.2836

Best Parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}


In [47]:
# Menyimpan Model hasil tuning
# import joblib
# joblib.dump(model_dt, 'tuning_classification.h5')
# Simpan model hasil tuning
joblib.dump(best_model, 'tuning_classification.h5')

['tuning_classification.h5']

# **6. Reduksi Dimensi dengan PCA & Clustering (Tambahan Skilled/Advanced)**



In [58]:
#1. Preprocessing: Drop kolom tidak relevan dan imputasi

In [48]:
cols_to_drop = ['TransactionID', 'AccountID', 'TransactionDate', 'PreviousTransactionDate',
                'DeviceID', 'IP Address', 'MerchantID']
df = pd.read_csv('data_clustering_inverse.csv')  # atau 'data_clustering.csv'
df = df.drop(columns=cols_to_drop, errors='ignore')


In [60]:
#2. Imputasi nilai kosong

In [49]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
df[df.select_dtypes(include='object').columns] = imputer.fit_transform(df.select_dtypes(include='object'))
df[df.select_dtypes(include='number').columns] = imputer.fit_transform(df.select_dtypes(include='number'))


In [61]:
#3. Encoding kategorikal

In [50]:
df_encoded = pd.get_dummies(df, drop_first=True)


In [62]:
#4. Standardisasi data

In [51]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_encoded.drop(columns=['Target'], errors='ignore'))


In [63]:
#5. PCA - Reduksi dimensi

In [52]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)


In [64]:
#6. Clustering dengan KMeans

In [53]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)

# Tambahkan hasil cluster ke dataframe
df['Target'] = cluster_labels


# **7. Interpretasi Hasil Clustering**
Interpretasi cluster dengan menghitung modus untuk fitur kategorikal.

In [54]:
# Pilih fitur kategorikal asli
categorical_cols = ['TransactionType', 'Location', 'Channel', 'CustomerOccupation']

# Hitung modus untuk setiap cluster
mode_per_cluster = df.groupby('Target')[categorical_cols].agg(lambda x: x.mode().iloc[0])
print(mode_per_cluster)


       TransactionType    Location Channel CustomerOccupation
Target                                                       
0                Debit  Fort Worth  Branch            Student
1                Debit  Fort Worth     ATM           Engineer
2                Debit      Denver  Branch            Retired
3                Debit  Fort Worth  Online           Engineer


# **8. Simpan Dataset Hasil Clustering**


In [55]:
df.to_csv('data_clustering_inverse.csv', index=False)
