# In [None]:
# Analisis Kinerja Kurir dengan Machine Learning Clustering
# Google Colab Version - Final with Courier Names in Cluster Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
import warnings
import joblib

# Suppress every warning for the rest of the run.
warnings.filterwarnings('ignore')

# Visualization style: apply the 'seaborn-v0_8' matplotlib style and
# seaborn's "husl" colour palette to all plots.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# ===== LOAD DATA =====
# Only the read itself is guarded, so a failure while printing the summary
# below can never be misreported as a missing file.
try:
    df = pd.read_csv('Data kp.csv')
except FileNotFoundError:
    print("ERROR: Pastikan file 'Data kp.csv' sudah diunggah ke environment Colab Anda.")
    # exit() is an interactive-shell helper: it returns status 0 and shuts
    # down a notebook kernel. SystemExit(1) stops the run with a failure
    # status in a script and only aborts the current cell in a notebook.
    raise SystemExit(1) from None

# Quick overview of what was loaded.
print("Dataset Info:")
print(f"Shape: {df.shape}")
df.info()
print("\nFirst 5 rows:")
print(df.head())


# ===== DATA PREPROCESSING =====
# Report and drop exact duplicate rows.
print(f"\nJumlah data duplikat sebelum dihapus: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Jumlah data duplikat setelah dihapus: {df.duplicated().sum()}")


# Check missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Convert the timestamp columns that are present to datetime; values that
# cannot be parsed become NaT (errors='coerce').
datetime_columns = ['request_pickup', 'requested_pickup', 'real_pickup', 'manifested_at', 'final_date']
for col in datetime_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Fill missing real_pickup values with requested_pickup plus the mean observed
# pickup delay (per-row delays are rounded to whole hours before averaging).
# Guarded like the other column accesses here so that a missing column falls
# through to the fallback below instead of raising KeyError.
if 'real_pickup' in df.columns and 'requested_pickup' in df.columns:
    df['pickup_delay'] = ((df['real_pickup'] - df['requested_pickup']).dt.total_seconds() / 3600).round()
    avg_delay = pd.to_timedelta(df['pickup_delay'].mean(), unit='h')
    df['real_pickup'] = df['real_pickup'].fillna(df['requested_pickup'] + avg_delay)

# Delivery duration in hours, from actual pickup to final status timestamp.
if 'real_pickup' in df.columns and 'final_date' in df.columns:
    df['delivery_duration'] = (df['final_date'] - df['real_pickup']).dt.total_seconds() / 3600
else:
    # Fallback: no timestamps to measure from, so durations are synthetic
    # uniform draws in [1, 48) hours. Say so loudly, because every
    # duration-based result below is then meaningless.
    print("\nPERINGATAN: Kolom 'real_pickup' atau 'final_date' tidak ditemukan; "
          "'delivery_duration' diisi nilai acak (bukan data nyata).")
    df['delivery_duration'] = np.random.uniform(1, 48, len(df))

# Keep rows that have a courier, a status and a strictly positive duration.
# .copy() makes df_clean an independent frame, since new columns are added
# to it later in the script.
df_clean = df.dropna(subset=['courier', 'delivery_duration', 'delivery_status'])
df_clean = df_clean[df_clean['delivery_duration'] > 0].copy()

def normalize_delivery_status(status):
    """Map a raw delivery status value to a canonical label.

    Matching is case-insensitive and substring-based on the stripped,
    upper-cased text of ``status``.

    Args:
        status: Raw status value; may be missing (None/NaN).

    Returns:
        'UNKNOWN' for a missing value, 'FAILED' when the text contains a
        negated success phrase or a failure keyword, otherwise 'DELIVERED'
        (unrecognized statuses are counted as delivered).
    """
    if pd.isna(status):
        return 'UNKNOWN'

    status_str = str(status).strip().upper()

    # Negated forms embed a success keyword ('UNDELIVERED' contains
    # 'DELIVERED'), so they must be ruled out before the success check.
    negated_success_values = (
        'UNDELIVERED', 'NOT DELIVERED',
        'UNSUCCESS', 'NOT SUCCESS', 'TIDAK SUKSES', 'BELUM SUKSES',
        'INCOMPLETE', 'NOT COMPLETE', 'TIDAK SELESAI', 'BELUM SELESAI',
        'TIDAK DITERIMA', 'BELUM DITERIMA',
    )
    success_values = ('DELIVERED', 'SUCCESS', 'SUKSES', 'COMPLETE', 'SELESAI', 'DITERIMA')
    failed_values = ('FAILED', 'GAGAL', 'CANCEL', 'RETURN', 'PENDING', 'LOST', 'RUSAK')

    if any(marker in status_str for marker in negated_success_values):
        return 'FAILED'
    if any(marker in status_str for marker in success_values):
        return 'DELIVERED'
    if any(marker in status_str for marker in failed_values):
        return 'FAILED'
    # Anything unrecognized is treated as a completed delivery.
    return 'DELIVERED'

# Attach the canonical status label to every cleaned shipment row.
df_clean['delivery_status_normalized'] = df_clean['delivery_status'].map(normalize_delivery_status)

# ===== FEATURE ENGINEERING =====
# Columns the per-courier aggregation reads. Any that the dataset lacks is
# filled with uniform random draws from the listed (low, high) range.
placeholder_ranges = {
    'total_complain': (0, 5),
    'ship_cost': (10000, 50000),
    'solved_percent': (0.5, 1.0),
}
for placeholder_column, (range_low, range_high) in placeholder_ranges.items():
    if placeholder_column in df_clean.columns:
        continue
    df_clean[placeholder_column] = np.random.uniform(range_low, range_high, len(df_clean))

# Aggregate shipment rows into one row of metrics per courier.
# A single groupby pass replaces re-filtering the whole frame once per
# courier. sort=False keeps couriers in order of first appearance, and an
# empty df_clean still yields a frame that has every metric column, so the
# filter that follows reports "too few couriers" instead of raising KeyError.
delivered_flag = df_clean['delivery_status_normalized'].eq('DELIVERED').astype(int)
courier_performance = (
    df_clean.assign(_is_delivered=delivered_flag)
    .groupby('courier', sort=False)
    .agg(
        total_deliveries=('_is_delivered', 'size'),
        successful_deliveries=('_is_delivered', 'sum'),
        avg_delivery_duration=('delivery_duration', 'mean'),
        avg_complain=('total_complain', 'mean'),
        avg_ship_cost=('ship_cost', 'mean'),
        avg_solved_percent=('solved_percent', 'mean'),
    )
    .reset_index()
)

# Derived counts and rates; every group has at least one row, so the
# division can never be by zero.
courier_performance['failed_deliveries'] = (
    courier_performance['total_deliveries'] - courier_performance['successful_deliveries']
)
courier_performance['success_rate'] = (
    courier_performance['successful_deliveries'] / courier_performance['total_deliveries']
)

# Fix the column order, since this frame is later written to CSV.
courier_performance = courier_performance[[
    'courier', 'total_deliveries', 'successful_deliveries', 'failed_deliveries',
    'success_rate', 'avg_delivery_duration', 'avg_complain', 'avg_ship_cost',
    'avg_solved_percent',
]]

# Same data as a list of per-courier dicts, kept under its original name.
courier_aggregated = courier_performance.to_dict('records')

# Keep only couriers with at least `min_deliveries` shipments.
# .copy() makes the result an independent frame, because a 'cluster' column
# is assigned to it further down.
min_deliveries = 5
courier_performance = courier_performance[courier_performance['total_deliveries'] >= min_deliveries].copy()

print(f"\nJumlah kurir yang dianalisis setelah filter: {len(courier_performance)}")

# NOTE: exactly 3 couriers pass this check but are still rejected by the K
# search below, whose smallest candidate (3) exceeds its cap of n_samples - 1.
if len(courier_performance) < 3:
    print("\nERROR: Jumlah kurir terlalu sedikit untuk clustering (minimal 3).")
    # SystemExit(1) rather than exit(): exit() is an interactive-shell helper
    # that reports success (status 0) and shuts down a notebook kernel.
    raise SystemExit(1)

# ===== FEATURE SELECTION AND SCALING =====
# Per-courier metrics used as clustering inputs.
features_for_clustering = [
    'avg_delivery_duration',
    'success_rate',
    'avg_complain',
    'avg_ship_cost',
]

# Missing feature values are replaced with 0 before standardization.
X = courier_performance.loc[:, features_for_clustering].fillna(0)

# Fit the scaler on the feature matrix, then standardize that same matrix.
scaler = StandardScaler()
features_scaled = scaler.fit(X).transform(X)

# ===== DETERMINE THE OPTIMAL NUMBER OF CLUSTERS =====
n_samples = len(features_scaled)
# Candidate K values are capped at n_samples - 1; scikit-learn's
# silhouette_score requires the number of labels to be at most that.
max_k_allowed = n_samples - 1
desired_k = [3, 4, 5, 6, 7]
K_to_test = [k for k in desired_k if k <= max_k_allowed]

if not K_to_test:
    print("\nERROR: Data tidak cukup untuk di-cluster dengan K yang diinginkan.")
    # SystemExit(1) rather than exit(): exit() is an interactive-shell helper
    # that reports success (status 0) and shuts down a notebook kernel.
    raise SystemExit(1)

print(f"\nINFO: Berdasarkan jumlah data ({n_samples} kurir), K akan diuji untuk: {K_to_test}")

# One entry per tested K, in the same order as K_to_test.
inertias = []
silhouette_scores = []
dbi_scores = []

for k in K_to_test:
    # Fixed random_state so repeated runs give the same partitions.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(features_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(features_scaled, labels))
    dbi_scores.append(davies_bouldin_score(features_scaled, labels))

# Side-by-side table of the three metrics, indexed by K.
comparison_df = pd.DataFrame({
    'Jumlah Cluster (K)': K_to_test,
    'Inertia (Elbow)': inertias,
    'Silhouette Score': silhouette_scores,
    'Davies-Bouldin Index': dbi_scores
}).set_index('Jumlah Cluster (K)')

# Print the metric comparison table together with hints on how to read it.
table_rule = "=" * 60
print("\n" + table_rule)
print(" Tabel Perbandingan Metrik untuk Penentuan K Optimal ".center(60, "="))
print(table_rule)
for hint_line in (
    "💡 Ingat:",
    "- Elbow Method      : Cari titik 'siku' dimana penurunan Inertia melambat.",
    "- Silhouette Score  : Cari nilai TERTINGGI (mendekati 1).",
    "- Davies-Bouldin    : Cari nilai TERENDAH (mendekati 0).",
):
    print(hint_line)
print("-" * 60)
print(comparison_df.round(3))
print(table_rule)

# One figure with three side-by-side panels, one per evaluation metric.
plt.figure(figsize=(22, 6))
plt.suptitle(f'Perbandingan Metrik Evaluasi Clustering untuk K={K_to_test}', fontsize=16, y=1.03)

# (values, line format, marker face colour, y-axis label, panel title)
metric_panels = [
    (inertias, 'bo-', 'red', 'Inertia (WCSS)', 'Elbow Method'),
    (silhouette_scores, 'go-', 'blue', 'Silhouette Score', 'Silhouette Score (Makin Tinggi Makin Baik)'),
    (dbi_scores, 'ro-', 'green', 'Davies-Bouldin Index', 'Davies-Bouldin Index (Makin Rendah Makin Baik)'),
]
for panel_no, (metric_values, line_fmt, face_colour, y_label, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, panel_no)
    plt.plot(K_to_test, metric_values, line_fmt, markerfacecolor=face_colour, markersize=8)
    plt.xlabel('Jumlah Cluster (K)', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.title(panel_title, fontsize=14)
    plt.grid(True)

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# Position of the best score for each metric: highest silhouette score,
# lowest Davies-Bouldin index.
best_silhouette_pos = np.argmax(silhouette_scores)
best_dbi_pos = np.argmin(dbi_scores)
optimal_k_silhouette = K_to_test[best_silhouette_pos]
optimal_k_dbi = K_to_test[best_dbi_pos]

print("\n--- Rekomendasi K Optimal Berdasarkan Metrik ---")
print(f"✔️ Berdasarkan Silhouette Score (tertinggi): K = {optimal_k_silhouette}")
print(f"✔️ Berdasarkan Davies-Bouldin Index (terendah): K = {optimal_k_dbi}")

# The silhouette recommendation is the one used for the rest of the analysis.
optimal_k = optimal_k_silhouette
print(f"\n>> Dipilih K = {optimal_k} untuk analisis lebih lanjut berdasarkan Silhouette Score.")

# ===== FINAL CLUSTERING & RESULT VISUALIZATION =====
# Refit K-Means with the chosen K and store each courier's cluster label.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
courier_performance['cluster'] = kmeans.fit_predict(features_scaled)

# Project the scaled features and the cluster centres onto two principal
# components so they can be drawn on a plane.
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)
centroids_pca = pca.transform(kmeans.cluster_centers_)
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

fig, ax = plt.subplots(figsize=(9, 7))
scatter = ax.scatter(
    features_pca[:, 0],
    features_pca[:, 1],
    c=courier_performance['cluster'],
    cmap='viridis',
    alpha=0.8,
    s=80,
)
ax.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    c='red',
    marker='X',
    s=250,
    label='Centroids',
)
ax.set_xlabel(f'Principal Component 1 ({pc1_share:.1%} variance)')
ax.set_ylabel(f'Principal Component 2 ({pc2_share:.1%} variance)')
ax.set_title(f'Visualisasi Cluster Kurir (K={optimal_k}) dengan PCA', fontsize=15)
ax.legend()
fig.colorbar(scatter, label='Cluster ID')
ax.grid(True, alpha=0.5)
plt.show()

# ===== CLUSTERING RESULT ANALYSIS (WITH COURIER NAMES) =====
print("\n" + "="*50 + "\nANALISIS HASIL CLUSTERING\n" + "="*50)
for cluster_id in sorted(courier_performance['cluster'].unique()):
    cluster_data = courier_performance[courier_performance['cluster'] == cluster_id]

    # Courier names in this cluster. Each is converted with str() because
    # str.join raises TypeError on non-string items such as numeric IDs.
    list_of_couriers = [str(name) for name in cluster_data['courier']]

    # Cluster-level averages of the per-courier metrics.
    print(f"\nCLUSTER {cluster_id} ({len(cluster_data)} kurir):")
    print(f"  Rata-rata durasi pengiriman: {cluster_data['avg_delivery_duration'].mean():.2f} jam")
    print(f"  Rata-rata success rate: {cluster_data['success_rate'].mean():.2%}")
    print(f"  Rata-rata komplain: {cluster_data['avg_complain'].mean():.2f}")
    print(f"  Rata-rata biaya pengiriman: Rp {cluster_data['avg_ship_cost'].mean():,.0f}")
    print(f"  Total pengiriman rata-rata: {cluster_data['total_deliveries'].mean():.0f}")

    # List the couriers that belong to this cluster.
    print(f"  Anggota Cluster: {', '.join(list_of_couriers)}")

# ===== IDENTIFY THE BEST AND WORST COURIERS =====
print("\n" + "="*50 + "\nKURIR TERBAIK DAN TERBURUK\n" + "="*50)

# Denominators for scaling duration and complaints; a maximum that is not
# strictly positive falls back to 1.
max_duration = courier_performance['avg_delivery_duration'].max()
if not max_duration > 0:
    max_duration = 1
max_complain = courier_performance['avg_complain'].max()
if not max_complain > 0:
    max_complain = 1

# Weighted score: 40% success rate, 30% speed, 30% low complaint level
# (shorter durations and fewer complaints score higher).
speed_component = 1 - courier_performance['avg_delivery_duration'] / max_duration
complain_component = 1 - courier_performance['avg_complain'] / max_complain
courier_performance['performance_score'] = (
    courier_performance['success_rate'] * 0.4
    + speed_component * 0.3
    + complain_component * 0.3
)

# Show up to five couriers at each end of the ranking.
n_show = min(5, len(courier_performance))
top_couriers = courier_performance.nlargest(n_show, 'performance_score')
worst_couriers = courier_performance.nsmallest(n_show, 'performance_score')

ranking_sections = (
    (f"\nTOP {n_show} KURIR TERBAIK:", top_couriers),
    (f"\nTOP {n_show} KURIR TERBURUK:", worst_couriers),
)
for section_heading, ranked_couriers in ranking_sections:
    print(section_heading)
    for _, row in ranked_couriers.iterrows():
        print(f"  - {row['courier']}: Score={row['performance_score']:.3f}, Rate={row['success_rate']:.1%}, Durasi={row['avg_delivery_duration']:.1f} jam")

# ===== SAVE RESULTS =====
# Write the per-courier table to CSV, then persist the fitted model, scaler
# and PCA. Any failure is reported and the script carries on.
try:
    courier_performance.to_csv('courier_clustering_results.csv', index=False)
    print("\n[OK] Hasil clustering disimpan ke 'courier_clustering_results.csv'")
    fitted_artifacts = (
        (kmeans, 'courier_clustering_model.pkl'),
        (scaler, 'courier_scaler.pkl'),
        (pca, 'courier_pca.pkl'),
    )
    for fitted_object, artifact_path in fitted_artifacts:
        joblib.dump(fitted_object, artifact_path)
    print("[OK] Model, scaler, dan PCA disimpan untuk deployment Streamlit")
except Exception as e:
    print(f"\n[ERROR] Gagal menyimpan hasil: {e}")

# Closing summary line.
print(f"\nAnalisis selesai dengan {len(courier_performance)} kurir dalam {optimal_k} cluster.")