In [1]:
# =============================================================================
# CELL 1: IMPORT LIBRARIES
# =============================================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import joblib
import os

print("[OK] Libraries loaded successfully")

[OK] Libraries loaded successfully


In [3]:
# =============================================================================
# CELL 2: LOAD DATA
# =============================================================================
df_temuan = pd.read_excel('../data/Temuan 2425.xlsx')
df_history = pd.read_excel('../data/Histori Pemakaian Pelanggan.xlsx')

print(f"Data Temuan (Known Fraud): {len(df_temuan)} rows")
print(f"Data History: {len(df_history)} rows")

# Kolom di Temuan: IDE (bukan IDPEL)
# Kolom di History: IDE, UE, lalu kolom tanggal
ide_temuan = set(df_temuan['IDE'].astype(str).str.strip())
df_history['IDE'] = df_history['IDE'].astype(str).str.strip()
df_history['UE'] = df_history['UE'].astype(str).str.strip()
df_history['label'] = df_history['IDE'].apply(lambda x: 1 if x in ide_temuan else 0)

print(f"\nKnown Fraud (label=1): {df_history['label'].sum()} pelanggan")
print(f"Unknown (label=0): {(df_history['label']==0).sum()} pelanggan")

Data Temuan (Known Fraud): 52 rows
Data History: 152974 rows

Known Fraud (label=1): 61 pelanggan
Unknown (label=0): 152913 pelanggan


In [4]:
# =============================================================================
# CELL 3: DATA PREPROCESSING & ELIGIBILITY FILTERING
# =============================================================================
non_date_cols = ['IDE', 'UE', 'label']
date_columns = [col for col in df_history.columns if col not in non_date_cols]

print(f"Period: {date_columns[0]} to {date_columns[-1]} ({len(date_columns)} months)")

MIN_ACTIVE_MONTHS = 12

def find_active_period(row):
    not_na_mask = ~pd.isna(row)
    if not_na_mask.sum() == 0:
        return None, None
    indices = np.where(not_na_mask)[0]
    return indices[0], indices[-1]

active_periods = []
for idx, row in df_history[date_columns].iterrows():
    start_idx, end_idx = find_active_period(row.values)
    active_length = (end_idx - start_idx + 1) if start_idx is not None else 0
    active_periods.append({'start_idx': start_idx, 'end_idx': end_idx, 'active_months': active_length})

df_history['active_start_idx'] = [p['start_idx'] for p in active_periods]
df_history['active_end_idx'] = [p['end_idx'] for p in active_periods]
df_history['active_months_count'] = [p['active_months'] for p in active_periods]

last_month_col = date_columns[-1]
df_history['is_still_active'] = ~pd.isna(df_history[last_month_col])

def determine_eligibility(row):
    if not row['is_still_active']:
        return 'INACTIVE_STOPPED'
    elif row['active_months_count'] < MIN_ACTIVE_MONTHS:
        return 'INSUFFICIENT_HISTORY'
    else:
        return 'ELIGIBLE'

df_history['prediction_eligibility'] = df_history.apply(determine_eligibility, axis=1)

for col in date_columns:
    df_history[col] = pd.to_numeric(df_history[col], errors='coerce')

print(f"\n{'='*50}")
print("ELIGIBILITY SUMMARY")
print('='*50)
for status, count in df_history['prediction_eligibility'].value_counts().items():
    pct = count / len(df_history) * 100
    print(f"  {status}: {count:,} ({pct:.1f}%)")

df_eligible = df_history[df_history['prediction_eligibility'] == 'ELIGIBLE'].copy()
df_not_eligible = df_history[df_history['prediction_eligibility'] != 'ELIGIBLE'].copy()

print(f"\nELIGIBLE for prediction: {len(df_eligible):,}")
print(f"NOT ELIGIBLE: {len(df_not_eligible):,}")

Period: 2021-03-01 00:00:00 to 2026-01-01 00:00:00 (59 months)

ELIGIBILITY SUMMARY
  ELIGIBLE: 132,738 (86.8%)
  INACTIVE_STOPPED: 14,320 (9.4%)
  INSUFFICIENT_HISTORY: 5,916 (3.9%)

ELIGIBLE for prediction: 132,738
NOT ELIGIBLE: 20,236


In [5]:
# =============================================================================
# CELL 4: FEATURE ENGINEERING - 13 BASIC FEATURES
# =============================================================================
def extract_active_data(row, date_cols):
    start_idx = int(row['active_start_idx']) if not pd.isna(row['active_start_idx']) else 0
    end_idx = int(row['active_end_idx']) if not pd.isna(row['active_end_idx']) else 0
    data = row[date_cols].values[start_idx:end_idx+1]
    return np.array([0 if pd.isna(x) else x for x in data], dtype=float)

print("Extracting 13 Basic Features...")
df_features = df_eligible.copy()

features_data = []
for idx, row in df_features.iterrows():
    data = extract_active_data(row, date_columns)
    if len(data) == 0:
        data = np.array([0])
    
    mean_val = np.mean(data)
    std_val = np.std(data)
    
    features_data.append({
        'usage_mean': mean_val,
        'usage_std': std_val,
        'usage_min': np.min(data),
        'usage_max': np.max(data),
        'usage_range': np.max(data) - np.min(data),
        'coefficient_variation': std_val / mean_val if mean_val > 0 else 0,
        'zero_usage_count': (data == 0).sum(),
        'max_drop': max(0, -np.min(np.diff(data))) if len(data) > 1 else 0,
        'trend_slope': np.polyfit(range(len(data)), data, 1)[0] if len(data) > 1 and np.std(data) > 0 else 0,
        'recent_disconnect': (data[-3:] < 10).sum() if len(data) >= 3 else 0,
        'inactive_months': (data < 5).sum()
    })

for feat in features_data[0].keys():
    df_features[feat] = [f[feat] for f in features_data]

le_ue = LabelEncoder()
df_features['ue_encoded'] = le_ue.fit_transform(df_features['UE'].fillna('UNKNOWN'))

ue_fraud_rate = df_features.groupby('UE')['label'].mean()
df_features['ue_fraud_risk'] = df_features['UE'].map(ue_fraud_rate).fillna(0)

basic_features = ['usage_mean', 'usage_std', 'usage_min', 'usage_max', 'usage_range',
                  'coefficient_variation', 'zero_usage_count', 'max_drop', 'trend_slope',
                  'recent_disconnect', 'inactive_months', 'ue_encoded', 'ue_fraud_risk']

print(f"[OK] 13 Basic Features extracted")
print(f"Features: {basic_features}")

Extracting 13 Basic Features...
[OK] 13 Basic Features extracted
Features: ['usage_mean', 'usage_std', 'usage_min', 'usage_max', 'usage_range', 'coefficient_variation', 'zero_usage_count', 'max_drop', 'trend_slope', 'recent_disconnect', 'inactive_months', 'ue_encoded', 'ue_fraud_risk']


In [6]:
# =============================================================================
# CELL 5: FEATURE ENGINEERING - 14 TEMPORAL FEATURES
# =============================================================================
print("Extracting 14 Temporal Features (comparing first year vs last year)...")

temporal_features_list = []
for idx, row in df_features.iterrows():
    data = extract_active_data(row, date_columns)
    
    if len(data) < 12:
        temporal_features_list.append({k: 0 for k in ['first_year_mean', 'last_year_mean', 'year_change_abs', 
            'year_change_pct', 'first_year_zero', 'last_year_zero', 'zero_increase', 'stability_first',
            'stability_last', 'stability_change', 'max_window_drop', 'volatility_change', 
            'trend_break_score', 'recent_anomaly_score']})
        continue
    
    first_year = data[:12]
    last_year = data[-12:]
    
    first_mean = np.mean(first_year)
    last_mean = np.mean(last_year)
    
    year_change_pct = ((last_mean - first_mean) / first_mean * 100) if first_mean > 0 else 0
    
    first_zero = (first_year == 0).sum()
    last_zero = (last_year == 0).sum()
    
    first_std = np.std(first_year)
    last_std = np.std(last_year)
    
    # Max window drop (6-month windows)
    window_size = 6
    max_drop = 0
    if len(data) >= window_size * 2:
        for i in range(0, len(data) - window_size * 2 + 1, window_size):
            w1 = np.mean(data[i:i+window_size])
            w2 = np.mean(data[i+window_size:i+window_size*2])
            drop = w1 - w2
            if drop > max_drop:
                max_drop = drop
    
    # Trend break
    mid = len(data) // 2
    slope_first = np.polyfit(range(mid), data[:mid], 1)[0] if mid > 1 else 0
    slope_last = np.polyfit(range(len(data) - mid), data[mid:], 1)[0] if len(data) - mid > 1 else 0
    
    # Recent anomaly
    overall_mean = np.mean(data)
    overall_std = np.std(data)
    recent_mean = np.mean(data[-6:])
    anomaly_score = (recent_mean - overall_mean) / overall_std if overall_std > 0 else 0
    
    temporal_features_list.append({
        'first_year_mean': first_mean,
        'last_year_mean': last_mean,
        'year_change_abs': last_mean - first_mean,
        'year_change_pct': year_change_pct,
        'first_year_zero': first_zero,
        'last_year_zero': last_zero,
        'zero_increase': last_zero - first_zero,
        'stability_first': first_std,
        'stability_last': last_std,
        'stability_change': last_std - first_std,
        'max_window_drop': max_drop,
        'volatility_change': abs(last_std - first_std),
        'trend_break_score': abs(slope_first - slope_last),
        'recent_anomaly_score': anomaly_score
    })

temporal_cols = list(temporal_features_list[0].keys())
for feat in temporal_cols:
    df_features[feat] = [f[feat] for f in temporal_features_list]

print(f"[OK] 14 Temporal Features extracted")
print(f"Features: {temporal_cols}")

all_feature_cols = basic_features + temporal_cols
print(f"\nTotal Features: {len(all_feature_cols)}")

Extracting 14 Temporal Features (comparing first year vs last year)...
[OK] 14 Temporal Features extracted
Features: ['first_year_mean', 'last_year_mean', 'year_change_abs', 'year_change_pct', 'first_year_zero', 'last_year_zero', 'zero_increase', 'stability_first', 'stability_last', 'stability_change', 'max_window_drop', 'volatility_change', 'trend_break_score', 'recent_anomaly_score']

Total Features: 27


In [7]:
# =============================================================================
# CELL 6: ISOLATION FOREST (Anomaly Detection)
# =============================================================================
print("Training Isolation Forest for anomaly detection...")

scaler = StandardScaler()
X_all = df_features[all_feature_cols].values
X_all_scaled = scaler.fit_transform(X_all)

iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42, n_jobs=-1)
iso_predictions = iso_forest.fit_predict(X_all_scaled)

df_features['iso_anomaly'] = (iso_predictions == -1).astype(int)

n_anomaly = df_features['iso_anomaly'].sum()
print(f"[OK] Anomalies detected: {n_anomaly:,} ({n_anomaly/len(df_features)*100:.2f}%)")

Training Isolation Forest for anomaly detection...
[OK] Anomalies detected: 1,328 (1.00%)


In [8]:
# =============================================================================
# CELL 7: PSEUDO-LABELING
# =============================================================================
print("Creating pseudo-labels...")

df_known_fraud = df_features[df_features['label'] == 1].copy()
df_unknown = df_features[df_features['label'] == 0].copy()

likely_normal_mask = (
    (df_unknown['iso_anomaly'] == 0) &
    (df_unknown['zero_usage_count'] <= 1) &
    (df_unknown['coefficient_variation'] < 0.5) &
    (df_unknown['year_change_pct'] > -30)
)

df_likely_normal = df_unknown[likely_normal_mask]
n_sample = min(5000, len(df_likely_normal))
df_pseudo_normal = df_likely_normal.sample(n=n_sample, random_state=42)

df_train = pd.concat([df_known_fraud, df_pseudo_normal])

print(f"Known Fraud: {len(df_known_fraud)}")
print(f"Pseudo Normal: {len(df_pseudo_normal)}")
print(f"Training set: {len(df_train)}")

Creating pseudo-labels...
Known Fraud: 41
Pseudo Normal: 5000
Training set: 5041


In [9]:
# =============================================================================
# CELL 8: SMOTE BALANCING
# =============================================================================
print("Balancing with SMOTE...")

X_train = df_train[all_feature_cols].values
y_train = df_train['label'].values
X_train_scaled = scaler.transform(X_train)

k_neighbors = min(5, len(df_known_fraud) - 1)
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print(f"Before SMOTE: Fraud={y_train.sum()}, Normal={(y_train==0).sum()}")
print(f"After SMOTE: Fraud={(y_train_balanced==1).sum()}, Normal={(y_train_balanced==0).sum()}")

Balancing with SMOTE...
Before SMOTE: Fraud=41, Normal=5000
After SMOTE: Fraud=5000, Normal=5000


In [10]:
# =============================================================================
# CELL 9: TRAIN MODELS (Random Forest + XGBoost)
# =============================================================================
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5,
                                   class_weight='balanced', random_state=42, n_jobs=-1)
rf_model.fit(X_train_balanced, y_train_balanced)

print("Training XGBoost...")
xgb_model = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1,
                          scale_pos_weight=10, random_state=42, n_jobs=-1,
                          use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_balanced, y_train_balanced)

print("[OK] Models trained successfully")

Training Random Forest...
Training XGBoost...
[OK] Models trained successfully


In [11]:
# =============================================================================
# CELL 10: PREDICTION & ENSEMBLE
# =============================================================================
print("Making predictions on all eligible customers...")

rf_proba = rf_model.predict_proba(X_all_scaled)[:, 1]
xgb_proba = xgb_model.predict_proba(X_all_scaled)[:, 1]
ensemble_proba = (rf_proba + xgb_proba) / 2

df_features['rf_fraud_prob'] = rf_proba
df_features['xgb_fraud_prob'] = xgb_proba
df_features['ensemble_fraud_prob'] = ensemble_proba
df_features['ensemble_pred'] = (ensemble_proba >= 0.5).astype(int)

def get_priority(prob):
    if prob >= 0.7: return 'CRITICAL'
    elif prob >= 0.5: return 'HIGH'
    elif prob >= 0.3: return 'MEDIUM'
    else: return 'LOW'

df_features['priority'] = df_features['ensemble_fraud_prob'].apply(get_priority)

pred_fraud = (df_features['ensemble_pred'] == 1).sum()
print(f"\nPredicted Fraud: {pred_fraud:,} ({pred_fraud/len(df_features)*100:.2f}%)")

print(f"\nPriority Distribution:")
for priority in ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW']:
    count = (df_features['priority'] == priority).sum()
    print(f"  {priority}: {count:,}")

Making predictions on all eligible customers...

Predicted Fraud: 27,756 (20.91%)

Priority Distribution:
  CRITICAL: 23,151
  HIGH: 4,605
  MEDIUM: 2,190
  LOW: 102,792


In [12]:
# =============================================================================
# CELL 11: EVALUATE ON KNOWN FRAUD
# =============================================================================
print("="*60)
print("MODEL EVALUATION ON KNOWN FRAUD")
print("="*60)

known_fraud = df_features[df_features['label'] == 1]
detected = (known_fraud['ensemble_pred'] == 1).sum()
total = len(known_fraud)
recall = detected / total * 100

print(f"\nKnown Fraud: {total}")
print(f"Detected: {detected}")
print(f"Recall: {recall:.1f}%")

if total > detected:
    missed = known_fraud[known_fraud['ensemble_pred'] == 0]
    print(f"\nMissed ({total - detected}):")
    for _, row in missed.iterrows():
        print(f"  IDE: {row['IDE']}, Prob: {row['ensemble_fraud_prob']:.2f}")

MODEL EVALUATION ON KNOWN FRAUD

Known Fraud: 41
Detected: 41
Recall: 100.0%


In [13]:
# =============================================================================
# CELL 12: FEATURE IMPORTANCE
# =============================================================================
print("="*60)
print("FEATURE IMPORTANCE (Top 15)")
print("="*60)

rf_importance = rf_model.feature_importances_
xgb_importance = xgb_model.feature_importances_
avg_importance = (rf_importance + xgb_importance) / 2

importance_df = pd.DataFrame({
    'Feature': all_feature_cols,
    'RF_Importance': rf_importance,
    'XGB_Importance': xgb_importance,
    'Avg_Importance': avg_importance
}).sort_values('Avg_Importance', ascending=False)

print("\n{:<25} {:>12} {:>12} {:>12}".format('Feature', 'RF', 'XGB', 'Average'))
print("-" * 65)
for i, (_, row) in enumerate(importance_df.head(15).iterrows()):
    print(f"{row['Feature']:<25} {row['RF_Importance']:>12.4f} {row['XGB_Importance']:>12.4f} {row['Avg_Importance']:>12.4f}")

importance_df.head(15)

FEATURE IMPORTANCE (Top 15)

Feature                             RF          XGB      Average
-----------------------------------------------------------------
ue_fraud_risk                   0.2388       0.4457       0.3422
ue_encoded                      0.1488       0.0451       0.0970
year_change_pct                 0.0846       0.0515       0.0680
coefficient_variation           0.0652       0.0247       0.0449
usage_min                       0.0443       0.0389       0.0416
max_window_drop                 0.0317       0.0461       0.0389
stability_last                  0.0261       0.0438       0.0350
usage_std                       0.0206       0.0492       0.0349
stability_first                 0.0323       0.0347       0.0335
recent_anomaly_score            0.0392       0.0153       0.0272
year_change_abs                 0.0363       0.0155       0.0259
usage_mean                      0.0216       0.0221       0.0219
usage_range                     0.0222       0.0199       0.

Unnamed: 0,Feature,RF_Importance,XGB_Importance,Avg_Importance
12,ue_fraud_risk,0.238791,0.445686,0.342238
11,ue_encoded,0.148814,0.045102,0.096958
16,year_change_pct,0.084552,0.051512,0.068032
5,coefficient_variation,0.065218,0.024682,0.04495
2,usage_min,0.04435,0.038932,0.041641
23,max_window_drop,0.031717,0.046066,0.038891
21,stability_last,0.026136,0.043827,0.034982
1,usage_std,0.02059,0.049187,0.034888
20,stability_first,0.032292,0.034684,0.033488
26,recent_anomaly_score,0.039151,0.015281,0.027216


In [14]:
# =============================================================================
# CELL 13: THEFT MONTH DETECTION
# =============================================================================
print("Detecting theft start months...")

def detect_theft_months(row, date_cols, prob):
    """Detect which months show suspicious patterns (likely theft start)"""
    if prob < 0.3:
        return []
    
    data = extract_active_data(row, date_cols)
    if len(data) < 6:
        return []
    
    start_idx = int(row['active_start_idx'])
    suspicious = []
    
    # Calculate baseline from first 6 months
    baseline = np.mean(data[:6]) if len(data) >= 6 else np.mean(data)
    if baseline <= 0:
        baseline = np.mean(data[data > 0]) if (data > 0).any() else 1
    
    # Check each month for anomalies
    for i in range(6, len(data)):
        month_col = date_cols[start_idx + i]
        value = data[i]
        prev_6_mean = np.mean(data[max(0, i-6):i])
        
        # Criteria for suspicious month
        is_zero = value == 0 and baseline > 50
        is_drop_70 = value < baseline * 0.3 and baseline > 30
        is_sudden_drop = prev_6_mean > 0 and value < prev_6_mean * 0.3
        
        if is_zero or is_drop_70 or is_sudden_drop:
            # Determine severity
            if is_zero:
                severity = 'CRITICAL'
            elif value < baseline * 0.2:
                severity = 'CRITICAL'
            elif value < baseline * 0.3:
                severity = 'HIGH'
            else:
                severity = 'MEDIUM'
            
            suspicious.append({
                'month': month_col,
                'value': value,
                'baseline': baseline,
                'drop_pct': (1 - value/baseline) * 100 if baseline > 0 else 0,
                'severity': severity
            })
    
    return suspicious

theft_patterns = []
for idx, row in df_features.iterrows():
    prob = row['ensemble_fraud_prob']
    patterns = detect_theft_months(row, date_columns, prob)
    
    if patterns:
        first_suspicious = patterns[0]
        theft_patterns.append({
            'IDE': row['IDE'],
            'UE': row['UE'],
            'fraud_prob': prob,
            'priority': row['priority'],
            'first_theft_month': first_suspicious['month'],
            'first_theft_value': first_suspicious['value'],
            'baseline_usage': first_suspicious['baseline'],
            'drop_percentage': first_suspicious['drop_pct'],
            'severity': first_suspicious['severity'],
            'suspicious_months_count': len(patterns),
            'all_suspicious': patterns
        })

df_theft = pd.DataFrame(theft_patterns)
print(f"\n[OK] Detected {len(df_theft):,} customers with suspicious months")

if len(df_theft) > 0:
    print(f"\nSeverity Distribution:")
    for sev in ['CRITICAL', 'HIGH', 'MEDIUM']:
        count = (df_theft['severity'] == sev).sum()
        print(f"  {sev}: {count:,}")

Detecting theft start months...

[OK] Detected 19,166 customers with suspicious months

Severity Distribution:
  CRITICAL: 10,317
  HIGH: 7,267
  MEDIUM: 1,582


In [15]:
# =============================================================================
# CELL 14: CREATE THEFT MONTHS MATRIX (for Excel coloring)
# =============================================================================
print("Creating theft months matrix...")

# Get high priority suspects
high_priority_ides = df_features[df_features['priority'].isin(['CRITICAL', 'HIGH'])]['IDE'].tolist()

# Create matrix with all months
matrix_records = []
for _, row in df_features[df_features['IDE'].isin(high_priority_ides)].iterrows():
    record = {
        'IDE': row['IDE'],
        'NAMA': row['UE'],
        'FRAUD_PROB': round(row['ensemble_fraud_prob'], 3),
        'PRIORITY': row['priority'],
        'KNOWN_FRAUD': 'YES' if row['label'] == 1 else 'NO'
    }
    
    # Get theft months for this customer
    theft_info = df_theft[df_theft['IDE'] == row['IDE']]
    suspicious_months = {}
    if len(theft_info) > 0:
        for pattern in theft_info.iloc[0]['all_suspicious']:
            suspicious_months[pattern['month']] = pattern['severity']
    
    # Add each month column
    for col in date_columns:
        value = row[col] if not pd.isna(row[col]) else 'N/A'
        if col in suspicious_months:
            record[col] = f"{value}|{suspicious_months[col]}"
        else:
            record[col] = value
    
    matrix_records.append(record)

df_matrix = pd.DataFrame(matrix_records)
print(f"[OK] Matrix created: {len(df_matrix)} customers x {len(date_columns)} months")

Creating theft months matrix...
[OK] Matrix created: 31089 customers x 59 months


In [16]:
# =============================================================================
# CELL 15: EXPORT TO EXCEL WITH COLOR CODING
# =============================================================================
print("="*70)
print("EXPORTING TO EXCEL WITH COLOR CODING")
print("="*70)

output_file = '../results/electricity_theft_detection_results.xlsx'

# Create workbook
wb = Workbook()
ws = wb.active
ws.title = 'Theft_Detection_Matrix'

# Define styles
header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
header_font = Font(bold=True, color='FFFFFF', size=11)

critical_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')  # Red
high_fill = PatternFill(start_color='FF6600', end_color='FF6600', fill_type='solid')      # Orange
medium_fill = PatternFill(start_color='FFCC00', end_color='FFCC00', fill_type='solid')    # Yellow
normal_fill = PatternFill(start_color='92D050', end_color='92D050', fill_type='solid')    # Green
na_fill = PatternFill(start_color='D9D9D9', end_color='D9D9D9', fill_type='solid')        # Gray

critical_font = Font(bold=True, color='FFFFFF')
high_font = Font(bold=True, color='000000')
medium_font = Font(bold=True, color='000000')

thin_border = Border(
    left=Side(style='thin'),
    right=Side(style='thin'),
    top=Side(style='thin'),
    bottom=Side(style='thin')
)

# Write header
columns = ['IDE', 'NAMA', 'FRAUD_PROB', 'PRIORITY', 'KNOWN_FRAUD'] + date_columns
for col_idx, col_name in enumerate(columns, 1):
    cell = ws.cell(row=1, column=col_idx, value=col_name)
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal='center', vertical='center')
    cell.border = thin_border

# Write data with color coding
print("Writing data with color coding...")
for row_idx, record in enumerate(matrix_records, 2):
    for col_idx, col_name in enumerate(columns, 1):
        value = record.get(col_name, '')
        cell = ws.cell(row=row_idx, column=col_idx)
        cell.border = thin_border
        cell.alignment = Alignment(horizontal='center', vertical='center')
        
        # Check if it's a month column with severity
        if col_name in date_columns and isinstance(value, str) and '|' in str(value):
            parts = value.split('|')
            cell.value = float(parts[0]) if parts[0] != 'N/A' else 'N/A'
            severity = parts[1] if len(parts) > 1 else ''
            
            if severity == 'CRITICAL':
                cell.fill = critical_fill
                cell.font = critical_font
            elif severity == 'HIGH':
                cell.fill = high_fill
                cell.font = high_font
            elif severity == 'MEDIUM':
                cell.fill = medium_fill
                cell.font = medium_font
        elif value == 'N/A':
            cell.value = 'N/A'
            cell.fill = na_fill
        else:
            cell.value = value
            # Color priority column
            if col_name == 'PRIORITY':
                if value == 'CRITICAL':
                    cell.fill = critical_fill
                    cell.font = critical_font
                elif value == 'HIGH':
                    cell.fill = high_fill

# Adjust column widths
ws.column_dimensions['A'].width = 15
ws.column_dimensions['B'].width = 25
ws.column_dimensions['C'].width = 12
ws.column_dimensions['D'].width = 12
ws.column_dimensions['E'].width = 12
for col_idx in range(6, len(columns) + 1):
    ws.column_dimensions[get_column_letter(col_idx)].width = 10

# Freeze panes
ws.freeze_panes = 'F2'

# Add legend sheet
ws_legend = wb.create_sheet('Legend')
legend_data = [
    ['Color', 'Meaning', 'Description'],
    ['RED', 'CRITICAL', 'Bulan dengan penurunan >80% atau usage=0 saat baseline tinggi'],
    ['ORANGE', 'HIGH', 'Bulan dengan penurunan 70-80% dari baseline'],
    ['YELLOW', 'MEDIUM', 'Bulan dengan penurunan 50-70% dari baseline'],
    ['GREEN', 'NORMAL', 'Bulan dengan pola normal'],
    ['GRAY', 'N/A', 'Belum berlangganan atau sudah berhenti']
]
for row_idx, row_data in enumerate(legend_data, 1):
    for col_idx, value in enumerate(row_data, 1):
        cell = ws_legend.cell(row=row_idx, column=col_idx, value=value)
        if row_idx == 1:
            cell.font = Font(bold=True)
        if col_idx == 1 and row_idx > 1:
            colors = {'RED': 'FF0000', 'ORANGE': 'FF6600', 'YELLOW': 'FFCC00', 'GREEN': '92D050', 'GRAY': 'D9D9D9'}
            cell.fill = PatternFill(start_color=colors.get(value, 'FFFFFF'), fill_type='solid')

# Add summary sheet
ws_summary = wb.create_sheet('Summary')
summary_data = [
    ['Metric', 'Value'],
    ['Total Eligible Customers', len(df_features)],
    ['Predicted Fraud', pred_fraud],
    ['Known Fraud (Temuan)', total],
    ['Known Fraud Detected', detected],
    ['Recall Rate', f"{recall:.1f}%"],
    ['CRITICAL Priority', (df_features['priority'] == 'CRITICAL').sum()],
    ['HIGH Priority', (df_features['priority'] == 'HIGH').sum()],
    ['MEDIUM Priority', (df_features['priority'] == 'MEDIUM').sum()],
    ['Customers with Theft Months', len(df_theft)]
]
for row_idx, row_data in enumerate(summary_data, 1):
    for col_idx, value in enumerate(row_data, 1):
        cell = ws_summary.cell(row=row_idx, column=col_idx, value=value)
        if row_idx == 1:
            cell.font = Font(bold=True)
            cell.fill = header_fill
            cell.font = header_font

# Save workbook
wb.save(output_file)
print(f"\n[OK] File saved: {output_file}")
print(f"\nSheets:")
print(f"  1. Theft_Detection_Matrix - Main results with color-coded theft months")
print(f"  2. Legend - Color meaning explanation")
print(f"  3. Summary - Key metrics")

EXPORTING TO EXCEL WITH COLOR CODING
Writing data with color coding...

[OK] File saved: ../results/electricity_theft_detection_results.xlsx

Sheets:
  1. Theft_Detection_Matrix - Main results with color-coded theft months
  2. Legend - Color meaning explanation
  3. Summary - Key metrics


In [17]:
# =============================================================================
# CELL 16: SAVE MODELS
# =============================================================================
print("Saving models...")

os.makedirs('../models', exist_ok=True)

joblib.dump(rf_model, '../models/random_forest.joblib')
joblib.dump(xgb_model, '../models/xgboost.joblib')
joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(le_ue, '../models/label_encoder_ue.joblib')
joblib.dump(iso_forest, '../models/isolation_forest.joblib')

# Save metadata
import json
metadata = {
    'features': all_feature_cols,
    'n_features': len(all_feature_cols),
    'min_active_months': MIN_ACTIVE_MONTHS,
    'training_samples': len(df_train),
    'known_fraud_count': len(df_known_fraud),
    'recall': recall
}
with open('../models/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("[OK] Models saved to ../models/")

Saving models...
[OK] Models saved to ../models/


In [18]:
# =============================================================================
# CELL 17: FINAL SUMMARY
# =============================================================================
print("="*70)
print("                    FINAL SUMMARY")
print("="*70)
print(f"""
DATA:
  - Total pelanggan: {len(df_history):,}
  - Eligible untuk prediksi: {len(df_features):,}
  - Tidak eligible (< 12 bulan): {len(df_not_eligible):,}

FEATURES:
  - Basic Features: 13
  - Temporal Features: 14
  - Total: 27 features

MODEL PERFORMANCE:
  - Known Fraud: {total}
  - Detected: {detected}
  - Recall: {recall:.1f}%

PREDICTIONS:
  - CRITICAL (prob >= 70%): {(df_features['priority'] == 'CRITICAL').sum():,}
  - HIGH (prob >= 50%): {(df_features['priority'] == 'HIGH').sum():,}
  - MEDIUM (prob >= 30%): {(df_features['priority'] == 'MEDIUM').sum():,}

OUTPUT FILES:
  - results/electricity_theft_detection_results.xlsx
    (dengan color coding: MERAH=theft bulan critical)

""")
print("="*70)

                    FINAL SUMMARY

DATA:
  - Total pelanggan: 152,974
  - Eligible untuk prediksi: 132,738
  - Tidak eligible (< 12 bulan): 20,236

FEATURES:
  - Basic Features: 13
  - Temporal Features: 14
  - Total: 27 features

MODEL PERFORMANCE:
  - Known Fraud: 41
  - Detected: 41
  - Recall: 100.0%

PREDICTIONS:
  - CRITICAL (prob >= 70%): 23,151
  - HIGH (prob >= 50%): 4,605
  - MEDIUM (prob >= 30%): 2,190

OUTPUT FILES:
  - results/electricity_theft_detection_results.xlsx
    (dengan color coding: MERAH=theft bulan critical)


