In [1]:
# =============================================================================
# CELL 1: IMPORT LIBRARIES
# =============================================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import joblib
import os

# Printed as the last statement of the import cell, so seeing this line in the
# output confirms that every import above resolved without raising.
print("[OK] Libraries loaded successfully")

[OK] Libraries loaded successfully


In [19]:
# =============================================================================
# CELL 2: LOAD DATA
# =============================================================================
# `os` is already imported in Cell 1, so it is not re-imported here.
# When the kernel was started inside notebooks/, step up one level so the
# relative 'data/...' paths below resolve.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

df_temuan = pd.read_excel('data/Temuan 2425.xlsx')
df_history = pd.read_excel('data/Histori Pemakaian Pelanggan.xlsx')

print(f"Data Temuan (Known Fraud): {len(df_temuan)} rows")
print(f"Data History: {len(df_history)} rows")

# Column in the findings sheet: IDE (not IDPEL).
# Columns in the history sheet: IDE, UE, then the date columns.
#
# Missing and blank IDs are removed BEFORE building the lookup set. Without
# that, astype(str) turns a missing findings ID into the literal string 'nan',
# which then matches every history row whose IDE is also missing and labels
# all of them as known fraud.
ide_temuan = set(df_temuan['IDE'].dropna().astype(str).str.strip()) - {''}
df_history['IDE'] = df_history['IDE'].astype(str).str.strip()
df_history['UE'] = df_history['UE'].astype(str).str.strip()
# Vectorised membership test; 1 = IDE appears in the findings sheet, 0 = it does not.
df_history['label'] = df_history['IDE'].isin(ide_temuan).astype('int64')

# NOTE(review): both counts below are history ROWS, not distinct IDE values.
# A run that reports more label=1 rows than there are findings rows means some
# IDE occur on more than one history row - presumably one row per UE; confirm.
print(f"\nKnown Fraud (label=1): {df_history['label'].sum()} pelanggan")
print(f"Unknown (label=0): {(df_history['label']==0).sum()} pelanggan")

Data Temuan (Known Fraud): 52 rows
Data History: 152974 rows

Known Fraud (label=1): 61 pelanggan
Unknown (label=0): 152913 pelanggan


In [4]:
# =============================================================================
# CELL 3: DATA PREPROCESSING & ELIGIBILITY FILTERING
# =============================================================================
non_date_cols = ['IDE', 'UE', 'label']

# Columns that this cell appends to df_history further down. They must be
# excluded here as well: on a re-run of the cell they already exist, and would
# otherwise be mistaken for month columns (the "last month" would then be
# 'prediction_eligibility', and the numeric coercion would wipe it to NaN).
# NOTE(review): if later cells add further columns to df_history, list them
# here too, or re-run from the load cell before re-running this one.
derived_cols = [
    'active_start_idx',
    'active_end_idx',
    'active_months_count',
    'is_still_active',
    'prediction_eligibility',
]
excluded_cols = set(non_date_cols) | set(derived_cols)

# Everything that is not an identifier, label or derived column is treated as
# one month of usage, in the order the columns appear in the sheet.
date_columns = [col for col in df_history.columns if col not in excluded_cols]
if not date_columns:
    raise ValueError("df_history has no monthly usage columns besides the ID/label columns")

print(f"Period: {date_columns[0]} to {date_columns[-1]} ({len(date_columns)} months)")

# Rows whose active span is shorter than this many months are classified as
# INSUFFICIENT_HISTORY by determine_eligibility.
MIN_ACTIVE_MONTHS = 12

def find_active_period(row):
    """Locate the first and last non-missing positions in one row of values.

    Args:
        row: One-dimensional array-like of values for a single record. Entries
            that pandas regards as missing (NaN / None) count as absent.

    Returns:
        A ``(first, last)`` tuple of positional indices bounding the
        non-missing entries, or ``(None, None)`` when every entry is missing
        (including an empty row).
    """
    present_positions = np.flatnonzero(pd.notna(row))
    if present_positions.size == 0:
        return None, None
    return present_positions[0], present_positions[-1]

# Coerce the monthly readings to numbers BEFORE measuring activity. Otherwise a
# non-numeric cell counts as an active month here and then becomes NaN when the
# coercion runs later in this cell, so the stored active window would describe
# data that no longer exists. Repeating the coercion afterwards is a no-op.
for col in date_columns:
    df_history[col] = pd.to_numeric(df_history[col], errors='coerce')

# Boolean grid: one row per history row, one column per month, True where a
# reading is present. Array operations replace the slow row-by-row iterrows().
has_reading = df_history[date_columns].notna().to_numpy()
row_has_data = has_reading.any(axis=1)

# argmax returns the position of the FIRST True along the axis; running it on
# the column-reversed grid and mirroring the result gives the LAST True.
# Rows with no reading at all yield meaningless positions, masked out below.
first_idx = has_reading.argmax(axis=1)
last_idx = has_reading.shape[1] - 1 - has_reading[:, ::-1].argmax(axis=1)
active_len = np.where(row_has_data, last_idx - first_idx + 1, 0)

# Same list-of-dicts shape as before (None bounds and 0 months for empty rows),
# kept in case a later cell still reads `active_periods`.
active_periods = [
    {
        'start_idx': start if found else None,
        'end_idx': end if found else None,
        'active_months': months,
    }
    for found, start, end, months in zip(
        row_has_data.tolist(), first_idx.tolist(), last_idx.tolist(), active_len.tolist()
    )
]

df_history['active_start_idx'] = [p['start_idx'] for p in active_periods]
df_history['active_end_idx'] = [p['end_idx'] for p in active_periods]
df_history['active_months_count'] = [p['active_months'] for p in active_periods]

# A row is "still active" when its most recent month column holds a reading.
last_month_col = date_columns[-1]
df_history['is_still_active'] = df_history[last_month_col].notna()

def determine_eligibility(row, min_active_months=None):
    """Classify one history row for prediction eligibility.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) providing
            'is_still_active' and 'active_months_count'.
        min_active_months: Minimum active span, in months, required for a row
            to be eligible. Defaults to the notebook-level MIN_ACTIVE_MONTHS,
            so ``df.apply(determine_eligibility, axis=1)`` behaves as before.

    Returns:
        'INACTIVE_STOPPED' when the row is not still active,
        'INSUFFICIENT_HISTORY' when its active span is below the threshold,
        otherwise 'ELIGIBLE'.
    """
    if min_active_months is None:
        min_active_months = MIN_ACTIVE_MONTHS

    # The activity check takes precedence: a stopped row is reported as
    # stopped even when its history is also too short.
    if not row['is_still_active']:
        return 'INACTIVE_STOPPED'
    if row['active_months_count'] < min_active_months:
        return 'INSUFFICIENT_HISTORY'
    return 'ELIGIBLE'

# Tag every history row with its eligibility status (row-wise apply).
eligibility_status = df_history.apply(determine_eligibility, axis=1)
df_history['prediction_eligibility'] = eligibility_status

# Force each month column to a numeric dtype; entries that cannot be parsed
# as numbers become NaN.
for col in date_columns:
    coerced_usage = pd.to_numeric(df_history[col], errors='coerce')
    df_history[col] = coerced_usage

# Banner followed by one line per eligibility status: row count and share of
# all history rows, most frequent status first (value_counts order).
divider = '=' * 50
print(f"\n{divider}")
print("ELIGIBILITY SUMMARY")
print(divider)

total_rows = len(df_history)
status_counts = df_history['prediction_eligibility'].value_counts()
for status, count in status_counts.items():
    pct = count / total_rows * 100
    print(f"  {status}: {count:,} ({pct:.1f}%)")

# Split the history into rows that can be scored and rows that cannot.
# Independent copies are taken so later edits do not touch df_history.
eligible_mask = df_history['prediction_eligibility'] == 'ELIGIBLE'
df_eligible = df_history.loc[eligible_mask].copy()
df_not_eligible = df_history.loc[~eligible_mask].copy()

print(f"\nELIGIBLE for prediction: {len(df_eligible):,}")
print(f"NOT ELIGIBLE: {len(df_not_eligible):,}")

Period: 2021-03-01 00:00:00 to 2026-01-01 00:00:00 (59 months)

ELIGIBILITY SUMMARY
  ELIGIBLE: 132,738 (86.8%)
  INACTIVE_STOPPED: 14,320 (9.4%)
  INSUFFICIENT_HISTORY: 5,916 (3.9%)

ELIGIBLE for prediction: 132,738
NOT ELIGIBLE: 20,236
