In [32]:
# ============================================================================
# TASK 4: CREDIT RISK PROXY TARGET ENGINEERING - IMPORTS
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
from datetime import datetime
import json

# Set professional styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully!")
print(f"üìÖ Execution timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

‚úÖ All libraries imported successfully!
üìÖ Execution timestamp: 2025-12-16 09:50:35


In [33]:
# ============================================================================
# SECTION 1: DATA LOADING WITH MULTIPLE PATH OPTIONS
# ============================================================================
print("="*80)
print("üìä SECTION 1: DATA LOADING AND VALIDATION")
print("="*80)

# Get current directory
current_dir = os.getcwd()
print(f"üìÇ Current working directory: {current_dir}")

# Define multiple possible data paths
data_paths = [
    '../../data/processed/cleaned_data.csv',  # From notebooks/task4
    '../data/processed/cleaned_data.csv',
    'data/processed/cleaned_data.csv',
    './cleaned_data.csv'
]

# Try to load data from multiple paths
data_loaded = False
df = None

for path in data_paths:
    if os.path.exists(path):
        print(f"üîç Found data at: {path}")
        try:
            df = pd.read_csv(path)
            print(f"‚úÖ Data loaded successfully!")
            data_loaded = True
            break
        except Exception as e:
            print(f"‚ö†Ô∏è Error loading {path}: {e}")

if not data_loaded:
    print("‚ùå Could not load data from any path.")
    print("Creating sample data for demonstration...")
    # Create sample data structure
    np.random.seed(42)
    dates = pd.date_range('2022-01-01', '2023-12-31', freq='h')
    df = pd.DataFrame({
        'CustomerId': np.random.choice([f'CUST_{i:04d}' for i in range(1, 1001)], size=5000),
        'TransactionId': [f'TXN_{i:06d}' for i in range(5000)],
        'Amount': np.random.lognormal(6, 1.5, 5000),
        'TransactionStartTime': np.random.choice(dates, 5000),
        'ProductCategory': np.random.choice(['electronics', 'fashion', 'home', 'services'], 5000),
        'FraudResult': np.random.binomial(1, 0.002, 5000)
    })
    print(f"üìù Created sample dataset: {len(df):,} transactions")

print(f"\nüìã DATA OVERVIEW:")
print(f"‚Ä¢ Total transactions: {len(df):,}")
print(f"‚Ä¢ Total columns: {len(df.columns)}")
print(f"‚Ä¢ Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

üìä SECTION 1: DATA LOADING AND VALIDATION
üìÇ Current working directory: d:\10 acadamy\Credit Risk  Model\notebooks\task4
üîç Found data at: ../../data/processed/cleaned_data.csv
‚úÖ Data loaded successfully!

üìã DATA OVERVIEW:
‚Ä¢ Total transactions: 95,662
‚Ä¢ Total columns: 21
‚Ä¢ Memory usage: 78.6 MB


In [34]:
# ============================================================================
# SECTION 2: DATA PREPROCESSING & VALIDATION
# ============================================================================
print("\n" + "="*80)
print("üîß SECTION 2: DATA PREPROCESSING & VALIDATION")
print("="*80)

# Convert TransactionStartTime to datetime
print("üîÑ Converting TransactionStartTime to datetime...")
try:
    df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
    print(f"‚úÖ Successfully converted TransactionStartTime")
except Exception as e:
    print(f"‚ö†Ô∏è Error converting dates: {e}")
    # Create synthetic dates if conversion fails
    dates = pd.date_range('2022-01-01', '2023-12-31', periods=len(df))
    df['TransactionStartTime'] = dates

# Check for required columns
print("\nüîç COLUMN VALIDATION:")
required_cols = ['CustomerId', 'Amount', 'TransactionStartTime']
for col in required_cols:
    if col in df.columns:
        print(f"  ‚úÖ {col}: Found")
    else:
        # Try to find similar columns
        similar = [c for c in df.columns if col.lower() in c.lower()]
        if similar:
            print(f"  ‚ö†Ô∏è  {col}: Not found, using '{similar[0]}' instead")
            df = df.rename(columns={similar[0]: col})
        else:
            print(f"  ‚ùå {col}: Not found and no alternatives")

# Data quality check
print(f"\nüìä DATA QUALITY SUMMARY:")
print(f"‚Ä¢ Missing values: {df.isnull().sum().sum()}")
print(f"‚Ä¢ Duplicate rows: {df.duplicated().sum()}")
print(f"‚Ä¢ Date range: {df['TransactionStartTime'].min().date()} to {df['TransactionStartTime'].max().date()}")
print(f"‚Ä¢ Transaction volume: ${df['Amount'].sum():,.2f}")
print(f"‚Ä¢ Unique customers: {df['CustomerId'].nunique():,}")

print("\n" + "="*80)
print("‚úÖ DATA PREPROCESSING COMPLETE")
print("="*80)


üîß SECTION 2: DATA PREPROCESSING & VALIDATION
üîÑ Converting TransactionStartTime to datetime...
‚úÖ Successfully converted TransactionStartTime

üîç COLUMN VALIDATION:
  ‚úÖ CustomerId: Found
  ‚úÖ Amount: Found
  ‚úÖ TransactionStartTime: Found

üìä DATA QUALITY SUMMARY:
‚Ä¢ Missing values: 0
‚Ä¢ Duplicate rows: 0
‚Ä¢ Date range: 2018-11-15 to 2019-02-13
‚Ä¢ Transaction volume: $642,642,625.50
‚Ä¢ Unique customers: 3,742

‚úÖ DATA PREPROCESSING COMPLETE


In [35]:
# ============================================================================
# SECTION 3: RFM METRICS CALCULATION
# ============================================================================
print("\n" + "="*80)
print("üéØ SECTION 3: RFM METRICS CALCULATION")
print("="*80)

print("\nüìù RFM METHODOLOGY EXPLANATION:")
print("-" * 60)
print("RECENCY (R): Days since last transaction")
print("   ‚Ä¢ Higher recency = Higher risk (disengaged customer)")
print("   ‚Ä¢ Risk thresholds: <30d=Low, 30-90d=Medium, >90d=High")
print("\nFREQUENCY (F): Total transaction count")
print("   ‚Ä¢ Higher frequency = Lower risk (engaged customer)")
print("   ‚Ä¢ Industry standard: ‚â•3 transactions for credit assessment")
print("\nMONETARY (M): Total transaction value")
print("   ‚Ä¢ Using absolute values for clustering")
print("   ‚Ä¢ High value ‚â† necessarily low risk")

# Set snapshot date for recency calculation
snapshot_date = df['TransactionStartTime'].max()
print(f"\nüìÖ BUSINESS SNAPSHOT DATE: {snapshot_date.date()}")
print("   (All recency calculated relative to this date)")

print("\nüßÆ CALCULATING RFM METRICS...")
# Calculate RFM per customer
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
    'TransactionId': 'count',
    'Amount': 'sum'
}).rename(columns={
    'TransactionStartTime': 'recency_days',
    'TransactionId': 'transaction_frequency',
    'Amount': 'total_monetary_value'
})

# Handle negative monetary values (credits)
rfm['total_monetary_value'] = rfm['total_monetary_value'].abs()

# Calculate additional metrics
rfm['avg_transaction_value'] = rfm['total_monetary_value'] / rfm['transaction_frequency']
rfm['customer_lifetime_days'] = df.groupby('CustomerId')['TransactionStartTime'].apply(lambda x: (x.max() - x.min()).days)
rfm['transaction_intensity'] = rfm['transaction_frequency'] / (rfm['customer_lifetime_days'] + 1)  # Avoid division by zero

print(f"‚úÖ RFM calculated for {len(rfm):,} customers")

print("\nüìä RFM STATISTICS SUMMARY:")
rfm_summary = rfm[['recency_days', 'transaction_frequency', 'total_monetary_value']].describe()
print(rfm_summary.round(2))

# Save intermediate RFM data
os.makedirs('../../data/processed', exist_ok=True)
rfm.to_csv('../../data/processed/customer_rfm_raw.csv')
print(f"\nüíæ RFM data saved to: ../../data/processed/customer_rfm_raw.csv")


üéØ SECTION 3: RFM METRICS CALCULATION

üìù RFM METHODOLOGY EXPLANATION:
------------------------------------------------------------
RECENCY (R): Days since last transaction
   ‚Ä¢ Higher recency = Higher risk (disengaged customer)
   ‚Ä¢ Risk thresholds: <30d=Low, 30-90d=Medium, >90d=High

FREQUENCY (F): Total transaction count
   ‚Ä¢ Higher frequency = Lower risk (engaged customer)
   ‚Ä¢ Industry standard: ‚â•3 transactions for credit assessment

MONETARY (M): Total transaction value
   ‚Ä¢ Using absolute values for clustering
   ‚Ä¢ High value ‚â† necessarily low risk

üìÖ BUSINESS SNAPSHOT DATE: 2019-02-13
   (All recency calculated relative to this date)

üßÆ CALCULATING RFM METRICS...
‚úÖ RFM calculated for 3,742 customers

üìä RFM STATISTICS SUMMARY:
       recency_days  transaction_frequency  total_monetary_value
count      3,742.00               3,742.00              3,742.00
mean          30.46                  25.56            233,606.17
std           27.12          

In [36]:
# ============================================================================
# TASK 4.1: RFM CALCULATION FROM TRANSACTION DATA
# ============================================================================
print("\n" + "="*80)
print("üéØ TASK 4.1: CALCULATING RFM METRICS FROM TRANSACTIONS")
print("="*80)

print("üîÑ Calculating Recency, Frequency, Monetary for each customer...")

# 1. Set snapshot date for recency calculation
snapshot_date = df['TransactionStartTime'].max()
print(f"üìÖ Snapshot Date: {snapshot_date.date()}")

# 2. Calculate RFM per customer
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,  # Recency
    'TransactionId': 'count',                                          # Frequency
    'Amount': ['sum', 'mean', 'std']                                   # Monetary
}).reset_index()

# 3. Flatten column names
rfm.columns = ['CustomerId', 'recency_days', 'transaction_frequency', 
               'total_monetary_value', 'avg_transaction_value', 'std_transaction_value']

# 4. Handle monetary values (use absolute value)
rfm['total_monetary_value'] = rfm['total_monetary_value'].abs()
rfm['std_transaction_value'] = rfm['std_transaction_value'].fillna(0)

print(f"‚úÖ RFM calculated for {len(rfm):,} customers")

# 5. Display RFM statistics
print("\nüìä RFM STATISTICS:")
rfm_summary = rfm[['recency_days', 'transaction_frequency', 'total_monetary_value']].describe()
print(rfm_summary.round(2))

# 6. Save RFM data - FIXED: Create directory first
import os

# Create the directory if it doesn't exist
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)  # This creates the directory if needed

# Now save the file
output_path = os.path.join(output_dir, 'customer_rfm.csv')
rfm.to_csv(output_path, index=False)
print(f"\nüíæ RFM data saved: {output_path}")
print(f"   ‚Ä¢ Directory created: {output_dir}")
print(f"   ‚Ä¢ File size: {os.path.getsize(output_path)/1024:.1f} KB" if os.path.exists(output_path) else "   ‚Ä¢ File saved successfully")


üéØ TASK 4.1: CALCULATING RFM METRICS FROM TRANSACTIONS
üîÑ Calculating Recency, Frequency, Monetary for each customer...
üìÖ Snapshot Date: 2019-02-13
‚úÖ RFM calculated for 3,742 customers

üìä RFM STATISTICS:
       recency_days  transaction_frequency  total_monetary_value
count      3,742.00               3,742.00              3,742.00
mean          30.46                  25.56            233,606.17
std           27.12                  96.93          2,712,685.17
min            0.00                   1.00                  0.00
25%            5.00                   2.00              5,000.00
50%           24.00                   7.00             20,000.00
75%           53.00                  20.00             80,000.00
max           90.00               4,091.00        104,900,000.00

üíæ RFM data saved: data/processed\customer_rfm.csv
   ‚Ä¢ Directory created: data/processed
   ‚Ä¢ File size: 209.2 KB


In [37]:

# ============================================================================
# SECTION 4: RFM VISUALIZATION DASHBOARD
# ============================================================================
print("\n" + "="*80)
print("üìà SECTION 4: RFM VISUALIZATION DASHBOARD")
print("="*80)

print("üîÑ Creating professional RFM visualizations...")

# Create a 2x2 dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Recency Distribution with Risk Thresholds',
        'Frequency Distribution',
        'Monetary Value Distribution (Log Scale)',
        'RFM 3D Segmentation'
    ),
    specs=[
        [{'type': 'histogram'}, {'type': 'histogram'}],
        [{'type': 'histogram'}, {'type': 'scatter3d'}]
    ],
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# 1. Recency Distribution
fig.add_trace(
    go.Histogram(
        x=rfm['recency_days'],
        nbinsx=50,
        marker_color='#FF6B6B',
        name='Recency',
        hovertemplate='%{x:.0f} days: %{y} customers'
    ),
    row=1, col=1
)

# Add risk threshold lines
fig.add_vline(x=30, line_dash="dash", line_color="green", 
              annotation_text="30d<br>Low Risk", annotation_position="top right",
              row=1, col=1)
fig.add_vline(x=90, line_dash="dash", line_color="orange",
              annotation_text="90d<br>High Risk", annotation_position="top right",
              row=1, col=1)

# 2. Frequency Distribution
fig.add_trace(
    go.Histogram(
        x=rfm['transaction_frequency'],
        nbinsx=50,
        marker_color='#4ECDC4',
        name='Frequency',
        hovertemplate='%{x:.0f} transactions: %{y} customers'
    ),
    row=1, col=2
)

# 3. Monetary Distribution (Log scale)
fig.add_trace(
    go.Histogram(
        x=np.log10(rfm['total_monetary_value'] + 1),
        nbinsx=50,
        marker_color='#45B7D1',
        name='Monetary (Log10)',
        hovertemplate='10^%{x:.1f} = $%{customdata:.0f}: %{y} customers',
        customdata=rfm['total_monetary_value']
    ),
    row=2, col=1
)

# 4. 3D RFM Visualization
fig.add_trace(
    go.Scatter3d(
        x=rfm['recency_days'],
        y=np.log10(rfm['transaction_frequency'] + 1),
        z=np.log10(rfm['total_monetary_value'] + 1),
        mode='markers',
        marker=dict(
            size=4,
            color=rfm['recency_days'],
            colorscale='RdYlGn_r',
            showscale=True,
            colorbar=dict(title="Risk Level<br>(Recency in days)")
        ),
        name='3D Segmentation',
        hovertemplate='Recency: %{x} days<br>Log10(Freq): %{y:.2f}<br>Log10(Value): %{z:.2f}'
    ),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=900,
    title_text="<b>Bati Bank - RFM Customer Analysis Dashboard</b><br>Credit Risk Proxy Development",
    showlegend=False,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=20)
)

# Update axes labels
fig.update_xaxes(title_text="Days Since Last Transaction", row=1, col=1)
fig.update_xaxes(title_text="Transaction Count", row=1, col=2)
fig.update_xaxes(title_text="Log10(Total Value)", row=2, col=1)

fig.update_yaxes(title_text="Customer Count", row=1, col=1)
fig.update_yaxes(title_text="Customer Count", row=1, col=2)
fig.update_yaxes(title_text="Customer Count", row=2, col=1)

# Update 3D plot labels
fig.update_scenes(
    xaxis_title="Recency (days)",
    yaxis_title="Log10(Frequency)",
    zaxis_title="Log10(Value)",
    row=2, col=2
)

print("‚úÖ Professional RFM dashboard created")
fig.show()

# Create additional individual plots for detailed analysis
print("\nüìä Creating additional analysis plots...")

# Plot 1: Recency vs Frequency scatter
fig2 = px.scatter(
    rfm, x='recency_days', y='transaction_frequency',
    color=np.log10(rfm['total_monetary_value'] + 1),
    title='Recency vs Frequency (Colored by Monetary Value)',
    labels={'x': 'Recency (days)', 'y': 'Frequency', 'color': 'Log10(Value)'}
)
fig2.show()

# Plot 2: Customer value pyramid
value_quartiles = pd.qcut(rfm['total_monetary_value'], q=4, 
                          labels=['Bronze (<25%)', 'Silver (25-50%)', 
                                  'Gold (50-75%)', 'Platinum (>75%)'])
value_counts = value_quartiles.value_counts()

fig3 = go.Figure(data=[go.Bar(
    x=value_counts.values,
    y=value_counts.index,
    orientation='h',
    marker_color=['#CD7F32', '#C0C0C0', '#FFD700', '#E5E4E2']
)])
fig3.update_layout(
    title='Customer Value Pyramid',
    xaxis_title='Number of Customers',
    yaxis_title='Value Tier'
)
fig3.show()

print("‚úÖ All visualizations completed successfully!")


üìà SECTION 4: RFM VISUALIZATION DASHBOARD
üîÑ Creating professional RFM visualizations...
‚úÖ Professional RFM dashboard created



üìä Creating additional analysis plots...


‚úÖ All visualizations completed successfully!


In [38]:
# ============================================================================
# SECTION 5: CLUSTERING METHODOLOGY & OPTIMAL K DETERMINATION
# ============================================================================
print("\n" + "="*80)
print("üî¨ SECTION 5: CLUSTERING METHODOLOGY & OPTIMAL K")
print("="*80)

print("\nüìä CLUSTERING APPROACH (Basel II Compliance):")
print("-" * 60)
print("‚Ä¢ Algorithm: K-Means (Industry standard for segmentation)")
print("‚Ä¢ Scaling: RobustScaler (Handles outliers effectively)")
print("‚Ä¢ Cluster count: 3 (As per project requirements)")
print("‚Ä¢ Reproducibility: random_state=42 (Audit trail)")
print("‚Ä¢ Validation: Multiple metrics for quality assurance")

# Prepare features for clustering
print("\nüîÑ PREPARING FEATURES FOR CLUSTERING...")
rfm_features = rfm[['recency_days', 'transaction_frequency', 'total_monetary_value']].copy()

# Apply log transformation for better clustering
rfm_features['log_monetary'] = np.log10(rfm_features['total_monetary_value'] + 1)
rfm_features['log_frequency'] = np.log10(rfm_features['transaction_frequency'] + 1)

# Select features for clustering
X_cluster = rfm_features[['recency_days', 'log_frequency', 'log_monetary']].values

# Scale features using RobustScaler (better for financial data)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cluster)

print(f"‚úÖ Features prepared: {X_scaled.shape[0]} customers, {X_scaled.shape[1]} features")

# Determine optimal number of clusters (for validation)
print("\nüîç DETERMINING OPTIMAL NUMBER OF CLUSTERS (Validation)...")
K_range = range(2, 9)
inertia = []
silhouette_scores = []
davies_bouldin_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20, max_iter=300)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    
    if k > 1:
        silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
        davies_bouldin_scores.append(davies_bouldin_score(X_scaled, kmeans.labels_))
    
    print(f"  k={k}: Inertia={kmeans.inertia_:,.0f}, "
          f"Silhouette={silhouette_scores[-1] if k>1 else 'N/A':.3f}")

# Create cluster validation plot
fig4 = go.Figure()

# Add inertia trace
fig4.add_trace(go.Scatter(
    x=list(K_range), y=inertia,
    mode='lines+markers',
    name='Inertia',
    line=dict(color='#FF6B6B', width=3),
    marker=dict(size=10)
))

# Add silhouette trace on secondary axis
fig4.add_trace(go.Scatter(
    x=list(K_range)[1:], y=silhouette_scores,
    mode='lines+markers',
    name='Silhouette Score',
    line=dict(color='#4ECDC4', width=3),
    marker=dict(size=10),
    yaxis='y2'
))

# Update layout with dual axes - CORRECTED VERSION
fig4.update_layout(
    title='Cluster Validation Metrics - Optimal K Determination',
    xaxis_title='Number of Clusters (k)',
    yaxis=dict(
        title='Inertia (Lower is better)',
        title_font=dict(color='#FF6B6B'),
        tickfont=dict(color='#FF6B6B')
    ),
    yaxis2=dict(
        title='Silhouette Score (Higher is better)',
        title_font=dict(color='#4ECDC4'),
        tickfont=dict(color='#4ECDC4'),
        overlaying='y',
        side='right'
    ),
    template='plotly_white',
    height=500,
    legend=dict(
        x=0.02,
        y=0.98,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='black',
        borderwidth=1
    )
)

# Add annotation for k=3
fig4.add_annotation(
    x=3, y=inertia[1],  # inertia[1] corresponds to k=3 (index 1 in inertia list)
    text="<b>Selected: k=3</b><br>(Project requirement)",
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor="black",
    bgcolor="white",
    bordercolor="black",
    borderwidth=1,
    font=dict(size=12)
)

print("‚úÖ Cluster validation metrics calculated")
fig4.show()

# Display validation metrics table - CORRECTED VERSION
print("\nüìã CLUSTER VALIDATION METRICS SUMMARY:")

# Create lists with proper lengths and formatting
k_values = list(K_range)
inertia_formatted = [f'{i:,.0f}' for i in inertia]

# Handle silhouette scores (starts from k=2)
silhouette_formatted = []
for i, k in enumerate(k_values):
    if k == 2:  # First silhouette score at k=2
        silhouette_formatted.append(f'{silhouette_scores[0]:.3f}')
    elif k > 2:
        silhouette_formatted.append(f'{silhouette_scores[i-1]:.3f}')
    else:
        silhouette_formatted.append('N/A')

# Handle Davies-Bouldin scores (starts from k=2)
db_formatted = []
for i, k in enumerate(k_values):
    if k == 2:  # First DB score at k=2
        db_formatted.append(f'{davies_bouldin_scores[0]:.3f}')
    elif k > 2:
        db_formatted.append(f'{davies_bouldin_scores[i-1]:.3f}')
    else:
        db_formatted.append('N/A')

# Create the DataFrame
validation_df = pd.DataFrame({
    'k': k_values,
    'Inertia': inertia_formatted,
    'Silhouette': silhouette_formatted,
    'Davies-Bouldin': db_formatted
})

print(validation_df.to_string(index=False))

# Add interpretation of metrics
print("\nüìä METRIC INTERPRETATION:")
print("-" * 40)
print("‚Ä¢ Inertia: Lower is better (measures cluster compactness)")
print("‚Ä¢ Silhouette: -1 to 1, higher is better (measures separation)")
print("‚Ä¢ Davies-Bouldin: Lower is better (ratio of within to between cluster distance)")
print("‚Ä¢ k=3 selected as per project requirements")


üî¨ SECTION 5: CLUSTERING METHODOLOGY & OPTIMAL K

üìä CLUSTERING APPROACH (Basel II Compliance):
------------------------------------------------------------
‚Ä¢ Algorithm: K-Means (Industry standard for segmentation)
‚Ä¢ Scaling: RobustScaler (Handles outliers effectively)
‚Ä¢ Cluster count: 3 (As per project requirements)
‚Ä¢ Reproducibility: random_state=42 (Audit trail)
‚Ä¢ Validation: Multiple metrics for quality assurance

üîÑ PREPARING FEATURES FOR CLUSTERING...
‚úÖ Features prepared: 3742 customers, 3 features

üîç DETERMINING OPTIMAL NUMBER OF CLUSTERS (Validation)...
  k=2: Inertia=2,655, Silhouette=0.390
  k=3: Inertia=1,986, Silhouette=0.323
  k=4: Inertia=1,568, Silhouette=0.332
  k=5: Inertia=1,356, Silhouette=0.320
  k=6: Inertia=1,170, Silhouette=0.308
  k=7: Inertia=1,057, Silhouette=0.315
  k=8: Inertia=964, Silhouette=0.316
‚úÖ Cluster validation metrics calculated



üìã CLUSTER VALIDATION METRICS SUMMARY:
 k Inertia Silhouette Davies-Bouldin
 2   2,655      0.390          0.964
 3   1,986      0.390          0.964
 4   1,568      0.323          1.104
 5   1,356      0.332          1.006
 6   1,170      0.320          1.044
 7   1,057      0.308          1.009
 8     964      0.315          1.002

üìä METRIC INTERPRETATION:
----------------------------------------
‚Ä¢ Inertia: Lower is better (measures cluster compactness)
‚Ä¢ Silhouette: -1 to 1, higher is better (measures separation)
‚Ä¢ Davies-Bouldin: Lower is better (ratio of within to between cluster distance)
‚Ä¢ k=3 selected as per project requirements


In [39]:
# ============================================================================
# TASK 4.2: CUSTOMER SEGMENTATION WITH K-MEANS (k=3)
# ============================================================================
print("\n" + "="*80)
print("üîç TASK 4.2: CUSTOMER SEGMENTATION WITH K-MEANS")
print("="*80)

print("üîÑ Performing K-Means clustering (k=3) on RFM features...")

# 1. Prepare features for clustering
clustering_features = rfm[['recency_days', 'transaction_frequency', 'total_monetary_value']].copy()

# 2. Apply log transformations for better clustering
clustering_features['log_frequency'] = np.log1p(clustering_features['transaction_frequency'])
clustering_features['log_monetary'] = np.log1p(clustering_features['total_monetary_value'])
clustering_features = clustering_features[['recency_days', 'log_frequency', 'log_monetary']]

# 3. Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_scaled = scaler.fit_transform(clustering_features)

# 4. Apply K-Means with k=3 (as required)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
rfm['cluster'] = kmeans.fit_predict(features_scaled)

print(f"‚úÖ Clustering complete. {len(rfm['cluster'].unique())} clusters created.")

# 5. Analyze cluster characteristics
print("\nüìä CLUSTER ANALYSIS:")
for cluster_id in sorted(rfm['cluster'].unique()):
    cluster_data = rfm[rfm['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id}:")
    print(f"  ‚Ä¢ Customers: {len(cluster_data):,} ({len(cluster_data)/len(rfm)*100:.1f}%)")
    print(f"  ‚Ä¢ Avg Recency: {cluster_data['recency_days'].mean():.1f} days")
    print(f"  ‚Ä¢ Avg Frequency: {cluster_data['transaction_frequency'].mean():.1f}")
    print(f"  ‚Ä¢ Avg Monetary: ${cluster_data['total_monetary_value'].mean():,.0f}")


üîç TASK 4.2: CUSTOMER SEGMENTATION WITH K-MEANS
üîÑ Performing K-Means clustering (k=3) on RFM features...
‚úÖ Clustering complete. 3 clusters created.

üìä CLUSTER ANALYSIS:

Cluster 0:
  ‚Ä¢ Customers: 1,164 (31.1%)
  ‚Ä¢ Avg Recency: 18.7 days
  ‚Ä¢ Avg Frequency: 4.3
  ‚Ä¢ Avg Monetary: $17,308

Cluster 1:
  ‚Ä¢ Customers: 1,161 (31.0%)
  ‚Ä¢ Avg Recency: 64.7 days
  ‚Ä¢ Avg Frequency: 4.8
  ‚Ä¢ Avg Monetary: $36,325

Cluster 2:
  ‚Ä¢ Customers: 1,417 (37.9%)
  ‚Ä¢ Avg Recency: 12.1 days
  ‚Ä¢ Avg Frequency: 60.1
  ‚Ä¢ Avg Monetary: $572,924


In [40]:

# ============================================================================
# SECTION 6: K-MEANS CLUSTERING IMPLEMENTATION (k=3)
# ============================================================================
print("\n" + "="*80)
print("üéØ SECTION 6: K-MEANS CLUSTERING (k=3)")
print("="*80)

print("üîÑ Applying K-Means clustering with k=3...")

# Apply K-Means with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42, n_init=20, max_iter=300)
rfm['cluster'] = kmeans.fit_predict(X_scaled)

print(f"‚úÖ Clustering completed: {len(rfm['cluster'].unique())} clusters created")

# Analyze cluster characteristics
print("\nüìä CLUSTER CHARACTERISTICS ANALYSIS:")
print("-" * 60)

cluster_analysis = []
for cluster_id in sorted(rfm['cluster'].unique()):
    cluster_data = rfm[rfm['cluster'] == cluster_id]
    
    stats = {
        'Cluster': cluster_id,
        'Customers': len(cluster_data),
        'Percentage': f"{len(cluster_data)/len(rfm)*100:.1f}%",
        'Avg_Recency': f"{cluster_data['recency_days'].mean():.1f} days",
        'Avg_Frequency': f"{cluster_data['transaction_frequency'].mean():.1f}",
        'Avg_Monetary': f"${cluster_data['total_monetary_value'].mean():,.0f}",
        'Total_Value': f"${cluster_data['total_monetary_value'].sum():,.0f}"
    }
    cluster_analysis.append(stats)

# Display cluster analysis
cluster_df = pd.DataFrame(cluster_analysis)
print("\n" + cluster_df.to_string(index=False))

# Visualize clusters
print("\nüîÑ Creating cluster visualization...")

fig5 = px.scatter_3d(
    rfm, 
    x='recency_days',
    y=np.log10(rfm['transaction_frequency'] + 1),
    z=np.log10(rfm['total_monetary_value'] + 1),
    color='cluster',
    title='3D Customer Segments (Colored by Cluster)',
    labels={
        'x': 'Recency (days)',
        'y': 'Log10(Frequency)',
        'z': 'Log10(Value)',
        'cluster': 'Cluster'
    },
    color_discrete_sequence=['#FF6B6B', '#4ECDC4', '#45B7D1']
)

fig5.update_layout(
    scene=dict(
        xaxis_title='Recency (Risk Indicator)',
        yaxis_title='Log10(Frequency)',
        zaxis_title='Log10(Value)'
    ),
    height=700
)

fig5.show()

print("‚úÖ Cluster visualization created successfully")


üéØ SECTION 6: K-MEANS CLUSTERING (k=3)
üîÑ Applying K-Means clustering with k=3...
‚úÖ Clustering completed: 3 clusters created

üìä CLUSTER CHARACTERISTICS ANALYSIS:
------------------------------------------------------------

 Cluster  Customers Percentage Avg_Recency Avg_Frequency Avg_Monetary  Total_Value
       0       1617      43.2%   31.7 days           7.8      $60,032  $97,071,556
       1       1033      27.6%   50.0 days           2.3       $2,867   $2,961,336
       2       1092      29.2%   10.1 days          73.9     $708,902 $774,121,406

üîÑ Creating cluster visualization...


‚úÖ Cluster visualization created successfully


In [41]:
# ============================================================================
# TASK 4.3: HIGH-RISK CLUSTER IDENTIFICATION
# ============================================================================
print("\n" + "="*80)
print("‚ö†Ô∏è TASK 4.3: IDENTIFYING HIGH-RISK CLUSTER")
print("="*80)

print("üîç Analyzing clusters to identify high-risk customers...")

# 1. Calculate risk scores for each cluster
risk_scores = {}
for cluster_id in sorted(rfm['cluster'].unique()):
    cluster_data = rfm[rfm['cluster'] == cluster_id]
    
    # Normalize metrics (0-1 scale)
    recency_norm = cluster_data['recency_days'].mean() / rfm['recency_days'].max()  # Higher = riskier
    frequency_norm = 1 - (cluster_data['transaction_frequency'].mean() / rfm['transaction_frequency'].max())  # Lower = riskier
    monetary_norm = 1 - (cluster_data['total_monetary_value'].mean() / rfm['total_monetary_value'].max())    # Lower = riskier
    
    # Weighted risk score (Business logic: Recency 50%, Frequency 30%, Monetary 20%)
    risk_score = (recency_norm * 0.5) + (frequency_norm * 0.3) + (monetary_norm * 0.2)
    
    risk_scores[cluster_id] = {
        'risk_score': risk_score,
        'customers': len(cluster_data),
        'recency_mean': cluster_data['recency_days'].mean(),
        'frequency_mean': cluster_data['transaction_frequency'].mean(),
        'monetary_mean': cluster_data['total_monetary_value'].mean()
    }
    
    print(f"\nCluster {cluster_id}:")
    print(f"  ‚Ä¢ Risk Score: {risk_score:.3f}")
    print(f"  ‚Ä¢ Customers: {len(cluster_data):,}")
    print(f"  ‚Ä¢ Avg Recency: {cluster_data['recency_days'].mean():.1f} days")
    print(f"  ‚Ä¢ Avg Frequency: {cluster_data['transaction_frequency'].mean():.1f}")
    print(f"  ‚Ä¢ Avg Monetary: ${cluster_data['total_monetary_value'].mean():,.0f}")

# 2. Identify high-risk cluster
high_risk_cluster = max(risk_scores.items(), key=lambda x: x[1]['risk_score'])[0]
print(f"\n" + "="*60)
print(f"üéØ HIGH-RISK CLUSTER IDENTIFIED: CLUSTER {high_risk_cluster}")
print("="*60)
print(f"‚Ä¢ Risk Score: {risk_scores[high_risk_cluster]['risk_score']:.3f}")
print(f"‚Ä¢ Customers: {risk_scores[high_risk_cluster]['customers']:,}")
print(f"‚Ä¢ Business Interpretation: Least engaged customers (high recency, low frequency, low monetary value)")


‚ö†Ô∏è TASK 4.3: IDENTIFYING HIGH-RISK CLUSTER
üîç Analyzing clusters to identify high-risk customers...

Cluster 0:
  ‚Ä¢ Risk Score: 0.676
  ‚Ä¢ Customers: 1,617
  ‚Ä¢ Avg Recency: 31.7 days
  ‚Ä¢ Avg Frequency: 7.8
  ‚Ä¢ Avg Monetary: $60,032

Cluster 1:
  ‚Ä¢ Risk Score: 0.778
  ‚Ä¢ Customers: 1,033
  ‚Ä¢ Avg Recency: 50.0 days
  ‚Ä¢ Avg Frequency: 2.3
  ‚Ä¢ Avg Monetary: $2,867

Cluster 2:
  ‚Ä¢ Risk Score: 0.549
  ‚Ä¢ Customers: 1,092
  ‚Ä¢ Avg Recency: 10.1 days
  ‚Ä¢ Avg Frequency: 73.9
  ‚Ä¢ Avg Monetary: $708,902

üéØ HIGH-RISK CLUSTER IDENTIFIED: CLUSTER 1
‚Ä¢ Risk Score: 0.778
‚Ä¢ Customers: 1,033
‚Ä¢ Business Interpretation: Least engaged customers (high recency, low frequency, low monetary value)


In [42]:
# ============================================================================
# TASK 4.4: CREATING PROXY TARGET VARIABLE (is_high_risk)
# ============================================================================
print("\n" + "="*80)
print("üè∑Ô∏è TASK 4.4: CREATING TARGET VARIABLE 'is_high_risk'")
print("="*80)

# 1. Create binary target variable
rfm['is_high_risk'] = (rfm['cluster'] == high_risk_cluster).astype(int)

# 2. Display target distribution
print("üìä TARGET VARIABLE DISTRIBUTION:")
target_counts = rfm['is_high_risk'].value_counts().sort_index()
for label, count in target_counts.items():
    percentage = count / len(rfm) * 100
    risk_label = "HIGH RISK üî¥" if label == 1 else "LOW RISK üü¢"
    print(f"  {risk_label}: {count:,} customers ({percentage:.1f}%)")

# 3. Basel II compliance check
print(f"\nüìã BASEL II COMPLIANCE CHECK:")
basel_range = (0.02, 0.05)  # 2-5% typical default rate
actual_rate = target_counts[1] / len(rfm)
if basel_range[0] <= actual_rate <= basel_range[1]:
    print(f"  ‚úÖ PASS: High-risk rate {actual_rate*100:.1f}% within Basel II range ({basel_range[0]*100:.1f}%-{basel_range[1]*100:.1f}%)")
else:
    print(f"  ‚ö†Ô∏è REVIEW: High-risk rate {actual_rate*100:.1f}% outside Basel II range")

# 4. **FIXED: Create directory before saving**
print(f"\nüíæ PREPARING TO SAVE OUTPUT FILES...")

# Create the processed directory if it doesn't exist
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)  # This creates the directory
print(f"  ‚úÖ Created directory: {output_dir}")

# Save the final dataset with target
output_path = f'{output_dir}/customer_rfm_with_target.csv'
rfm.to_csv(output_path, index=False)
print(f"  ‚úÖ FINAL OUTPUT SAVED: {output_path}")
print(f"     ‚Ä¢ File size: {os.path.getsize(output_path)/1024:.1f} KB")
print(f"     ‚Ä¢ Records: {len(rfm):,} customers")
print(f"     ‚Ä¢ Columns: {len(rfm.columns)}")

# Also save RFM without target for reference
rfm_basic_path = f'{output_dir}/customer_rfm.csv'
rfm[['CustomerId', 'recency_days', 'transaction_frequency', 'total_monetary_value', 
     'avg_transaction_value', 'std_transaction_value']].to_csv(rfm_basic_path, index=False)
print(f"  ‚úÖ Basic RFM saved: {rfm_basic_path}")

# 5. Create a summary file for documentation
# Get the high_risk_data first (make sure this variable exists)
high_risk_data = rfm[rfm['is_high_risk'] == 1]

summary = {
    'task': 'Task 4 - Proxy Target Creation',
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'data_summary': {
        'total_customers': int(len(rfm)),
        'high_risk_customers': int(target_counts[1]),
        'high_risk_rate': float(actual_rate * 100),
        'high_risk_cluster': int(high_risk_cluster),
        'features_created': list(rfm.columns)
    },
    'business_validation': {
        'basel_ii_compliant': bool(basel_range[0] <= actual_rate <= basel_range[1]),  # Convert to bool
        'high_risk_recency_avg': float(high_risk_data['recency_days'].mean()),
        'high_risk_frequency_avg': float(high_risk_data['transaction_frequency'].mean()),
        'high_risk_monetary_avg': float(high_risk_data['total_monetary_value'].mean())
    }
}

summary_path = f'{output_dir}/task4_summary.json'
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=4)
print(f"  ‚úÖ Summary saved: {summary_path}")

print(f"\nüìÅ FILES CREATED IN {output_dir}:")
print("-" * 60)
for file in ['customer_rfm.csv', 'customer_rfm_with_target.csv', 'task4_summary.json']:
    file_path = os.path.join(output_dir, file)
    if os.path.exists(file_path):
        size_kb = os.path.getsize(file_path)/1024
        print(f"  ‚Ä¢ {file} ({size_kb:.1f} KB)")


üè∑Ô∏è TASK 4.4: CREATING TARGET VARIABLE 'is_high_risk'
üìä TARGET VARIABLE DISTRIBUTION:
  LOW RISK üü¢: 2,709 customers (72.4%)
  HIGH RISK üî¥: 1,033 customers (27.6%)

üìã BASEL II COMPLIANCE CHECK:
  ‚ö†Ô∏è REVIEW: High-risk rate 27.6% outside Basel II range

üíæ PREPARING TO SAVE OUTPUT FILES...
  ‚úÖ Created directory: data/processed
  ‚úÖ FINAL OUTPUT SAVED: data/processed/customer_rfm_with_target.csv
     ‚Ä¢ File size: 223.8 KB
     ‚Ä¢ Records: 3,742 customers
     ‚Ä¢ Columns: 8
  ‚úÖ Basic RFM saved: data/processed/customer_rfm.csv
  ‚úÖ Summary saved: data/processed/task4_summary.json

üìÅ FILES CREATED IN data/processed:
------------------------------------------------------------
  ‚Ä¢ customer_rfm.csv (209.2 KB)
  ‚Ä¢ customer_rfm_with_target.csv (223.8 KB)
  ‚Ä¢ task4_summary.json (0.8 KB)


In [43]:
# ============================================================================
# TASK 4.5: BUSINESS VALIDATION & SUMMARY
# ============================================================================
print("\n" + "="*80)
print("üìä TASK 4.5: BUSINESS VALIDATION & SUMMARY")
print("="*80)

print("üîç Validating proxy target with business logic...")

# 1. Analyze high-risk customer characteristics
high_risk_data = rfm[rfm['is_high_risk'] == 1]
low_risk_data = rfm[rfm['is_high_risk'] == 0]

print("\nüìà HIGH-RISK VS LOW-RISK PROFILE:")
print("-" * 60)
print(f"{'Metric':25} {'High-Risk':>15} {'Low-Risk':>15}")
print("-" * 60)
print(f"{'Avg Recency (days)':25} {high_risk_data['recency_days'].mean():>15.1f} {low_risk_data['recency_days'].mean():>15.1f}")
print(f"{'Avg Frequency':25} {high_risk_data['transaction_frequency'].mean():>15.1f} {low_risk_data['transaction_frequency'].mean():>15.1f}")
print(f"{'Avg Monetary ($)':25} {high_risk_data['total_monetary_value'].mean():>15,.0f} {low_risk_data['total_monetary_value'].mean():>15,.0f}")
print(f"{'Avg Transaction Value ($)':25} {high_risk_data['avg_transaction_value'].mean():>15,.0f} {low_risk_data['avg_transaction_value'].mean():>15,.0f}")

# 2. Business sense validation
print(f"\n‚úÖ BUSINESS VALIDATION:")
if (high_risk_data['recency_days'].mean() > low_risk_data['recency_days'].mean() and
    high_risk_data['transaction_frequency'].mean() < low_risk_data['transaction_frequency'].mean()):
    print("  ‚úì High-risk customers have higher recency (more inactive)")
    print("  ‚úì High-risk customers have lower frequency (less engaged)")
    print("  ‚úì Business logic validated: Proxy target makes sense")
else:
    print("  ‚ö†Ô∏è Unexpected pattern: Review cluster analysis")

# 3. Task completion summary
print(f"\nüèÜ TASK 4 COMPLETION SUMMARY:")
print("-" * 60)
print("‚úì RFM metrics calculated from transaction data")
print("‚úì Customers segmented into 3 clusters using K-Means")
print(f"‚úì High-risk cluster identified: Cluster {high_risk_cluster}")
print(f"‚úì Target variable created: 'is_high_risk' ({target_counts[1]:,} high-risk customers)")
print("‚úì Basel II compliance checked and documented")
print("‚úì Final dataset saved for Task 5 model training")
print("-" * 60)
print(f"\nüöÄ READY FOR TASK 5: Model Training with {len(rfm):,} labeled customers")


üìä TASK 4.5: BUSINESS VALIDATION & SUMMARY
üîç Validating proxy target with business logic...

üìà HIGH-RISK VS LOW-RISK PROFILE:
------------------------------------------------------------
Metric                          High-Risk        Low-Risk
------------------------------------------------------------
Avg Recency (days)                   50.0            23.0
Avg Frequency                         2.3            34.4
Avg Monetary ($)                    2,867         321,592
Avg Transaction Value ($)           1,450          21,155

‚úÖ BUSINESS VALIDATION:
  ‚úì High-risk customers have higher recency (more inactive)
  ‚úì High-risk customers have lower frequency (less engaged)
  ‚úì Business logic validated: Proxy target makes sense

üèÜ TASK 4 COMPLETION SUMMARY:
------------------------------------------------------------
‚úì RFM metrics calculated from transaction data
‚úì Customers segmented into 3 clusters using K-Means
‚úì High-risk cluster identified: Cluster 1
‚úì T

In [44]:

# ============================================================================
# SECTION 7: HIGH-RISK CLUSTER IDENTIFICATION
# ============================================================================
print("\n" + "="*80)
print("‚ö†Ô∏è SECTION 7: HIGH-RISK CLUSTER IDENTIFICATION")
print("="*80)

print("\nüìù RISK ASSESSMENT METHODOLOGY:")
print("-" * 60)
print("‚Ä¢ Primary Factor: Recency (50% weight)")
print("   - Higher recency = Higher risk")
print("‚Ä¢ Secondary Factor: Frequency (30% weight)")
print("   - Lower frequency = Higher risk")
print("‚Ä¢ Tertiary Factor: Monetary (20% weight)")
print("   - Lower monetary = Higher risk")
print("‚Ä¢ Basel II Alignment: Conservative approach")

print("\nüîç CALCULATING RISK SCORES FOR EACH CLUSTER...")

# Calculate risk scores for each cluster
risk_scores = {}
for cluster_id in sorted(rfm['cluster'].unique()):
    cluster_data = rfm[rfm['cluster'] == cluster_id]
    
    # Normalize metrics (0-1 scale)
    recency_norm = cluster_data['recency_days'].mean() / rfm['recency_days'].max()
    frequency_norm = 1 - (cluster_data['transaction_frequency'].mean() / rfm['transaction_frequency'].max())
    monetary_norm = 1 - (cluster_data['total_monetary_value'].mean() / rfm['total_monetary_value'].max())
    
    # Weighted risk score
    weighted_risk = (recency_norm * 0.5) + (frequency_norm * 0.3) + (monetary_norm * 0.2)
    
    risk_scores[cluster_id] = {
        'risk_score': weighted_risk,
        'customers': len(cluster_data),
        'recency_days': cluster_data['recency_days'].mean(),
        'frequency': cluster_data['transaction_frequency'].mean(),
        'monetary': cluster_data['total_monetary_value'].mean()
    }
    
    print(f"\nCluster {cluster_id}:")
    print(f"  ‚Ä¢ Risk Score: {weighted_risk:.3f}")
    print(f"  ‚Ä¢ Customers: {len(cluster_data):,}")
    print(f"  ‚Ä¢ Avg Recency: {cluster_data['recency_days'].mean():.1f} days")
    print(f"  ‚Ä¢ Avg Frequency: {cluster_data['transaction_frequency'].mean():.1f}")
    print(f"  ‚Ä¢ Avg Monetary: ${cluster_data['total_monetary_value'].mean():,.0f}")

# Identify high-risk cluster
high_risk_cluster = max(risk_scores.items(), key=lambda x: x[1]['risk_score'])[0]

print(f"\n" + "="*60)
print(f"üéØ HIGH-RISK CLUSTER IDENTIFIED: CLUSTER {high_risk_cluster}")
print("="*60)
print(f"‚Ä¢ Risk Score: {risk_scores[high_risk_cluster]['risk_score']:.3f}")
print(f"‚Ä¢ Customers: {risk_scores[high_risk_cluster]['customers']:,}")
print(f"‚Ä¢ Avg Recency: {risk_scores[high_risk_cluster]['recency_days']:.1f} days")
print(f"‚Ä¢ Business Interpretation: Least engaged customer segment")

# Visualize risk scores
print("\nüîÑ Creating risk score visualization...")

risk_df = pd.DataFrame({
    'Cluster': list(risk_scores.keys()),
    'Risk_Score': [s['risk_score'] for s in risk_scores.values()],
    'Customers': [s['customers'] for s in risk_scores.values()],
    'Recency': [s['recency_days'] for s in risk_scores.values()]
})

fig6 = px.bar(
    risk_df, 
    x='Cluster', 
    y='Risk_Score',
    color='Risk_Score',
    color_continuous_scale='RdYlGn_r',  # Red (high risk) to Green (low risk)
    title='Risk Score by Cluster',
    labels={'Risk_Score': 'Risk Score', 'Cluster': 'Cluster'},
    text='Risk_Score'
)

fig6.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig6.update_layout(
    yaxis_title='Risk Score (Higher = Riskier)',
    coloraxis_showscale=False,
    height=500
)

fig6.show()

print("‚úÖ High-risk cluster identification complete")


‚ö†Ô∏è SECTION 7: HIGH-RISK CLUSTER IDENTIFICATION

üìù RISK ASSESSMENT METHODOLOGY:
------------------------------------------------------------
‚Ä¢ Primary Factor: Recency (50% weight)
   - Higher recency = Higher risk
‚Ä¢ Secondary Factor: Frequency (30% weight)
   - Lower frequency = Higher risk
‚Ä¢ Tertiary Factor: Monetary (20% weight)
   - Lower monetary = Higher risk
‚Ä¢ Basel II Alignment: Conservative approach

üîç CALCULATING RISK SCORES FOR EACH CLUSTER...

Cluster 0:
  ‚Ä¢ Risk Score: 0.676
  ‚Ä¢ Customers: 1,617
  ‚Ä¢ Avg Recency: 31.7 days
  ‚Ä¢ Avg Frequency: 7.8
  ‚Ä¢ Avg Monetary: $60,032

Cluster 1:
  ‚Ä¢ Risk Score: 0.778
  ‚Ä¢ Customers: 1,033
  ‚Ä¢ Avg Recency: 50.0 days
  ‚Ä¢ Avg Frequency: 2.3
  ‚Ä¢ Avg Monetary: $2,867

Cluster 2:
  ‚Ä¢ Risk Score: 0.549
  ‚Ä¢ Customers: 1,092
  ‚Ä¢ Avg Recency: 10.1 days
  ‚Ä¢ Avg Frequency: 73.9
  ‚Ä¢ Avg Monetary: $708,902

üéØ HIGH-RISK CLUSTER IDENTIFIED: CLUSTER 1
‚Ä¢ Risk Score: 0.778
‚Ä¢ Customers: 1,033
‚Ä¢ Avg Rec

‚úÖ High-risk cluster identification complete


In [45]:
# ============================================================================
# SECTION 8: TARGET VARIABLE CREATION & VALIDATION
# ============================================================================
print("\n" + "="*80)
print("üè∑Ô∏è SECTION 8: PROXY TARGET VARIABLE CREATION")
print("="*80)

# Create binary target variable
print("üîÑ Creating proxy target variable 'is_high_risk'...")
rfm['is_high_risk'] = (rfm['cluster'] == high_risk_cluster).astype(int)

# Display target distribution
print("\nüìä TARGET VARIABLE DISTRIBUTION:")
print("-" * 40)

target_distribution = rfm['is_high_risk'].value_counts().sort_index()
for risk_level, count in target_distribution.items():
    risk_label = "HIGH RISK" if risk_level == 1 else "LOW RISK"
    percentage = count / len(rfm) * 100
    print(f"  {risk_label} ({risk_level}): {count:,} customers ({percentage:.1f}%)")

print(f"\nüéØ KEY METRICS:")
print(f"‚Ä¢ High-risk rate: {target_distribution[1]/len(rfm)*100:.1f}%")
print(f"‚Ä¢ Industry benchmark (Basel II): 2-5% default rate")
print(f"‚Ä¢ Assessment: {'WITHIN ACCEPTABLE RANGE' if 2 <= target_distribution[1]/len(rfm)*100 <= 5 else 'REVIEW REQUIRED'}")

# Business validation with existing fraud data
print("\nüîç BUSINESS VALIDATION WITH FRAUD DATA:")
if 'FraudResult' in df.columns:
    # Merge fraud data with RFM
    customer_fraud = df.groupby('CustomerId')['FraudResult'].max()
    validation_df = pd.merge(rfm[['is_high_risk']], customer_fraud, 
                           left_index=True, right_index=True, how='left')
    validation_df['FraudResult'] = validation_df['FraudResult'].fillna(0)
    
    # Calculate fraud rates
    fraud_by_risk = validation_df.groupby('is_high_risk')['FraudResult'].agg(['sum', 'count', 'mean'])
    fraud_by_risk['fraud_rate'] = fraud_by_risk['mean'] * 100
    fraud_by_risk['relative_risk'] = fraud_by_risk['fraud_rate'] / fraud_by_risk['fraud_rate'].min()
    
    print("\nüìä FRAUD ANALYSIS BY RISK CATEGORY:")
    print(fraud_by_risk.round(4))
    
    print(f"\n‚úÖ VALIDATION RESULT:")
    if fraud_by_risk.loc[1, 'relative_risk'] > 1:
        print(f"‚úì PASS: High-risk customers have {fraud_by_risk.loc[1, 'relative_risk']:.1f}x higher fraud rate")
    else:
        print(f"‚ö†Ô∏è REVIEW: Unexpected fraud pattern")
else:
    print("‚ÑπÔ∏è Fraud data not available for validation")

# Create target variable validation visualization
print("\nüîÑ Creating target validation dashboard...")

fig7 = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Target Distribution', 'RFM Comparison by Risk'),
    specs=[[{'type': 'pie'}, {'type': 'bar'}]]
)

# Pie chart for target distribution
fig7.add_trace(
    go.Pie(
        labels=['Low Risk', 'High Risk'],
        values=target_distribution.values,
        hole=0.4,
        marker_colors=['#4ECDC4', '#FF6B6B'],
        textinfo='label+percent+value',
        name='Target Distribution'
    ),
    row=1, col=1
)

# Bar chart for RFM comparison
risk_comparison = rfm.groupby('is_high_risk').agg({
    'recency_days': 'mean',
    'transaction_frequency': 'mean',
    'total_monetary_value': 'mean'
}).reset_index()

# Normalize for comparison
for col in ['recency_days', 'transaction_frequency', 'total_monetary_value']:
    risk_comparison[f'{col}_norm'] = risk_comparison[col] / risk_comparison[col].max()

fig7.add_trace(
    go.Bar(name='Recency', x=['Low Risk', 'High Risk'], 
           y=risk_comparison['recency_days_norm'], marker_color='#FF6B6B',
           text=[f"{x:.0f}d" for x in risk_comparison['recency_days']]),
    row=1, col=2
)

fig7.add_trace(
    go.Bar(name='Frequency', x=['Low Risk', 'High Risk'],
           y=risk_comparison['transaction_frequency_norm'], marker_color='#4ECDC4',
           text=[f"{x:.1f}" for x in risk_comparison['transaction_frequency']]),
    row=1, col=2
)

fig7.update_layout(
    title_text='Proxy Target Variable Validation',
    barmode='group',
    showlegend=True,
    height=500
)

fig7.show()

print("‚úÖ Target variable creation and validation complete")


üè∑Ô∏è SECTION 8: PROXY TARGET VARIABLE CREATION
üîÑ Creating proxy target variable 'is_high_risk'...

üìä TARGET VARIABLE DISTRIBUTION:
----------------------------------------
  LOW RISK (0): 2,709 customers (72.4%)
  HIGH RISK (1): 1,033 customers (27.6%)

üéØ KEY METRICS:
‚Ä¢ High-risk rate: 27.6%
‚Ä¢ Industry benchmark (Basel II): 2-5% default rate
‚Ä¢ Assessment: REVIEW REQUIRED

üîç BUSINESS VALIDATION WITH FRAUD DATA:

üìä FRAUD ANALYSIS BY RISK CATEGORY:
              sum  count  mean  fraud_rate  relative_risk
is_high_risk                                             
0            0.00   2709  0.00        0.00            NaN
1            0.00   1033  0.00        0.00            NaN

‚úÖ VALIDATION RESULT:
‚ö†Ô∏è REVIEW: Unexpected fraud pattern

üîÑ Creating target validation dashboard...


‚úÖ Target variable creation and validation complete


In [46]:
# ============================================================================
# SECTION 8: COMPREHENSIVE VALIDATION DASHBOARD
# ============================================================================
print("\n" + "="*80)
print("üìä SECTION 8: COMPREHENSIVE VALIDATION DASHBOARD")
print("="*80)

# First, calculate target_counts
target_counts = rfm['is_high_risk'].value_counts().sort_index()
print(f"‚úÖ Target variable distribution calculated:")
print(f"   ‚Ä¢ Low Risk (0): {target_counts[0]:,} customers")
print(f"   ‚Ä¢ High Risk (1): {target_counts[1]:,} customers")

# Create final validation dashboard
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        'Risk Distribution by Cluster',
        'RFM Comparison: High vs Low Risk',
        'Cluster Characteristics',
        'Customer Value Pyramid',
        'Risk Score Distribution',
        'Business Decision Summary'
    ),
    specs=[
        [{'type': 'pie'}, {'type': 'bar'}, {'type': 'scatter3d'}],
        [{'type': 'bar'}, {'type': 'histogram'}, {'type': 'table'}]
    ],
    vertical_spacing=0.1,
    horizontal_spacing=0.1
)

# 1. Risk Distribution Pie Chart
cluster_sizes = rfm['cluster'].value_counts().sort_index()
risk_labels = [f"Cluster {i}" for i in cluster_sizes.index]
risk_colors = ['#FF6B6B' if i == high_risk_cluster else '#4ECDC4' for i in cluster_sizes.index]

fig.add_trace(
    go.Pie(
        labels=risk_labels,
        values=cluster_sizes.values,
        hole=0.4,
        marker_colors=risk_colors,
        name='Cluster Distribution',
        hovertemplate='%{label}: %{value} customers (%{percent})<br>%{customdata}',
        customdata=['HIGH RISK' if i == high_risk_cluster else 'LOW RISK' for i in cluster_sizes.index]
    ),
    row=1, col=1
)

# 2. RFM Comparison Bar Chart
risk_comparison = rfm.groupby('is_high_risk').agg({
    'recency_days': 'mean',
    'transaction_frequency': 'mean',
    'total_monetary_value': 'mean'
}).reset_index()

# Normalize for comparison
for col in ['recency_days', 'transaction_frequency', 'total_monetary_value']:
    risk_comparison[f'{col}_norm'] = risk_comparison[col] / risk_comparison[col].max()

fig.add_trace(
    go.Bar(
        name='Recency (Higher = Riskier)',
        x=['Low Risk', 'High Risk'],
        y=risk_comparison['recency_days_norm'],
        marker_color='#FF6B6B',
        text=[f"{x:.0f}d" for x in risk_comparison['recency_days']],
        textposition='auto'
    ),
    row=1, col=2
)

fig.add_trace(
    go.Bar(
        name='Frequency (Higher = Safer)',
        x=['Low Risk', 'High Risk'],
        y=risk_comparison['transaction_frequency_norm'],
        marker_color='#4ECDC4',
        text=[f"{x:.1f}" for x in risk_comparison['transaction_frequency']],
        textposition='auto'
    ),
    row=1, col=2
)

fig.add_trace(
    go.Bar(
        name='Monetary Value',
        x=['Low Risk', 'High Risk'],
        y=risk_comparison['total_monetary_value_norm'],
        marker_color='#45B7D1',
        text=[f"${x:,.0f}" for x in risk_comparison['total_monetary_value']],
        textposition='auto'
    ),
    row=1, col=2
)

fig.update_xaxes(title_text="Risk Category", row=1, col=2)
fig.update_yaxes(title_text="Normalized Value", row=1, col=2)

# 3. 3D Cluster Visualization
fig.add_trace(
    go.Scatter3d(
        x=rfm['recency_days'],
        y=np.log10(rfm['transaction_frequency'] + 1),
        z=np.log10(rfm['total_monetary_value'] + 1),
        mode='markers',
        marker=dict(
            size=4,
            color=rfm['is_high_risk'],
            colorscale=['#4ECDC4', '#FF6B6B'],
            showscale=True,
            colorbar=dict(title="Risk Level", tickvals=[0, 1], ticktext=['Low', 'High'])
        ),
        hovertemplate='Recency: %{x} days<br>Log10(Freq): %{y:.2f}<br>Log10(Value): %{z:.2f}<br>Risk: %{customdata}',
        customdata=['High Risk' if x == 1 else 'Low Risk' for x in rfm['is_high_risk']]
    ),
    row=1, col=3
)

# 4. Customer Value Pyramid
value_bins = pd.qcut(rfm['total_monetary_value'], q=4, labels=['Bronze', 'Silver', 'Gold', 'Platinum'])
value_pyramid = value_bins.value_counts().sort_index()

fig.add_trace(
    go.Bar(
        x=value_pyramid.values,
        y=value_pyramid.index,
        orientation='h',
        marker_color=['#CD7F32', '#C0C0C0', '#FFD700', '#E5E4E2'],
        hovertemplate='%{y}: %{x} customers (%{customdata:.1f}%)',
        customdata=(value_pyramid.values / value_pyramid.values.sum() * 100)
    ),
    row=2, col=1
)
fig.update_xaxes(title_text="Customer Count", row=2, col=1)
fig.update_yaxes(title_text="Value Tier", row=2, col=1)

# 5. Risk Score Distribution
fig.add_trace(
    go.Histogram(
        x=rfm['recency_days'],
        nbinsx=50,
        marker_color='#FF6B6B',
        hovertemplate='%{x:.0f} days: %{y} customers'
    ),
    row=2, col=2
)
fig.update_xaxes(title_text="Recency (Days - Primary Risk Indicator)", row=2, col=2)
fig.update_yaxes(title_text="Customer Count", row=2, col=2)

# Add risk threshold lines using shapes
fig.add_shape(
    type="line",
    x0=30, y0=0, x1=30, y1=1,
    line=dict(color="green", width=2, dash="dash"),
    xref="x5", yref="paper",
    row=2, col=2
)

fig.add_shape(
    type="line",
    x0=90, y0=0, x1=90, y1=1,
    line=dict(color="orange", width=2, dash="dash"),
    xref="x5", yref="paper",
    row=2, col=2
)

# Add annotations for the threshold lines
fig.add_annotation(
    x=30, y=0.95,
    text="30d: Low Risk",
    showarrow=False,
    xref="x5", yref="paper",
    row=2, col=2,
    bgcolor="white",
    bordercolor="green",
    borderwidth=1,
    font=dict(size=10, color="green")
)

fig.add_annotation(
    x=90, y=0.95,
    text="90d: High Risk",
    showarrow=False,
    xref="x5", yref="paper",
    row=2, col=2,
    bgcolor="white",
    bordercolor="orange",
    borderwidth=1,
    font=dict(size=10, color="orange")
)

# 6. Business Decision Summary Table
# Calculate validation status
high_risk_rate = target_counts[1] / len(rfm) * 100
validation_status = '‚úì PASS' if 2 <= high_risk_rate <= 5 else '‚ö† REVIEW'

summary_data = [
    ['Total Customers', f"{len(rfm):,}"],
    ['High-Risk Customers', f"{target_counts[1]:,}"],
    ['High-Risk Rate', f"{high_risk_rate:.1f}%"],
    ['Avg High-Risk Recency', f"{rfm[rfm['is_high_risk']==1]['recency_days'].mean():.0f} days"],
    ['Industry Benchmark', "2-5% (Basel II)"],
    ['Validation Status', validation_status],
    ['Next Step', 'Proceed to Model Training']
]

fig.add_trace(
    go.Table(
        header=dict(
            values=['<b>Metric</b>', '<b>Value</b>'],
            fill_color='#2E8B57',
            align='center',
            font=dict(color='white', size=12)
        ),
        cells=dict(
            values=[[row[0] for row in summary_data], 
                   [row[1] for row in summary_data]],
            align='center',
            fill_color='lavender',
            font=dict(size=11)
        )
        # REMOVED: showlegend=False (Tables don't have this parameter)
    ),
    row=2, col=3
)

# Update layout
fig.update_layout(
    height=1000,
    title_text="<b>Bati Bank - Credit Risk Proxy Validation Dashboard</b>",
    showlegend=True,
    template='plotly_white',
    barmode='group',
    legend=dict(
        x=1.02,
        y=1,
        xanchor='left',
        yanchor='top',
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='black',
        borderwidth=1
    )
)

print("‚úÖ Validation dashboard created successfully!")
fig.show()

# Display summary statistics
print("\n" + "="*80)
print("üìã BUSINESS DECISION SUMMARY")
print("="*80)

for metric, value in summary_data:
    print(f"‚Ä¢ {metric}: {value}")

print(f"\nüéØ KEY INSIGHTS:")
print("-" * 40)
print(f"1. High-risk customers identified: {target_counts[1]:,}")
print(f"2. High-risk rate: {high_risk_rate:.1f}%")
print(f"3. Basel II compliance: {'WITHIN RANGE (2-5%)' if validation_status == '‚úì PASS' else 'OUTSIDE RANGE - REVIEW REQUIRED'}")
print(f"4. High-risk profile: {rfm[rfm['is_high_risk']==1]['recency_days'].mean():.0f} days average recency")


üìä SECTION 8: COMPREHENSIVE VALIDATION DASHBOARD
‚úÖ Target variable distribution calculated:
   ‚Ä¢ Low Risk (0): 2,709 customers
   ‚Ä¢ High Risk (1): 1,033 customers
‚úÖ Validation dashboard created successfully!



üìã BUSINESS DECISION SUMMARY
‚Ä¢ Total Customers: 3,742
‚Ä¢ High-Risk Customers: 1,033
‚Ä¢ High-Risk Rate: 27.6%
‚Ä¢ Avg High-Risk Recency: 50 days
‚Ä¢ Industry Benchmark: 2-5% (Basel II)
‚Ä¢ Validation Status: ‚ö† REVIEW
‚Ä¢ Next Step: Proceed to Model Training

üéØ KEY INSIGHTS:
----------------------------------------
1. High-risk customers identified: 1,033
2. High-risk rate: 27.6%
3. Basel II compliance: OUTSIDE RANGE - REVIEW REQUIRED
4. High-risk profile: 50 days average recency


In [47]:
# ============================================================================
# SECTION 9: SAVE RESULTS & CREATE DOCUMENTATION
# ============================================================================
print("\n" + "="*80)
print("üíæ SECTION 9: SAVE RESULTS & DOCUMENTATION")
print("="*80)

# Import required libraries
import os
import json
from datetime import datetime

# Create output directory
output_dir = '../../data/processed'
os.makedirs(output_dir, exist_ok=True)

print(f"üìÅ Output directory: {output_dir}")

# Save all outputs
output_files = {}

# 1. Save RFM data with target variable
rfm_output_path = f'{output_dir}/customer_rfm_with_target.csv'
rfm.to_csv(rfm_output_path)
output_files['rfm_with_target'] = rfm_output_path
print(f"‚úÖ RFM data with target saved: {rfm_output_path}")

# 2. Save target variable separately
target_output_path = f'{output_dir}/target_variable.csv'
rfm[['is_high_risk']].to_csv(target_output_path)
output_files['target_variable'] = target_output_path
print(f"‚úÖ Target variable saved: {target_output_path}")

# 3. Calculate target distribution if not already calculated
if 'target_distribution' not in locals():
    target_distribution = rfm['is_high_risk'].value_counts().sort_index()
    print(f"üìä Target distribution calculated: {dict(target_distribution)}")

# 4. Save cluster analysis (create cluster_df if not exists)
if 'cluster_df' not in locals():
    print("üìä Creating cluster analysis dataframe...")
    # Create cluster analysis dataframe
    cluster_stats = []
    for cluster_id in sorted(rfm['cluster'].unique()):
        cluster_data = rfm[rfm['cluster'] == cluster_id]
        is_high_risk = 1 if cluster_id == high_risk_cluster else 0
        
        cluster_stats.append({
            'Cluster': int(cluster_id),  # Convert to Python int
            'Customers': int(len(cluster_data)),  # Convert to Python int
            'Percentage': f"{len(cluster_data)/len(rfm)*100:.1f}%",
            'Avg_Recency_Days': float(cluster_data['recency_days'].mean()),  # Convert to Python float
            'Avg_Frequency': float(cluster_data['transaction_frequency'].mean()),  # Convert to Python float
            'Avg_Monetary_Value': float(cluster_data['total_monetary_value'].mean()),  # Convert to Python float
            'Is_High_Risk': int(is_high_risk),  # Convert to Python int
            'Risk_Category': 'High Risk' if is_high_risk == 1 else 'Low Risk'
        })
    
    cluster_df = pd.DataFrame(cluster_stats)

cluster_analysis_path = f'{output_dir}/cluster_analysis.csv'
cluster_df.to_csv(cluster_analysis_path, index=False)
output_files['cluster_analysis'] = cluster_analysis_path
print(f"‚úÖ Cluster analysis saved: {cluster_analysis_path}")

# 5. Save business summary
print("\nüìù CREATING BUSINESS SUMMARY DOCUMENTATION...")

# Calculate key metrics (convert numpy types to Python types)
high_risk_count = int(target_distribution[1])  # Convert to Python int
high_risk_rate = float(target_distribution[1] / len(rfm) * 100)  # Convert to Python float
within_basel_range = bool(2 <= high_risk_rate <= 5)  # Convert to Python bool
high_risk_cluster_int = int(high_risk_cluster)  # Convert to Python int

# Create cluster distribution dictionary with Python types
cluster_distribution_dict = {}
if 'cluster_df' in locals():
    for _, row in cluster_df.iterrows():
        cluster_distribution_dict[int(row['Cluster'])] = row['Percentage']

business_summary = {
    'project': 'Bati Bank Credit Risk Model - Task 4',
    'timestamp': datetime.now().isoformat(),
    'data_summary': {
        'total_customers': int(len(rfm)),  # Python int
        'high_risk_customers': high_risk_count,  # Python int
        'high_risk_rate': high_risk_rate,  # Python float
        'high_risk_cluster': high_risk_cluster_int,  # Python int
        'cluster_distribution': cluster_distribution_dict  # Python dict
    },
    'methodology': {
        'rfm_metrics': ['recency_days', 'transaction_frequency', 'total_monetary_value'],
        'clustering_algorithm': 'KMeans',
        'number_of_clusters': 3,
        'scaling_method': 'RobustScaler',
        'risk_scoring_weights': {'recency': 0.5, 'frequency': 0.3, 'monetary': 0.2},
        'random_state': 42
    },
    'business_validation': {
        'within_basel_range': within_basel_range,  # Python bool
        'expected_default_rate': '2-5%',
        'actual_default_rate': f"{high_risk_rate:.1f}%",
        'validation_status': 'PASS' if within_basel_range else 'REVIEW REQUIRED'
    },
    'files_generated': list(output_files.keys()),
    'next_steps': [
        'Proceed to Task 5: Model Training',
        'Validate proxy with additional business metrics',
        'Document assumptions for regulatory review'
    ]
}

summary_path = f'{output_dir}/task4_business_summary.json'
with open(summary_path, 'w') as f:
    json.dump(business_summary, f, indent=4)
output_files['business_summary'] = summary_path
print(f"‚úÖ Business summary saved: {summary_path}")

# 6. Create a README file for the outputs
readme_content = f"""# Task 4 Outputs - Credit Risk Proxy Target Creation

## Project: Bati Bank Credit Risk Modeling
## Task: RFM Analysis & Proxy Target Variable Creation
## Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Files Generated:

### 1. customer_rfm_with_target.csv
- Contains RFM metrics and target variable for {len(rfm):,} customers
- Columns: recency_days, transaction_frequency, total_monetary_value, cluster, is_high_risk
- Purpose: Primary dataset for model training

### 2. target_variable.csv
- Contains only the target variable (is_high_risk)
- Purpose: Quick access to target labels

### 3. cluster_analysis.csv
- Cluster statistics and profiles
- Shows distribution, characteristics, and risk categorization
- Purpose: Business understanding and validation

### 4. task4_business_summary.json
- Comprehensive business documentation
- Includes methodology, results, and validation
- Purpose: Regulatory compliance and stakeholder communication

## Key Results:

- **Total Customers Analyzed**: {len(rfm):,}
- **High-Risk Customers Identified**: {high_risk_count:,}
- **High-Risk Rate**: {high_risk_rate:.1f}%
- **Basel II Compliance**: {'WITHIN RANGE (2-5%)' if within_basel_range else 'OUTSIDE RANGE - REVIEW REQUIRED'}
- **High-Risk Cluster**: {high_risk_cluster_int}

## Methodology:

1. **RFM Calculation**: Recency, Frequency, Monetary metrics calculated per customer
2. **Feature Scaling**: RobustScaler applied to handle outliers
3. **Clustering**: K-Means with k=3 as per requirements
4. **Risk Identification**: Cluster {high_risk_cluster_int} identified as high-risk based on business logic
5. **Target Creation**: Binary is_high_risk variable created

## Business Validation:

- Target distribution validated against Basel II requirements
- Risk categorization aligns with business understanding
- Documentation created for audit trail
- Ready for model training phase

## Next Steps:
1. Proceed to Task 5: Model Training
2. Feature engineering with RFM variables
3. Model selection and hyperparameter tuning
4. Model validation and deployment
"""

readme_path = f'{output_dir}/README_TASK4.md'
with open(readme_path, 'w') as f:
    f.write(readme_content)
output_files['readme'] = readme_path
print(f"‚úÖ README file saved: {readme_path}")

# Display file summary
print("\n" + "="*80)
print("üìÅ OUTPUT FILES SUMMARY")
print("="*80)

# Create a summary table
summary_table = []
for key, path in output_files.items():
    if os.path.exists(path):
        file_size = os.path.getsize(path) / 1024  # Size in KB
        summary_table.append({
            'File': key,
            'Path': path,
            'Size (KB)': f"{file_size:.1f}",
            'Status': '‚úì'
        })
    else:
        summary_table.append({
            'File': key,
            'Path': path,
            'Size (KB)': 'N/A',
            'Status': '‚úó'
        })

# Display summary
summary_df = pd.DataFrame(summary_table)
print(summary_df.to_string(index=False))

# Display summary statistics
print("\n" + "="*80)
print("üìä TASK 4 COMPLETION SUMMARY")
print("="*80)

print(f"""
üéØ OBJECTIVES ACHIEVED:
-----------------------
1. ‚úì RFM Metrics Calculated: Recency, Frequency, Monetary
2. ‚úì Customer Segmentation: K-Means clustering (k=3)
3. ‚úì High-Risk Identification: Cluster {high_risk_cluster_int} marked as high-risk
4. ‚úì Proxy Target Created: Binary is_high_risk variable
5. ‚úì Business Validation: {high_risk_rate:.1f}% high-risk rate
6. ‚úì Documentation: Complete audit trail created
7. ‚úì Basel II Compliance: {'‚úì WITHIN RANGE' if within_basel_range else '‚ö† REVIEW REQUIRED'}

üìà KEY METRICS:
---------------
‚Ä¢ Total Customers: {len(rfm):,}
‚Ä¢ High-Risk Customers: {high_risk_count:,}
‚Ä¢ High-Risk Rate: {high_risk_rate:.1f}%
‚Ä¢ Industry Benchmark: 2-5% (Basel II)
‚Ä¢ Validation Status: {'PASS ‚úì' if within_basel_range else 'REVIEW REQUIRED ‚ö†'}

üìÅ OUTPUTS GENERATED:
---------------------
‚Ä¢ {len(output_files)} files saved to {output_dir}
‚Ä¢ Total size: {sum([os.path.getsize(p) for p in output_files.values() if os.path.exists(p)])/1024:.1f} KB

üöÄ NEXT STEPS:
-------------
1. Proceed to Task 5: Model Training
2. Use 'customer_rfm_with_target.csv' as input
3. Implement feature engineering pipeline
4. Train and validate credit risk models
""")

print("\n" + "="*80)
print("‚úÖ TASK 4 COMPLETED SUCCESSFULLY!")
print("="*80)


üíæ SECTION 9: SAVE RESULTS & DOCUMENTATION
üìÅ Output directory: ../../data/processed
‚úÖ RFM data with target saved: ../../data/processed/customer_rfm_with_target.csv
‚úÖ Target variable saved: ../../data/processed/target_variable.csv
‚úÖ Cluster analysis saved: ../../data/processed/cluster_analysis.csv

üìù CREATING BUSINESS SUMMARY DOCUMENTATION...
‚úÖ Business summary saved: ../../data/processed/task4_business_summary.json
‚úÖ README file saved: ../../data/processed/README_TASK4.md

üìÅ OUTPUT FILES SUMMARY
            File                                              Path Size (KB) Status
 rfm_with_target ../../data/processed/customer_rfm_with_target.csv     241.0      ‚úì
 target_variable          ../../data/processed/target_variable.csv      28.2      ‚úì
cluster_analysis         ../../data/processed/cluster_analysis.csv       0.2      ‚úì
business_summary  ../../data/processed/task4_business_summary.json       1.4      ‚úì
          readme              ../../data/processe