In [1]:
import pandas as pd

# Location of the input CSV; a bare file name, so it is resolved against the
# notebook's current working directory.
file_path = 'merged_final.csv'

# Parse the file into a DataFrame using pandas' default CSV settings.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (the head() default) to check the column layout.
data.head(n=5)


Unnamed: 0,agent_id,charge_off_within_3_months,charge_off_within_6_months,charge_off_within_9_months,charge_off_within_12_months,credit_balance_mean,credit_balance_max,credit_balance_last,credit_utilization_mean,credit_utilization_max,...,Retail,Business,Clothing,Agricultural,Contractor,Transportation,Utility,Professional,approved_count,declined_count
0,0,0,0,0,0,3054.533833,4964.111415,2505.991415,0.623747,0.993,...,7.0,5.0,8.0,8.0,7.0,15.0,11.0,7.0,66.0,21.0
1,3,0,0,0,0,1826.206224,1990.626004,1990.626004,0.927758,0.995,...,27.0,53.0,12.0,20.0,5.0,11.0,10.0,16.0,23.0,183.0
2,6,0,0,0,0,649.106037,1279.9934,321.0834,0.227275,0.439,...,8.0,0.0,7.0,4.0,0.0,5.0,2.0,4.0,90.0,0.0
3,7,0,0,0,0,1621.523367,1984.664026,1941.344026,0.850923,0.998,...,17.0,36.0,14.0,16.0,10.0,8.0,10.0,8.0,48.0,112.0
4,8,0,0,0,0,5748.472329,9973.203428,7835.223428,0.584846,0.997,...,9.0,34.0,11.0,8.0,3.0,9.0,6.0,6.0,75.0,49.0


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

# Collapse the four horizon-specific flags into one label: the row-wise max
# makes `charge_off` the largest flag across the 3/6/9/12-month columns.
# The label is used only to evaluate the clusters, never as a clustering input.
charge_off_cols = [
    'charge_off_within_3_months', 'charge_off_within_6_months',
    'charge_off_within_9_months', 'charge_off_within_12_months',
]
data['charge_off'] = data[charge_off_cols].max(axis=1)

# Clustering inputs: every column except the identifier and the labels.
# 'cluster' is written back onto `data` at the end of this cell, so it must be
# excluded too; otherwise re-running the cell would feed the previous cluster
# assignment back in as a feature and silently change the result.
# errors='ignore' applies only to 'cluster' (absent on the first run); a missing
# required column still raises KeyError, as before.
features = (
    data
    .drop(columns=['agent_id', *charge_off_cols, 'charge_off'])
    .drop(columns=['cluster'], errors='ignore')
)

# Fill gaps with each column's mean, then standardize so that no single
# large-scale column dominates the distance computation in K-Means.
features = features.fillna(features.mean())
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# K-Means with 2 clusters (working assumption: "normal" vs. "anomalous" agents).
# n_init is pinned explicitly because scikit-learn's default differs between
# releases (10 in older versions, 'auto' from 1.4); pinning it keeps the number
# of restarts, and therefore the result, the same regardless of installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_scaled)

# Store the assignment on the main frame; the evaluation cell reads it from there.
data['cluster'] = clusters

# Cross-tabulate cluster membership against the charge-off label
# (rows: cluster, columns: charge_off, cells: row counts).
cluster_summary = data.groupby(['cluster', 'charge_off']).size().unstack(fill_value=0)

cluster_summary


charge_off,0,1
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4676,126
1,2474,76


In [3]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Convert the unsupervised cluster labels into a binary prediction: the cluster
# with the fewest members is treated as the positive (charge_off = 1) class.
# This is a heuristic mapping; the cluster ids themselves carry no meaning.
cluster_sizes = data['cluster'].value_counts()
minority_cluster = cluster_sizes.idxmin()
predicted = data['cluster'].eq(minority_cluster).astype(int)
actual = data['charge_off']

# Score the prediction against the true labels, one metric per name.
precision, recall, f1, accuracy = (
    score(actual, predicted)
    for score in (precision_score, recall_score, f1_score, accuracy_score)
)

precision, recall, f1, accuracy


(0.02980392156862745,
 0.37623762376237624,
 0.055232558139534885,
 0.6463547334058759)