In [278]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [279]:
# Import credit_record.csv
url = "https://raw.githubusercontent.com/manish-cr/CS3244-credit-approval/master/data/credit_record.csv"
df = pd.read_csv(url)

In [280]:
# Explore dataset
df.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [281]:
# Calculate the percentage of "X" and "C" statuses for each customer
x_percentage = df[df['STATUS'] == 'X'].groupby('ID')['MONTHS_BALANCE'].count() / df.groupby('ID')['MONTHS_BALANCE'].count()
c_percentage = df[df['STATUS'] == 'C'].groupby('ID')['MONTHS_BALANCE'].count() / df.groupby('ID')['MONTHS_BALANCE'].count()

# Calculate average months overdue for each customer 
avg_months_overdue = df[df['STATUS'].isin(['0', '1', '2', '3', '4', '5'])]
avg_months_overdue['STATUS'] = avg_months_overdue['STATUS'].astype(int)
avg_months_overdue = avg_months_overdue.groupby('ID')['STATUS'].mean()

# Calculate how many months account has been open for
account_length = df.groupby(['ID'])['MONTHS_BALANCE'].agg(min)

# Create a new dataframe with customer ID, average months overdue, X percentage, and C percentage
ids = pd.DataFrame({'ID': df['ID'].unique()})
result = ids.merge(account_length, on='ID', how='left')
result = result.merge(x_percentage, on='ID', how='left')
result = result.merge(c_percentage, on='ID', how='left')
result = result.merge(avg_months_overdue, on='ID', how='left')
result = result.rename(columns={'MONTHS_BALANCE_x': 'Account_Length', 'MONTHS_BALANCE_y': 'X_Percentage','MONTHS_BALANCE': 'C_Percentage', 'STATUS': 'Avg_Months_Overdue'})

# Fill NaN values with 0 
result['Account_Length'] = -result['Account_Length']
result['X_Percentage'].fillna(0, inplace=True)  
result['C_Percentage'].fillna(0, inplace=True)  
result['Avg_Months_Overdue'].fillna(-1, inplace=True)

result



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001711,3,0.25,0.000000,0.0
1,5001712,18,0.00,0.473684,0.0
2,5001713,21,1.00,0.000000,-1.0
3,5001714,14,1.00,0.000000,-1.0
4,5001715,59,1.00,0.000000,-1.0
...,...,...,...,...,...
45980,5150482,28,0.00,0.333333,0.0
45981,5150483,17,1.00,0.000000,-1.0
45982,5150484,12,0.00,0.076923,0.0
45983,5150485,1,0.00,0.000000,0.0


In [282]:
result.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001711,3,0.25,0.0,0.0
1,5001712,18,0.0,0.473684,0.0
2,5001713,21,1.0,0.0,-1.0
3,5001714,14,1.0,0.0,-1.0
4,5001715,59,1.0,0.0,-1.0


In [283]:
result_normalised = result.copy()

# Define a custom normalization function
def custom_normalize(x, min_val, max_val):
    return (x - min_val) / (max_val - min_val)

# Define the columns to be normalized
columns_to_normalize = ['Avg_Months_Overdue', 'Account_Length']

# Calculate min and max values for the selected columns
min_values = result_normalised[columns_to_normalize].min()
max_values = result_normalised[columns_to_normalize].max()

# Apply custom normalization to the selected columns
for column in columns_to_normalize:
    result_normalised[column] = result_normalised[column].apply(custom_normalize, args=(min_values[column], max_values[column]))

In [284]:
result_normalised.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001711,0.05,0.25,0.0,0.166667
1,5001712,0.3,0.0,0.473684,0.166667
2,5001713,0.35,1.0,0.0,0.0
3,5001714,0.233333,1.0,0.0,0.0
4,5001715,0.983333,1.0,0.0,0.0


In [285]:
# Apply K-Means clustering
kmeans = KMeans(n_clusters=2, random_state=20)  
result_normalised['Label'] = kmeans.fit_predict(result_normalised[['Account_Length','X_Percentage', 'C_Percentage','Avg_Months_Overdue']])

In [286]:
# Group the DataFrame by the 'Cluster' column and count the data points in each cluster
cluster_counts = result_normalised.groupby('Label').size().reset_index(name='Count')

# Display the cluster counts
print(cluster_counts)

# 0 for good credit record and 1 for bad credit record 

   Label  Count
0      0  26397
1      1  19588


In [287]:
result_normalised.to_csv('../data/credit_cleaned.csv', encoding = 'utf-8-sig')