In [2]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [3]:
# Import credit_record.csv
url = "https://raw.githubusercontent.com/manish-cr/CS3244-credit-approval/master/data/credit_record.csv"
df = pd.read_csv(url)

In [4]:
# Split the data into training and testing sets, keeping customer IDs separate
train_ids, test_ids = train_test_split(df['ID'].unique(), test_size=0.2, random_state=42, stratify=None)

# Create the training and testing data subsets based on the selected customer IDs
train_data = df[df['ID'].isin(train_ids)]
test_data = df[df['ID'].isin(test_ids)]

In [10]:
# Calculate the percentage of "X" and "C" statuses for each customer
x_percentage_train = train_data[train_data['STATUS'] == 'X'].groupby('ID')['MONTHS_BALANCE'].count() / train_data.groupby('ID')['MONTHS_BALANCE'].count()
x_percentage_test = test_data[test_data['STATUS'] == 'X'].groupby('ID')['MONTHS_BALANCE'].count() / test_data.groupby('ID')['MONTHS_BALANCE'].count()

c_percentage_train = train_data[train_data['STATUS'] == 'C'].groupby('ID')['MONTHS_BALANCE'].count() / train_data.groupby('ID')['MONTHS_BALANCE'].count()
c_percentage_test = test_data[test_data['STATUS'] == 'C'].groupby('ID')['MONTHS_BALANCE'].count() / test_data.groupby('ID')['MONTHS_BALANCE'].count()

# Calculate average months overdue for each customer 
avg_months_overdue_train = train_data[train_data['STATUS'].isin(['0', '1', '2', '3', '4', '5'])]
avg_months_overdue_train['STATUS'] = avg_months_overdue_train['STATUS'].astype(int)
avg_months_overdue_train = avg_months_overdue_train.groupby('ID')['STATUS'].mean()

avg_months_overdue_test = test_data[test_data['STATUS'].isin(['0', '1', '2', '3', '4', '5'])]
avg_months_overdue_test['STATUS'] = avg_months_overdue_test['STATUS'].astype(int)
avg_months_overdue_test = avg_months_overdue_test.groupby('ID')['STATUS'].mean()

# Calculate how many months account has been open for
account_length_train = train_data.groupby(['ID'])['MONTHS_BALANCE'].agg(min)
account_length_test = test_data.groupby(['ID'])['MONTHS_BALANCE'].agg(min)

# Create DataFrames for training and testing data
result_train = pd.DataFrame({'ID': train_data['ID'].unique()})
result_test = pd.DataFrame({'ID': test_data['ID'].unique()})

# Merge data for training and testing
result_train = result_train.merge(account_length_train, on='ID', how='left')
result_test = result_test.merge(account_length_test, on='ID', how='left')
result_train = result_train.merge(x_percentage_train, on='ID', how='left')
result_test = result_test.merge(x_percentage_test, on='ID', how='left')
result_train = result_train.merge(c_percentage_train, on='ID', how='left')
result_test = result_test.merge(c_percentage_test, on='ID', how='left')
result_train = result_train.merge(avg_months_overdue_train, on='ID', how='left')
result_test = result_test.merge(avg_months_overdue_test, on='ID', how='left')

result_train = result_train.rename(columns={'MONTHS_BALANCE_x': 'Account_Length', 'MONTHS_BALANCE_y': 'X_Percentage','MONTHS_BALANCE': 'C_Percentage', 'STATUS': 'Avg_Months_Overdue'})
result_test = result_test.rename(columns={'MONTHS_BALANCE_x': 'Account_Length', 'MONTHS_BALANCE_y': 'X_Percentage','MONTHS_BALANCE': 'C_Percentage', 'STATUS': 'Avg_Months_Overdue'})

# Fill NaN values with 0 
result_train['Account_Length'] = -result_train['Account_Length']
result_test['Account_Length'] = -result_test['Account_Length']
result_train['X_Percentage'].fillna(0, inplace=True)  
result_test['X_Percentage'].fillna(0, inplace=True)
result_train['C_Percentage'].fillna(0, inplace=True)  
result_test['C_Percentage'].fillna(0, inplace=True)  
result_train['Avg_Months_Overdue'].fillna(-1, inplace=True)
result_test['Avg_Months_Overdue'].fillna(-1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_months_overdue_train['STATUS'] = avg_months_overdue_train['STATUS'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_months_overdue_test['STATUS'] = avg_months_overdue_test['STATUS'].astype(int)


In [16]:
result_train.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001711,3,0.25,0.0,0.0
1,5001713,21,1.0,0.0,-1.0
2,5001714,14,1.0,0.0,-1.0
3,5001717,21,0.0,0.227273,0.0
4,5001718,38,0.25641,0.076923,0.076923


In [18]:
result_test.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001712,18,0.0,0.473684,0.0
1,5001715,59,1.0,0.0,-1.0
2,5001719,42,0.0,0.953488,0.0
3,5001725,7,0.125,0.0,0.0
4,5001728,0,0.0,0.0,0.0


In [19]:
# Create a copy of result_train and result_test
result_train_normalised = result_train.copy()
result_test_normalised = result_test.copy()

# Define the custom normalization function
def custom_normalize(x, min_val, max_val):
    return (x - min_val) / (max_val - min_val)

# Define the columns to be normalized
columns_to_normalize = ['Avg_Months_Overdue', 'Account_Length']

# Calculate min and max values for the selected columns in the training data
min_values_train = result_train_normalised[columns_to_normalize].min()
max_values_train = result_train_normalised[columns_to_normalize].max()

# Calculate min and max values for the selected columns in the testing data
min_values_test = result_test_normalised[columns_to_normalize].min()
max_values_test = result_test_normalised[columns_to_normalize].max()

# Apply custom normalization to the selected columns in the training data
for column in columns_to_normalize:
    result_train_normalised[column] = result_train_normalised[column].apply(custom_normalize, args=(min_values_train[column], max_values_train[column]))

# Apply custom normalization to the selected columns in the testing data
for column in columns_to_normalize:
    result_test_normalised[column] = result_test_normalised[column].apply(custom_normalize, args=(min_values_test[column], max_values_test[column]))

In [20]:
result_train_normalised.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001711,0.05,0.25,0.0,0.166667
1,5001713,0.35,1.0,0.0,0.0
2,5001714,0.233333,1.0,0.0,0.0
3,5001717,0.35,0.0,0.227273,0.166667
4,5001718,0.633333,0.25641,0.076923,0.179487


In [21]:
result_test_normalised.head()

Unnamed: 0,ID,Account_Length,X_Percentage,C_Percentage,Avg_Months_Overdue
0,5001712,0.3,0.0,0.473684,0.16955
1,5001715,0.983333,1.0,0.0,0.0
2,5001719,0.7,0.0,0.953488,0.16955
3,5001725,0.116667,0.125,0.0,0.16955
4,5001728,0.0,0.0,0.0,0.16955


In [22]:
# Apply K-Means clustering to the training data
kmeans = KMeans(n_clusters=2, random_state=20)  
result_train_normalised['Label'] = kmeans.fit_predict(result_train_normalised[['Account_Length', 'X_Percentage', 'C_Percentage', 'Avg_Months_Overdue']])

# Predict cluster labels for the testing data using the same K-Means model
result_test_normalised['Label'] = kmeans.fit_predict(result_test_normalised[['Account_Length', 'X_Percentage', 'C_Percentage', 'Avg_Months_Overdue']])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [27]:
# Group the DataFrame by the 'Cluster' column and count the data points in each cluster
cluster_counts_train = result_train_normalised.groupby('Label').size().reset_index(name='Count')
cluster_counts_test = result_test_normalised.groupby('Label').size().reset_index(name='Count')

# Display the cluster counts
print("training cluster counts:")
print(cluster_counts_train)
print("test cluster counts:")
print(cluster_counts_test)

# 0 for good credit record and 1 for bad credit record 

training cluster counts:
   Label  Count
0      0  21135
1      1  15653
test cluster counts:
   Label  Count
0      0   3931
1      1   5266


In [287]:
result_train_normalised.to_csv('../data/credit_cleaned_training.csv', encoding = 'utf-8-sig')
result_test_normalised.to_csv('../data/credit_cleaned_test.csv', encoding = 'utf-8-sig')