In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the credit-score dataset; the path is relative to the current working directory.
df = pd.read_csv("data/Score.csv")
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Monthly_Balance,Credit_Score,Credit_Mix,...,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month
0,3.0,7.0,4.0,26.82262,265.0,No,80.415295,312.494089,Good,Good,...,23.0,19114.12,3.0,4.0,3.0,4.0,1824.843333,11.27,809.98,49.574949
1,3.0,7.0,4.0,31.94496,265.0,No,118.280222,284.629162,Good,Good,...,23.0,19114.12,3.0,4.0,3.0,4.0,1824.843333,11.27,809.98,49.574949
2,3.0,7.0,4.0,28.609352,267.0,No,81.699521,331.209863,Good,Good,...,23.0,19114.12,3.0,4.0,3.0,4.0,1824.843333,11.27,809.98,49.574949
3,5.0,4.0,4.0,31.377862,268.0,No,199.458074,223.45131,Good,Good,...,23.0,19114.12,3.0,4.0,3.0,4.0,1824.843333,11.27,809.98,49.574949
4,6.0,4.0,4.0,24.797347,269.0,No,41.420153,341.489231,Good,Good,...,23.0,19114.12,3.0,4.0,3.0,4.0,1824.843333,11.27,809.98,49.574949


In [3]:
# Keep only the columns used from here on; Credit_Score (last entry) is the
# column later used as the prediction target.
selected_columns = [
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "Amount_invested_monthly",
    "Monthly_Balance",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Changed_Credit_Limit",
    "Outstanding_Debt",
    "Total_EMI_per_month",
    "Credit_Score",
]
df = df[selected_columns]

In [4]:
df.head()

Unnamed: 0,Credit_Utilization_Ratio,Credit_History_Age,Amount_invested_monthly,Monthly_Balance,Annual_Income,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month,Credit_Score
0,26.82262,265.0,80.415295,312.494089,19114.12,1824.843333,11.27,809.98,49.574949,Good
1,31.94496,265.0,118.280222,284.629162,19114.12,1824.843333,11.27,809.98,49.574949,Good
2,28.609352,267.0,81.699521,331.209863,19114.12,1824.843333,11.27,809.98,49.574949,Good
3,31.377862,268.0,199.458074,223.45131,19114.12,1824.843333,11.27,809.98,49.574949,Good
4,24.797347,269.0,41.420153,341.489231,19114.12,1824.843333,11.27,809.98,49.574949,Good


In [5]:
df.columns

Index(['Credit_Utilization_Ratio', 'Credit_History_Age',
       'Amount_invested_monthly', 'Monthly_Balance', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Changed_Credit_Limit', 'Outstanding_Debt',
       'Total_EMI_per_month', 'Credit_Score'],
      dtype='object')

In [6]:
# Turn +/-infinity into NaN so the dropna() in the next cell also discards those rows.
df = df.replace([np.inf, -np.inf], np.nan)

In [7]:
# Drop every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [8]:
# Columns that receive the IQR outlier filter; they are processed in this order.
columns_to_remove_outliers = ['Credit_History_Age', 'Monthly_Balance', 'Outstanding_Debt', 'Total_EMI_per_month']

# Function to remove outliers using IQR method
def remove_outliers_iqr(data: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``data[col]``.
    Rows whose ``col`` value is NaN never satisfy the comparison and are
    dropped as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` inside the fences, with the original index
        labels preserved.

    Side effects
    ------------
    Prints how many rows were removed for ``col``.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # between() is inclusive on both ends by default, matching >= / <=.
    within_fences = data[col].between(lower_bound, upper_bound)
    filtered = data[within_fences]

    removed_rows = data.shape[0] - filtered.shape[0]
    print(f"Removed {removed_rows} rows from '{col}' column.")
    return filtered

# Remove outliers from the specified columns.
# The filters run one after another on the shrinking frame, so each column's
# quartiles are computed from the rows that survived the previous filters.
for col in columns_to_remove_outliers:
    df = remove_outliers_iqr(df, col)

Removed 0 rows from 'Credit_History_Age' column.
Removed 7646 rows from 'Monthly_Balance' column.
Removed 3800 rows from 'Outstanding_Debt' column.
Removed 4788 rows from 'Total_EMI_per_month' column.


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers. `le` is kept so predictions can be
# decoded later with le.inverse_transform.
le = LabelEncoder()
col_list = df.select_dtypes(include="object").columns

# Fit the encoder exactly once, and only while the target is still an object
# column. Looping over every object column while always re-encoding
# Credit_Score would refit `le` on already-encoded values as soon as a second
# object column existed, replacing the real class names with '0', '1', '2'.
# assign() returns a new frame, so no value is written into a filtered slice.
if "Credit_Score" in col_list:
    df = df.assign(Credit_Score=le.fit_transform(df["Credit_Score"].astype(str)))

In [10]:
df.head()

Unnamed: 0,Credit_Utilization_Ratio,Credit_History_Age,Amount_invested_monthly,Monthly_Balance,Annual_Income,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month,Credit_Score
0,26.82262,265.0,80.415295,312.494089,19114.12,1824.843333,11.27,809.98,49.574949,0
1,31.94496,265.0,118.280222,284.629162,19114.12,1824.843333,11.27,809.98,49.574949,0
2,28.609352,267.0,81.699521,331.209863,19114.12,1824.843333,11.27,809.98,49.574949,0
3,31.377862,268.0,199.458074,223.45131,19114.12,1824.843333,11.27,809.98,49.574949,0
4,24.797347,269.0,41.420153,341.489231,19114.12,1824.843333,11.27,809.98,49.574949,0


In [11]:
# Model inputs: the five columns the classifier is trained on.
final_features = [
    'Credit_History_Age',
    'Monthly_Balance',
    'Annual_Income',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
]
X = df.loc[:, final_features]

# Target: the Credit_Score column.
y = df.loc[:, "Credit_Score"]

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise statistics of the held-out rows leak into preprocessing
# and the test metrics (and the persisted scaler) are no longer independent of
# the test data. `X` itself is left unscaled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

scaler = StandardScaler()
# Fit on the training split only, then reuse the same parameters for the test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using the 3 closest training points.
# NOTE(review): k=3 is hard-coded; no tuning of this value is shown in the notebook.
neigh = KNeighborsClassifier(n_neighbors=3)
# Train on the training split; as the last expression it also displays the fitted estimator.
neigh.fit(X_train, y_train)

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [17]:
# Predicted class ids for the held-out test split.
pred = neigh.predict(X_test)

In [18]:
# Overall accuracy first, then per-class precision / recall / F1 on the test split.
# Class ids are the LabelEncoder codes (see le.classes_ for the id -> label order).
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.680580437119312
              precision    recall  f1-score   support

           0       0.53      0.55      0.54      2912
           1       0.70      0.69      0.69      4904
           2       0.72      0.72      0.72      8930

    accuracy                           0.68     16746
   macro avg       0.65      0.65      0.65     16746
weighted avg       0.68      0.68      0.68     16746



In [20]:
# Single hand-built sample with the same five feature columns, in the same
# order, that the model was trained on.
# NOTE(review): the name `input` shadows Python's builtin input(); renaming it
# also requires updating the later `scaler.transform(input)` cell.
input = pd.DataFrame(
    {
        'Credit_History_Age': [265],
        'Monthly_Balance': [312.494089],
        'Annual_Income': [19114.12],
        'Changed_Credit_Limit': [11.27],
        'Outstanding_Debt': [809.98],
    }
)

In [21]:
scaler

In [23]:
# Apply the already-fitted scaler (transform only, no refit) to the sample.
transformed_input = scaler.transform(input)

In [24]:
# Predict the encoded class id for the single sample; decode it with le.inverse_transform.
neigh.predict(transformed_input)

array([0])

In [37]:
# Sanity check: map integer class ids back to the original Credit_Score labels.
le.inverse_transform([0, 1, 2, 1])

array(['Good', 'Poor', 'Standard', 'Poor'], dtype=object)

In [27]:
le.classes_

array(['Good', 'Poor', 'Standard'], dtype=object)

In [32]:
import pickle
from pathlib import Path

# Persist everything needed to score new samples: the label encoder (to decode
# predictions), the fitted scaler (to preprocess inputs) and the classifier.
# File names and pickle protocol are unchanged.
# NOTE: pickle.load can execute arbitrary code from the file, so these
# artifacts should only ever be loaded from a trusted location.
ARTIFACT_DIR = Path("model_artifacts")
# Create the output folder on a fresh checkout instead of failing with FileNotFoundError.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

artifacts = {
    "label_encoder": le,
    "scaler": scaler,
    "model": neigh,
}

for artifact_name, artifact in artifacts.items():
    with open(ARTIFACT_DIR / artifact_name, "wb") as f:
        pickle.dump(artifact, f, protocol=pickle.HIGHEST_PROTOCOL)