In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score


from sklearn.feature_selection import chi2 
from sklearn.feature_selection import f_classif 


from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
df = pd.read_csv('/kaggle/input/datasetttttt/credit_score_dataset.csv')
df.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Credit_History_Age_Months
0,23,Scientist,19114.12,3,4,3,4,5,4,6.27,...,Good,809.98,31.377862,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good,268
1,23,Scientist,19114.12,3,4,3,4,6,0,11.27,...,Good,809.98,24.797347,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good,269
2,23,Scientist,19114.12,3,4,3,4,3,8,11.27,...,Good,809.98,22.537593,No,49.574949,178.344067,Low_spent_Small_value_payments,244.565317,Good,271
3,23,Scientist,19114.12,3,4,3,4,3,6,11.27,...,Good,809.98,23.933795,No,49.574949,24.785217,High_spent_Medium_value_payments,358.124168,Standard,0
4,28,Teacher,34847.84,2,4,6,1,7,1,7.42,...,Good,605.03,38.550848,No,18.816215,40.391238,High_spent_Large_value_payments,484.591214,Good,320


In [4]:
print(f'Dataset has {df.shape[0]} rows and {df.shape[1]} columns')

Dataset has 31711 rows and 21 columns


In [5]:
numerical_features = df.select_dtypes(include=np.number).columns.tolist()
print("Numerical Features:", numerical_features)

Numerical Features: ['Age', 'Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance', 'Credit_History_Age_Months']


In [6]:
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Features:", categorical_features)

Categorical Features: ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score']


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31711 entries, 0 to 31710
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        31711 non-null  int64  
 1   Occupation                 31711 non-null  object 
 2   Annual_Income              31711 non-null  float64
 3   Num_Bank_Accounts          31711 non-null  int64  
 4   Num_Credit_Card            31711 non-null  int64  
 5   Interest_Rate              31711 non-null  int64  
 6   Num_of_Loan                31711 non-null  int64  
 7   Delay_from_due_date        31711 non-null  int64  
 8   Num_of_Delayed_Payment     31711 non-null  int64  
 9   Changed_Credit_Limit       31711 non-null  float64
 10  Num_Credit_Inquiries       31711 non-null  float64
 11  Credit_Mix                 31711 non-null  object 
 12  Outstanding_Debt           31711 non-null  float64
 13  Credit_Utilization_Ratio   31711 non-null  flo

In [8]:
df.isnull().sum()

Age                          0
Occupation                   0
Annual_Income                0
Num_Bank_Accounts            0
Num_Credit_Card              0
Interest_Rate                0
Num_of_Loan                  0
Delay_from_due_date          0
Num_of_Delayed_Payment       0
Changed_Credit_Limit         0
Num_Credit_Inquiries         0
Credit_Mix                   0
Outstanding_Debt             0
Credit_Utilization_Ratio     0
Payment_of_Min_Amount        0
Total_EMI_per_month          0
Amount_invested_monthly      0
Payment_Behaviour            0
Monthly_Balance              0
Credit_Score                 0
Credit_History_Age_Months    0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_History_Age_Months
count,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0,31711.0
mean,35.135032,174904.5,4.415818,4.801583,10.256504,2.234114,14.985967,26.493299,8.60182,3.90303,776.983756,32.522218,59.287714,181.502288,439.647331,243.861026
std,11.037186,1415577.0,2.305062,1.673844,5.916633,1.700965,9.353937,215.388313,5.119076,2.813889,443.96846,5.135545,53.461204,196.253121,225.424866,108.853693
min,14.0,7006.52,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.23,20.832487,0.0,0.0,0.0,0.0
25%,26.0,22118.1,3.0,4.0,6.0,1.0,8.0,6.0,4.55,2.0,388.92,28.299138,16.414812,61.938256,293.841559,195.0
50%,35.0,36993.94,4.0,5.0,9.0,2.0,13.0,11.0,8.37,4.0,780.21,32.501616,46.162077,121.191802,369.698223,256.0
75%,44.0,74520.61,6.0,6.0,14.0,3.0,22.0,15.0,11.62,6.0,1182.5,36.731398,89.163419,225.891543,523.103061,329.0
max,56.0,24198060.0,10.0,10.0,34.0,9.0,60.0,4397.0,26.9,12.0,1499.92,49.564519,199.904691,1903.080048,1602.040519,404.0


In [10]:
df['Occupation'].unique()

array(['Scientist', 'Teacher', 'Entrepreneur', 'Developer', 'Lawyer',
       'Journalist', 'Engineer', 'Accountant', 'Musician', 'Architect',
       'Writer', 'Manager', 'Media_Manager', 'Doctor', 'Mechanic'],
      dtype=object)

In [11]:
import plotly.express as px


occupation_counts = df['Occupation'].value_counts().reset_index()
occupation_counts.columns = ['Occupation', 'Count']


fig = px.pie(
    occupation_counts,
    names='Occupation',
    values='Count',
    title='Distribution of Occupations'
)

fig.show()

In [12]:
df['Credit_Mix'].unique()

array(['Good', 'Standard', 'Bad'], dtype=object)

In [13]:
df['Payment_of_Min_Amount'].unique()

array(['No', 'NM', 'Yes'], dtype=object)

In [14]:
df['Payment_Behaviour'].unique()

array(['Low_spent_Small_value_payments',
       'High_spent_Medium_value_payments',
       'High_spent_Large_value_payments',
       'Low_spent_Medium_value_payments',
       'Low_spent_Large_value_payments',
       'High_spent_Small_value_payments'], dtype=object)

In [15]:
df['Credit_Score'].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

In [16]:
df['Credit_Score'].value_counts()

Credit_Score
Standard    19730
Good         7551
Poor         4430
Name: count, dtype: int64

In [17]:
df['Credit_Score'] = df['Credit_Score'].apply(lambda x: 1 if x in ['Good', 'Standard'] else 0)
print(df['Credit_Score'].value_counts())


Credit_Score
1    27281
0     4430
Name: count, dtype: int64


In [18]:
df.sample(5)

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Credit_History_Age_Months
30918,48,Writer,110711.22,4,5,11,0,9,3,18.09,...,Good,251.06,28.870885,No,0.0,114.003074,High_spent_Medium_value_payments,1075.790426,1,202
14881,20,Mechanic,39052.47,5,4,9,2,3,11,2.65,...,Good,1284.22,32.435534,No,62.824217,134.342871,High_spent_Small_value_payments,404.470162,1,396
8368,30,Journalist,29111.07,8,6,11,2,19,18,13.43,...,Standard,279.95,25.341409,Yes,29.941931,73.122103,High_spent_Medium_value_payments,397.128217,1,130
31248,25,Accountant,123915.99,3,6,8,2,2,2,8.37,...,Good,1394.91,28.345468,No,156.988147,220.730714,High_spent_Large_value_payments,895.914389,1,204
7752,28,Mechanic,81481.02,3,4,7,2,3,0,4.11,...,Standard,211.88,27.916436,No,69.379837,448.829971,High_spent_Small_value_payments,439.098692,1,294


In [19]:
categorical_cols = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score']

In [20]:
label_enc = LabelEncoder()
for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])



In [21]:
X1 = df[['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']]
y1 = df['Credit_Score']

In [22]:
chi_scores = chi2(X1, y1)
chi2_scores = pd.DataFrame({"Feature": X1.columns, "Score": chi_scores[0]})
print(chi2_scores.sort_values(by="Score", ascending=False))


                 Feature       Score
1             Credit_Mix  288.745515
3      Payment_Behaviour  127.452384
0             Occupation    4.753546
2  Payment_of_Min_Amount    0.001643


In [23]:
X_num_for_f_test = df[['Age', 'Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance', 'Credit_History_Age_Months']]

In [24]:
y_for_f_test = df['Credit_Score']

In [25]:
f_scores, p_values = f_classif(X_num_for_f_test, y_for_f_test)

In [26]:
f_scores_df = pd.DataFrame({'Feature': X_num_for_f_test.columns, 'F-Score': f_scores})

In [27]:
print(f_scores_df.sort_values(by='F-Score', ascending=False))

                      Feature      F-Score
6         Delay_from_due_date  1290.385124
3             Num_Credit_Card  1030.110256
9        Num_Credit_Inquiries   585.906436
4               Interest_Rate   453.047239
10           Outstanding_Debt   451.549924
5                 Num_of_Loan   274.356620
15  Credit_History_Age_Months    92.515418
12        Total_EMI_per_month    28.425328
8        Changed_Credit_Limit    27.130593
13    Amount_invested_monthly    20.350875
0                         Age    11.370137
2           Num_Bank_Accounts     6.047901
11   Credit_Utilization_Ratio     2.873588
7      Num_of_Delayed_Payment     2.290066
1               Annual_Income     0.064910
14            Monthly_Balance     0.030461


In [28]:
selected_features = [
    'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour',
    'Delay_from_due_date', 'Interest_Rate', 'Num_Credit_Card',
    'Num_Bank_Accounts', 'Changed_Credit_Limit', 'Num_Credit_Inquiries',
    'Num_of_Loan', 'Outstanding_Debt', 'Occupation'
]

In [29]:
X = df[selected_features]
y = df['Credit_Score'] 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Ada Boost', AdaBoostClassifier(random_state=42)),
    ('XG Boost', XGBClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB())
]

best_model = None
best_accuracy = 0.0


for name, model in models:
    
    pipeline = Pipeline([
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        # ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_accuracy = scores.mean()

    
    pipeline.fit(X_train_scaled, y_train)
    y_pred = pipeline.predict(X_test_scaled)

    
    accuracy = accuracy_score(y_test, y_pred)

    
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()

    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline


print("Best Model:", best_model)

Model: Random Forest
Cross-validation Accuracy: 0.9124490518834942
Test Accuracy: 0.9132902412107835

Model: Gradient Boosting
Cross-validation Accuracy: 0.8893094676609946
Test Accuracy: 0.8948447107047138

Model: Support Vector Machine
Cross-validation Accuracy: 0.8595474693829569
Test Accuracy: 0.8918492826738137

Model: Logistic Regression
Cross-validation Accuracy: 0.8735415787212647
Test Accuracy: 0.8795522623364339

Model: K-Nearest Neighbors
Cross-validation Accuracy: 0.8879298621676538
Test Accuracy: 0.8909033580324768

Model: Decision Tree
Cross-validation Accuracy: 0.8696783601126354
Test Accuracy: 0.8749802932366388

Model: Ada Boost
Cross-validation Accuracy: 0.8789814393730137
Test Accuracy: 0.8847548478637869

Model: XG Boost
Cross-validation Accuracy: 0.9020421825579881
Test Accuracy: 0.9052498817594198

Model: Naive Bayes
Cross-validation Accuracy: 0.8749211686748326
Test Accuracy: 0.8781333753744285

Best Model: Pipeline(steps=[('model', RandomForestClassifier(random_

In [33]:
import plotly.figure_factory as ff

cm = confusion_matrix(y_test, y_pred)
labels = ['Not Eligible', 'Eligible']


fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels,
    y=labels,
    annotation_text=cm.astype(str),
    colorscale='Blues'
)


fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Label',
    yaxis_title='True Label'
)

fig.show()

In [34]:
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report = df_report.iloc[:-1, :3]  


fig = px.imshow(
    df_report,
    text_auto=True,
    color_continuous_scale='Blues',
    title='Classification Report'
)

fig.update_layout(
    xaxis_title='Metrics',
    yaxis_title='Classes'
)

fig.show()

In [35]:
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go


y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.2f})'
))

# Diagonal reference line
fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(dash='dash'),
    name='Random Guess'
))

fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In [None]:
import pickle
pickle.dump(best_model, open('credit_score_model.pkl', 'wb'))

----

### **Making Prediction With Saved Model**

----

In [36]:
sample_data = pd.DataFrame({
    'Credit_Mix': [2],
    'Payment_of_Min_Amount': [1],
    'Payment_Behaviour': [0],
    'Delay_from_due_date': [10],
    'Interest_Rate': [10.5],
    'Num_Credit_Card': [3],
    'Num_Bank_Accounts': [5],
    'Changed_Credit_Limit': [1],
    'Num_Credit_Inquiries': [2],
    'Num_of_Loan': [2],
    'Outstanding_Debt': [5000],
    'Occupation': [2]
})


prediction =  best_model.predict(sample_data)

if prediction[0] == 1:
  print("User should be given a loan.")
else:
  print("User should not be given a loan.")


User should not be given a loan.
