In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool

In [2]:
df = pd.read_csv("sources/CreditPrediction.csv")

In [3]:
df = df.drop("Unnamed: 19", axis=1)
df = df.drop_duplicates(subset=["CLIENTNUM"], keep=False)
df = df.drop("CLIENTNUM", axis=1)
df

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,45.0,M,3,High School,Married,$60K - $80K,Blue,39.0,5.0,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,49.0,F,5,Graduate,,Less than $40K,Blue,44.0,6.0,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,51.0,M,3,Graduate,Married,$80K - $120K,Blue,36.0,4.0,1,0,3418.0,0,2.594,1887,20,2.333,0.000
3,40.0,F,4,High School,,Less than $40K,Blue,34.0,3.0,4,1,3313.0,2517,1.405,1171,20,2.333,0.760
4,40.0,M,3,Uneducated,Married,$60K - $80K,,21.0,5.0,1,0,4716.0,0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,50.0,M,2,Graduate,Single,$40K - $60K,Blue,40.0,3.0,2,3,4003.0,1851,0.703,15476,117,0.857,0.462
10123,41.0,M,2,Unknown,Divorced,$40K - $60K,Blue,25.0,4.0,2,3,4277.0,2186,0.804,8764,69,0.683,0.511
10124,44.0,F,1,High School,,Less than $40K,Blue,36.0,5.0,3,4,5409.0,0,0.819,10291,60,0.818,0.000
10125,30.0,M,2,Graduate,Unknown,$40K - $60K,Blue,36.0,4.0,3,3,5281.0,0,0.535,8395,62,0.722,0.000


In [9]:
features = df.columns.difference(['Credit_Limit'])  # All columns except 'SalePrice'

target = 'Credit_Limit'

# Convert categorical features to strings

categorical_features = df[features].select_dtypes(include=['object']).columns

for feature in categorical_features:

    df[feature] = df[feature].astype(str)
 
# Split data into features and target

X = df[features]

y = df[target]
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Find categorical features for CatBoost

categorical_features_indices = np.where(X.dtypes == 'object')[0]



0        12691.0
1         8256.0
2         3418.0
3         3313.0
4         4716.0
          ...   
10122     4003.0
10123     4277.0
10124     5409.0
10125     5281.0
10126    10388.0
Name: Credit_Limit, Length: 10087, dtype: float64

In [10]:
# Check for missing values

missing_values = df.isnull().sum().sort_values(ascending=False)

missing_values = missing_values[missing_values > 0]

print("\nColumns with missing values:\n", missing_values)


Columns with missing values:
 Months_on_book              219
Total_Relationship_Count     20
dtype: int64


In [27]:
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)

test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features_indices)
 
# Train the CatBoost model

model = CatBoostRegressor(
    iterations=10000,
    learning_rate=0.1,
    eval_metric='RMSE',
    logging_level='Verbose',  # Can be 'Silent' to reduce output
    use_best_model=True, nan_mode='Min'
    
)

# Train the model with evaluation
model.fit(
    train_pool,
    eval_set=test_pool,
    early_stopping_rounds=50  # Stops if validation RMSE doesn't improve for 50 rounds
)

0:	learn: 8527.8356838	test: 8339.4374498	best: 8339.4374498 (0)	total: 22.3ms	remaining: 3m 42s
1:	learn: 7974.5902486	test: 7809.0306246	best: 7809.0306246 (1)	total: 46.3ms	remaining: 3m 51s
2:	learn: 7471.7994126	test: 7323.4079176	best: 7323.4079176 (2)	total: 69.1ms	remaining: 3m 50s
3:	learn: 6990.8229412	test: 6868.8991431	best: 6868.8991431 (3)	total: 90.8ms	remaining: 3m 47s
4:	learn: 6550.2294814	test: 6461.4789594	best: 6461.4789594 (4)	total: 112ms	remaining: 3m 43s
5:	learn: 6178.6958826	test: 6111.2158944	best: 6111.2158944 (5)	total: 132ms	remaining: 3m 39s
6:	learn: 5833.6410734	test: 5787.7453380	best: 5787.7453380 (6)	total: 152ms	remaining: 3m 37s
7:	learn: 5525.7166665	test: 5491.6444860	best: 5491.6444860 (7)	total: 172ms	remaining: 3m 34s
8:	learn: 5248.1800512	test: 5228.2633874	best: 5228.2633874 (8)	total: 193ms	remaining: 3m 34s
9:	learn: 5031.4582284	test: 5017.3600622	best: 5017.3600622 (9)	total: 216ms	remaining: 3m 35s
10:	learn: 4813.7232867	test: 4804.0

<catboost.core.CatBoostRegressor at 0x1e159e85c90>

In [28]:
# Make predictions on the test set
from sklearn.metrics import mean_squared_error, r2_score
y_pred = model.predict(test_pool)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mse

9264082.919834448