In [584]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [585]:
df = pd.read_csv("sources/CreditPrediction.csv")
df

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Unnamed: 19
0,768805383,45.0,M,3,High School,...,1144,42,1.625,0.061,
1,818770008,49.0,F,5,Graduate,...,1291,33,3.714,0.105,
2,713982108,51.0,M,3,Graduate,...,1887,20,2.333,0.000,
3,769911858,40.0,F,4,High School,...,1171,20,2.333,0.760,
4,709106358,40.0,M,3,Uneducated,...,816,28,2.500,0.000,
...,...,...,...,...,...,...,...,...,...,...,...
10162,718673358,35.0,M,3,Doctorate,...,2137,52,0.486,0.112,
10163,715207458,46.0,F,1,Unknown,...,4802,90,0.800,0.529,
10164,803665983,52.0,M,0,Unknown,...,3829,72,0.532,0.796,
10165,713183508,39.0,F,1,High School,...,4861,82,0.822,0.421,


In [586]:
df = df.drop("Unnamed: 19", axis=1)
df = df.drop_duplicates(subset=["CLIENTNUM"], keep=False)
df = df.drop("CLIENTNUM", axis=1)
df = df.drop("Total_Ct_Chng_Q4_Q1", axis=1)


z_score = (df['Customer_Age'] - df['Customer_Age'].mean()) / df['Customer_Age'].std()
outliers = df[(abs(z_score) > 3)]
df = df.drop(outliers.index)

z_score = (df['Total_Relationship_Count'] - df['Total_Relationship_Count'].mean()) / df['Total_Relationship_Count'].std()
outliers = df[(abs(z_score) > 3)]
df = df.drop(outliers.index)

z_score = (df['Avg_Utilization_Ratio'] - df['Avg_Utilization_Ratio'].mean()) / df['Avg_Utilization_Ratio'].std()
outliers = df[(abs(z_score) > 3)]
df = df.drop(outliers.index)

df

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,...,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,45.0,M,3,High School,Married,...,777,1.335,1144,42,0.061
1,49.0,F,5,Graduate,,...,864,1.541,1291,33,0.105
2,51.0,M,3,Graduate,Married,...,0,2.594,1887,20,0.000
3,40.0,F,4,High School,,...,2517,1.405,1171,20,0.760
4,40.0,M,3,Uneducated,Married,...,0,2.175,816,28,0.000
...,...,...,...,...,...,...,...,...,...,...,...
10122,50.0,M,2,Graduate,Single,...,1851,0.703,15476,117,0.462
10123,41.0,M,2,Unknown,Divorced,...,2186,0.804,8764,69,0.511
10124,44.0,F,1,High School,,...,0,0.819,10291,60,0.000
10125,30.0,M,2,Graduate,Unknown,...,0,0.535,8395,62,0.000


In [None]:
df['Months_on_book'].fillna(value=df['Months_on_book'].mean(), inplace=True)
df['Total_Relationship_Count'].fillna(value=df['Total_Relationship_Count'].mean(), inplace=True)

In [587]:
X, y = df.drop('Credit_Limit', axis=1), df[['Credit_Limit']]
y

Unnamed: 0,Credit_Limit
0,12691.0
1,8256.0
2,3418.0
3,3313.0
4,4716.0
...,...
10122,4003.0
10123,4277.0
10124,5409.0
10125,5281.0


In [588]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

X.dtypes

Customer_Age                 float64
Gender                      category
Dependent_count                int64
Education_Level             category
Marital_Status              category
Income_Category             category
Card_Category               category
Months_on_book               float64
Total_Relationship_Count     float64
Months_Inactive_12_mon         int64
Contacts_Count_12_mon          int64
Total_Revolving_Bal            int64
Total_Amt_Chng_Q4_Q1         float64
Total_Trans_Amt                int64
Total_Trans_Ct                 int64
Avg_Utilization_Ratio        float64
dtype: object

In [589]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_test

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,...,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
1314,37.0,M,1,High School,Married,...,1799,0.750,1754,45,0.174
5518,52.0,F,0,Graduate,Married,...,1284,0.902,3989,62,0.716
8526,41.0,F,5,Graduate,Married,...,970,0.864,2801,51,0.645
9013,38.0,F,1,College,Unknown,...,1259,0.871,8677,96,0.483
3392,52.0,M,0,Unknown,,...,2247,1.003,2474,51,0.171
...,...,...,...,...,...,...,...,...,...,...,...
8290,52.0,F,1,Unknown,Married,...,2517,0.774,4608,61,0.828
720,38.0,M,2,College,Married,...,1847,0.845,1681,43,0.101
9510,51.0,F,0,Doctorate,,...,2517,0.786,13820,129,0.609
8016,36.0,F,1,Uneducated,Unknown,...,458,0.543,2383,39,0.312


In [590]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)



In [591]:

params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 25

evals = [(dtest_reg, "validation"), (dtrain_reg, "train"), ]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
   verbose_eval=1,
   # Activate early stopping
   early_stopping_rounds=5
)

[0]	validation-rmse:6818.70829	train-rmse:6976.94901
[1]	validation-rmse:5321.10996	train-rmse:5435.30979
[2]	validation-rmse:4330.04538	train-rmse:4413.91429
[3]	validation-rmse:3745.34854	train-rmse:3797.16585



    E.g. tree_method = "hist", device = "cuda"



[4]	validation-rmse:3376.12016	train-rmse:3416.74857
[5]	validation-rmse:3180.05319	train-rmse:3191.19411
[6]	validation-rmse:3055.11535	train-rmse:3051.41508
[7]	validation-rmse:2997.52200	train-rmse:2983.04942
[8]	validation-rmse:2974.31463	train-rmse:2901.41712
[9]	validation-rmse:2961.61932	train-rmse:2834.58093
[10]	validation-rmse:2954.66669	train-rmse:2778.19294
[11]	validation-rmse:2954.04778	train-rmse:2709.81852
[12]	validation-rmse:2933.56137	train-rmse:2677.76279
[13]	validation-rmse:2934.04494	train-rmse:2612.18438
[14]	validation-rmse:2929.63716	train-rmse:2578.41113
[15]	validation-rmse:2929.63693	train-rmse:2561.63946
[16]	validation-rmse:2937.69690	train-rmse:2493.04175
[17]	validation-rmse:2937.49218	train-rmse:2428.78120
[18]	validation-rmse:2934.83683	train-rmse:2406.26438
[19]	validation-rmse:2914.45119	train-rmse:2334.55251
[20]	validation-rmse:2912.52048	train-rmse:2280.49901
[21]	validation-rmse:2908.80118	train-rmse:2258.29574
[22]	validation-rmse:2908.64866	tr

In [592]:
preds = model.predict(dtest_reg)
mse = mean_squared_error(y_test, preds)
mse



    E.g. tree_method = "hist", device = "cuda"



8414842.84094371

In [593]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 2900.835


