In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Dataframe

Let's upload the dataframe and perform a verification of its values.

In [7]:
# Load train dataset
df = pd.read_csv("./supply_chain_train.csv")
# Load test dataset
df_test = pd.read_csv("./supply_chain_test.csv")
df

Unnamed: 0,train_idx,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,0,713071383,54,F,1,Unknown,Single,Unknown,Blue,36,...,3,3723.0,1728,1995.0,0.595,8554,99,0.678,0.464,1
1,1,714246333,58,F,4,High School,Married,Unknown,Blue,48,...,3,5396.0,1803,3593.0,0.493,2107,39,0.393,0.334,0
2,2,718206783,45,F,4,Unknown,Single,Less than $40K,Gold,36,...,3,15987.0,1648,14339.0,0.732,1436,36,1.250,0.103,1
3,3,721096983,34,F,2,Graduate,Single,Less than $40K,Blue,36,...,4,3625.0,2517,1108.0,1.158,2616,46,1.300,0.694,1
4,4,720028683,49,F,2,High School,Married,$40K - $60K,Blue,39,...,4,2720.0,1926,794.0,0.602,3806,61,0.794,0.708,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8096,8096,769053033,44,F,1,Graduate,Single,$40K - $60K,Blue,38,...,5,4142.0,2517,1625.0,0.809,2104,44,0.833,0.608,0
8097,8097,714406158,53,F,3,High School,Divorced,Unknown,Blue,36,...,6,7939.0,0,7939.0,0.551,2269,42,0.312,0.000,0
8098,8098,714140133,42,F,4,Graduate,Unknown,Less than $40K,Blue,32,...,2,2314.0,1547,767.0,0.804,4678,74,1.000,0.669,1
8099,8099,720244983,40,M,3,Unknown,Single,$40K - $60K,Blue,28,...,1,3563.0,1707,1856.0,0.506,1482,42,0.312,0.479,1


In [8]:
# Let's split the train dataset into train and validation
df_train, df_validation = train_test_split(df, test_size=0.2, shuffle = True)

Let us examine the potential values in each column of the dataset for any anomalous entries, such as extreme or abnormal values, null values, or any other forms of erroneous data.

In [4]:
for name in df_train.columns:
    print(f"\n {name.upper()}: {df_train[name].unique()}")


 TRAIN_IDX: [5386  888 7582 ... 5994 3214  926]

 CLIENTNUM: [780055308 714190983 779070033 ... 708968733 717797658 710455608]

 CUSTOMER_AGE: [43 51 63 55 54 37 50 41 35 40 47 61 46 44 52 45 38 65 58 36 49 57 29 33
 53 60 39 32 34 56 28 42 48 59 27 30 62 31 26 64 66 67 68]

 GENDER: ['F' 'M']

 DEPENDENT_COUNT: [3 2 0 5 4 1]

 EDUCATION_LEVEL: ['College' 'Graduate' 'Unknown' 'High School' 'Uneducated' 'Doctorate'
 'Post-Graduate']

 MARITAL_STATUS: ['Single' 'Divorced' 'Unknown' 'Married']

 INCOME_CATEGORY: ['Less than $40K' 'Unknown' '$80K - $120K' '$40K - $60K' '$120K +'
 '$60K - $80K']

 CARD_CATEGORY: ['Blue' 'Platinum' 'Silver' 'Gold']

 MONTHS_ON_BOOK: [28 32 44 50 36 42 31 29 26 39 52 34 19 35 33 56 37 27 46 41 48 18 24 15
 45 22 21 13 30 40 23 38 55 47 14 16 54 51 25 43 49 20 53 17]

 TOTAL_RELATIONSHIP_COUNT: [3 2 4 6 1 5]

 MONTHS_INACTIVE_12_MON: [1 3 4 6 2 5 0]

 CONTACTS_COUNT_12_MON: [3 2 4 1 5 6 0]

 CREDIT_LIMIT: [ 1438.3 34516.   1938.  ...  9684.  31987.   4620. ]


We should also verify the data types of each column in the dataset.

In [5]:
df_train.dtypes

train_idx                     int64
CLIENTNUM                     int64
Customer_Age                  int64
Gender                       object
Dependent_count               int64
Education_Level              object
Marital_Status               object
Income_Category              object
Card_Category                object
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
Attrition_Flag                int64
dtype: object

Next, we can create the "X" dataset by excluding the "Attrition_Flag" column, which is the target variable that we aim to predict. Conversely, the "Attrition_Flag" column will become the "y" dataset.

In [6]:
X_train, y_train = df_train.drop(['train_idx', 'CLIENTNUM', 'Attrition_Flag'] , axis = 1), np.array(df_train['Attrition_Flag'])
X_validation, y_validation = df_validation.drop(['train_idx', 'CLIENTNUM', 'Attrition_Flag'] , axis = 1), np.array(df_validation['Attrition_Flag'])
X_test = df_test.drop(['test_idx', 'CLIENTNUM'] , axis = 1)
print(X_train.shape, y_train.shape, X_test.shape)

(6480, 19) (6480,) (2026, 19)


# Model 

As the major part of the data is categorical a natural choise for the model is CatBoost.

CatBoost is a gradient boosting algorithm designed to work with categorical features. It was developed by Yandex, a Russian technology company, and is known for its ability to handle high-cardinality categorical variables, which are common in real-world datasets.

Here's how CatBoost works:

Encoding categorical features: CatBoost uses an efficient algorithm to encode categorical features. It can handle both numerical and categorical features, and it can automatically detect which features are categorical.

Building decision trees: CatBoost builds a decision tree ensemble, which consists of multiple decision trees. Each tree is built using a subset of the training data, and the algorithm uses gradient boosting to combine the trees.

Handling missing values: CatBoost can handle missing values in the input data. It uses gradient boosting to predict missing values, based on the available data.

Regularization: CatBoost uses regularization techniques to prevent overfitting. It applies L1, L2, and gradient-based regularization to the decision trees.

Early stopping: CatBoost uses early stopping to prevent overfitting. It stops training when the validation loss stops improving.

Prediction: To make predictions, CatBoost averages the predictions of all the decision trees in the ensemble.

Overall, CatBoost is a powerful algorithm for handling categorical data, and it has achieved state-of-the-art results in many machine learning tasks.

In [9]:
# First, select which columns are categorical
cat_features = np.array(df_train.columns[df_train.dtypes == 'object'])
# Create the model
model = CatBoostClassifier(iterations=200, learning_rate=0.1, depth=3)
# Train the model
model.fit(X_train, y_train, cat_features)

0:	learn: 0.5828113	total: 51.8ms	remaining: 10.3s
1:	learn: 0.5060745	total: 57ms	remaining: 5.64s
2:	learn: 0.4465348	total: 61.4ms	remaining: 4.03s
3:	learn: 0.3954406	total: 66ms	remaining: 3.23s
4:	learn: 0.3735718	total: 70.6ms	remaining: 2.75s
5:	learn: 0.3501029	total: 74.5ms	remaining: 2.41s
6:	learn: 0.3257990	total: 78.8ms	remaining: 2.17s
7:	learn: 0.3079383	total: 82.9ms	remaining: 1.99s
8:	learn: 0.2961382	total: 87.3ms	remaining: 1.85s
9:	learn: 0.2853776	total: 91.2ms	remaining: 1.73s
10:	learn: 0.2791387	total: 95.6ms	remaining: 1.64s
11:	learn: 0.2702550	total: 99.8ms	remaining: 1.56s
12:	learn: 0.2598737	total: 104ms	remaining: 1.49s
13:	learn: 0.2425743	total: 108ms	remaining: 1.43s
14:	learn: 0.2382485	total: 110ms	remaining: 1.36s
15:	learn: 0.2343197	total: 113ms	remaining: 1.3s
16:	learn: 0.2294525	total: 115ms	remaining: 1.24s
17:	learn: 0.2252930	total: 118ms	remaining: 1.19s
18:	learn: 0.2219465	total: 121ms	remaining: 1.15s
19:	learn: 0.2120109	total: 124ms	

<catboost.core.CatBoostClassifier at 0x7fc27e116920>

After training the model, we can make predictions for each dataset and subsequently compute their corresponding F1 scores.

In [19]:
# Predict for the train dataset (big models allow to obaint 1.0 of f1 score)
y_pred_train = model.predict_proba(X_train)
y_pred_train = (y_pred_train[:,0] < y_pred_train[:,1]) + [0]
# Predcit for the validation dataset
y_pred_validation = model.predict_proba(X_validation)
y_pred_validation = (y_pred_validation[:,0] < y_pred_validation[:,1]) + [0]
# Predict for the dest dataset
y_pred_test = model.predict_proba(X_test)
y_pred_test = (y_pred_test[:,0] < y_pred_test[:,1]) + [0]

In [20]:
# Print the f1 score for each of the datasets
print("Train f1 score: ", f1_score(y_train, y_pred_train, average='binary'))
print("Validation f1 score: ", f1_score(y_validation, y_pred_validation, average='binary'))

Train f1 score:  0.9994854076306696
Validation f1 score:  1.0


In [23]:
# Export the test predicted values in json format
df_final = df_test[['test_idx']]
df_final.insert(1, "target", y_pred_test, True)
df_final = df_final.drop("test_idx", axis = 1)
df_final.to_json('./predictions.json')

With this we obtain a score of 0.9369541217249626. Let's try to improve it!

# Bigger model
After several tries we have obtained the best scores with a slightly bigger model. Even bigger models show overffiting. The obained score is of 0.9386097812253803. Using a higher amount of iterations (300 instead of 200) improves even more the score: 0.9389207114862829.

In [22]:
cat_features = np.array(df_train.columns[df_train.dtypes == 'object'])
model = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=5)
model.fit(X_train, y_train, cat_features)
y_pred_train = model.predict_proba(X_train)
y_pred_train = (y_pred_train[:,0] < y_pred_train[:,1]) + [0]
y_pred_validation = model.predict_proba(X_validation)
y_pred_validation = (y_pred_validation[:,0] < y_pred_validation[:,1]) + [0]
y_pred_test = model.predict_proba(X_test)
y_pred_test = (y_pred_test[:,0] < y_pred_test[:,1]) + [0]
print("Train f1 score: ", f1_score(y_train, y_pred_train, average='binary'))
print("Validation f1 score: ", f1_score(y_validation, y_pred_validation, average='binary'))
df_final = df_test[['test_idx']]
df_final.insert(1, "target", y_pred_test, True)
df_final = df_final.drop("test_idx", axis = 1)
df_final.to_json('./predictions_v2.json')

0:	learn: 0.5765947	total: 6.11ms	remaining: 1.83s
1:	learn: 0.4795162	total: 12.6ms	remaining: 1.87s
2:	learn: 0.4178818	total: 18.6ms	remaining: 1.84s
3:	learn: 0.3701079	total: 25.1ms	remaining: 1.86s
4:	learn: 0.3403494	total: 31ms	remaining: 1.83s
5:	learn: 0.3088408	total: 37.1ms	remaining: 1.82s
6:	learn: 0.2858648	total: 43.2ms	remaining: 1.81s
7:	learn: 0.2683470	total: 49.4ms	remaining: 1.8s
8:	learn: 0.2574700	total: 53.9ms	remaining: 1.74s
9:	learn: 0.2330701	total: 58.8ms	remaining: 1.71s
10:	learn: 0.2205480	total: 63.8ms	remaining: 1.68s
11:	learn: 0.2127615	total: 68.6ms	remaining: 1.65s
12:	learn: 0.2009719	total: 73.6ms	remaining: 1.62s
13:	learn: 0.1957583	total: 77.9ms	remaining: 1.59s
14:	learn: 0.1885590	total: 82.1ms	remaining: 1.56s
15:	learn: 0.1839061	total: 86.2ms	remaining: 1.53s
16:	learn: 0.1782684	total: 90.7ms	remaining: 1.51s
17:	learn: 0.1744621	total: 94.7ms	remaining: 1.48s
18:	learn: 0.1724108	total: 98.7ms	remaining: 1.46s
19:	learn: 0.1692424	tota

165:	learn: 0.0505283	total: 608ms	remaining: 490ms
166:	learn: 0.0500717	total: 611ms	remaining: 487ms
167:	learn: 0.0498828	total: 615ms	remaining: 483ms
168:	learn: 0.0498828	total: 618ms	remaining: 479ms
169:	learn: 0.0496842	total: 621ms	remaining: 475ms
170:	learn: 0.0495157	total: 625ms	remaining: 471ms
171:	learn: 0.0493134	total: 628ms	remaining: 468ms
172:	learn: 0.0490417	total: 632ms	remaining: 464ms
173:	learn: 0.0487244	total: 635ms	remaining: 460ms
174:	learn: 0.0486122	total: 639ms	remaining: 456ms
175:	learn: 0.0483172	total: 642ms	remaining: 452ms
176:	learn: 0.0481864	total: 645ms	remaining: 448ms
177:	learn: 0.0480473	total: 649ms	remaining: 445ms
178:	learn: 0.0479508	total: 652ms	remaining: 441ms
179:	learn: 0.0478882	total: 656ms	remaining: 437ms
180:	learn: 0.0475025	total: 659ms	remaining: 433ms
181:	learn: 0.0474211	total: 663ms	remaining: 430ms
182:	learn: 0.0472980	total: 666ms	remaining: 426ms
183:	learn: 0.0471614	total: 670ms	remaining: 422ms
184:	learn: 

Let's use the whole dataset to make the prediction

In [24]:
df_train = pd.read_csv("./supply_chain_train.csv")


X_train, y_train = df_train.drop(['train_idx', 'CLIENTNUM', 'Attrition_Flag'] , axis = 1), np.array(df_train['Attrition_Flag'])
X_validation, y_validation = df_validation.drop(['train_idx', 'CLIENTNUM', 'Attrition_Flag'] , axis = 1), np.array(df_validation['Attrition_Flag'])
X_test = df_test.drop(['test_idx', 'CLIENTNUM'] , axis = 1)

cat_features = np.array(df_train.columns[df_train.dtypes == 'object'])
model = CatBoostClassifier(iterations=900, learning_rate=0.1, depth=5)
model.fit(X_train, y_train, cat_features)
y_pred_train = model.predict_proba(X_train)
y_pred_train = (y_pred_train[:,0] < y_pred_train[:,1]) + [0]
y_pred_test = model.predict_proba(X_test)
y_pred_test = (y_pred_test[:,0] < y_pred_test[:,1]) + [0]
print("Train f1 score: ", f1_score(y_train, y_pred_train, average='binary'))
df_final = df_test[['test_idx']]
df_final.insert(1, "target", y_pred_test, True)
df_final = df_final.drop("test_idx", axis = 1)
df_final.to_json('./predictions.json')

0:	learn: 0.5765947	total: 6.54ms	remaining: 5.88s
1:	learn: 0.4795162	total: 13ms	remaining: 5.83s
2:	learn: 0.4178818	total: 18.9ms	remaining: 5.66s
3:	learn: 0.3701079	total: 24.9ms	remaining: 5.58s
4:	learn: 0.3403494	total: 31ms	remaining: 5.55s
5:	learn: 0.3088408	total: 37.7ms	remaining: 5.62s
6:	learn: 0.2858648	total: 44.3ms	remaining: 5.65s
7:	learn: 0.2683470	total: 51.8ms	remaining: 5.78s
8:	learn: 0.2574700	total: 57.8ms	remaining: 5.72s
9:	learn: 0.2330701	total: 63.2ms	remaining: 5.62s
10:	learn: 0.2205480	total: 68.4ms	remaining: 5.53s
11:	learn: 0.2127615	total: 73.6ms	remaining: 5.45s
12:	learn: 0.2009719	total: 78.7ms	remaining: 5.37s
13:	learn: 0.1957583	total: 83.9ms	remaining: 5.31s
14:	learn: 0.1885590	total: 88.7ms	remaining: 5.23s
15:	learn: 0.1839061	total: 93.1ms	remaining: 5.14s
16:	learn: 0.1782684	total: 97.1ms	remaining: 5.04s
17:	learn: 0.1744621	total: 101ms	remaining: 4.93s
18:	learn: 0.1724108	total: 104ms	remaining: 4.83s
19:	learn: 0.1692424	total: 

162:	learn: 0.0512870	total: 607ms	remaining: 2.75s
163:	learn: 0.0511319	total: 611ms	remaining: 2.74s
164:	learn: 0.0508607	total: 615ms	remaining: 2.74s
165:	learn: 0.0505283	total: 619ms	remaining: 2.73s
166:	learn: 0.0500717	total: 622ms	remaining: 2.73s
167:	learn: 0.0498828	total: 625ms	remaining: 2.72s
168:	learn: 0.0498828	total: 628ms	remaining: 2.72s
169:	learn: 0.0496842	total: 633ms	remaining: 2.72s
170:	learn: 0.0495157	total: 636ms	remaining: 2.71s
171:	learn: 0.0493134	total: 640ms	remaining: 2.71s
172:	learn: 0.0490417	total: 643ms	remaining: 2.7s
173:	learn: 0.0487244	total: 647ms	remaining: 2.7s
174:	learn: 0.0486122	total: 651ms	remaining: 2.7s
175:	learn: 0.0483172	total: 655ms	remaining: 2.69s
176:	learn: 0.0481864	total: 658ms	remaining: 2.69s
177:	learn: 0.0480473	total: 662ms	remaining: 2.69s
178:	learn: 0.0479508	total: 666ms	remaining: 2.68s
179:	learn: 0.0478882	total: 669ms	remaining: 2.68s
180:	learn: 0.0475025	total: 673ms	remaining: 2.67s
181:	learn: 0.0

333:	learn: 0.0287243	total: 1.21s	remaining: 2.04s
334:	learn: 0.0286647	total: 1.21s	remaining: 2.04s
335:	learn: 0.0286256	total: 1.21s	remaining: 2.04s
336:	learn: 0.0285297	total: 1.22s	remaining: 2.03s
337:	learn: 0.0284527	total: 1.22s	remaining: 2.03s
338:	learn: 0.0282963	total: 1.22s	remaining: 2.02s
339:	learn: 0.0282470	total: 1.23s	remaining: 2.02s
340:	learn: 0.0281576	total: 1.23s	remaining: 2.02s
341:	learn: 0.0280499	total: 1.23s	remaining: 2.01s
342:	learn: 0.0280122	total: 1.24s	remaining: 2.01s
343:	learn: 0.0278826	total: 1.24s	remaining: 2s
344:	learn: 0.0278238	total: 1.24s	remaining: 2s
345:	learn: 0.0276891	total: 1.25s	remaining: 2s
346:	learn: 0.0276113	total: 1.25s	remaining: 1.99s
347:	learn: 0.0275393	total: 1.25s	remaining: 1.99s
348:	learn: 0.0274428	total: 1.26s	remaining: 1.99s
349:	learn: 0.0273719	total: 1.26s	remaining: 1.98s
350:	learn: 0.0273289	total: 1.26s	remaining: 1.98s
351:	learn: 0.0271631	total: 1.27s	remaining: 1.97s
352:	learn: 0.0270601

506:	learn: 0.0182377	total: 1.81s	remaining: 1.4s
507:	learn: 0.0181852	total: 1.81s	remaining: 1.4s
508:	learn: 0.0181139	total: 1.81s	remaining: 1.39s
509:	learn: 0.0180498	total: 1.82s	remaining: 1.39s
510:	learn: 0.0180301	total: 1.82s	remaining: 1.39s
511:	learn: 0.0180300	total: 1.82s	remaining: 1.38s
512:	learn: 0.0180212	total: 1.83s	remaining: 1.38s
513:	learn: 0.0179766	total: 1.83s	remaining: 1.38s
514:	learn: 0.0179724	total: 1.83s	remaining: 1.37s
515:	learn: 0.0179484	total: 1.84s	remaining: 1.37s
516:	learn: 0.0178934	total: 1.84s	remaining: 1.36s
517:	learn: 0.0178382	total: 1.84s	remaining: 1.36s
518:	learn: 0.0177491	total: 1.85s	remaining: 1.36s
519:	learn: 0.0176306	total: 1.85s	remaining: 1.35s
520:	learn: 0.0175927	total: 1.86s	remaining: 1.35s
521:	learn: 0.0175491	total: 1.86s	remaining: 1.35s
522:	learn: 0.0175193	total: 1.86s	remaining: 1.34s
523:	learn: 0.0174487	total: 1.87s	remaining: 1.34s
524:	learn: 0.0173730	total: 1.87s	remaining: 1.34s
525:	learn: 0.

698:	learn: 0.0138193	total: 2.4s	remaining: 692ms
699:	learn: 0.0138193	total: 2.41s	remaining: 688ms
700:	learn: 0.0137839	total: 2.41s	remaining: 684ms
701:	learn: 0.0137838	total: 2.41s	remaining: 681ms
702:	learn: 0.0137353	total: 2.42s	remaining: 677ms
703:	learn: 0.0137353	total: 2.42s	remaining: 674ms
704:	learn: 0.0137059	total: 2.42s	remaining: 670ms
705:	learn: 0.0136597	total: 2.43s	remaining: 667ms
706:	learn: 0.0136597	total: 2.43s	remaining: 663ms
707:	learn: 0.0136160	total: 2.43s	remaining: 660ms
708:	learn: 0.0136160	total: 2.44s	remaining: 656ms
709:	learn: 0.0136159	total: 2.44s	remaining: 653ms
710:	learn: 0.0136159	total: 2.44s	remaining: 649ms
711:	learn: 0.0135621	total: 2.44s	remaining: 645ms
712:	learn: 0.0135156	total: 2.45s	remaining: 642ms
713:	learn: 0.0134723	total: 2.45s	remaining: 639ms
714:	learn: 0.0134483	total: 2.45s	remaining: 635ms
715:	learn: 0.0134482	total: 2.46s	remaining: 632ms
716:	learn: 0.0134357	total: 2.46s	remaining: 628ms
717:	learn: 0

899:	learn: 0.0116774	total: 3s	remaining: 0us
Train f1 score:  0.9994854076306696


Our final score is 0.9474488228749293. A noteworthy aspect to investigate is the significant discrepancy between the validation and test scores. One hypothesis is that the dataset was not randomly partitioned. We presume that if the dataset had been randomly divided, the score would have been similar to the validation score, ranging between 0.98 and 0.99.