Multiclass classification using the Corporate credit rating dataset

In [1]:
# Import relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load in Corporate Credit Rating dataset as a Pandas dataframe

df = pd.read_csv('ratings.csv')

In [3]:
# Visualise dataset

df.head()

# 'rating' is the response variable

Unnamed: 0,spid,rating,COMMEQTA,LLPLOANS,COSTTOINCOME,ROE,LIQASSTA,SIZE
0,141126,7,0.088445,0.057333,0.58216,0.177753,0.374932,18.818966
1,342066,5,0.055974,0.00964,0.526015,-0.12258,0.497892,19.072266
2,366790,10,0.154322,0.014402,2.08455,-0.166647,0.087426,16.075995
3,146854,1,0.026977,0.002951,0.248881,0.102982,0.457657,18.101099
4,319262,2,0.096891,0.002645,0.544106,0.178183,0.148163,15.758143


In [4]:
# Confirms that it is a balanced dataset

df['rating'].value_counts()

7     500
5     500
10    500
1     500
2     500
4     500
8     500
3     500
9     500
6     500
Name: rating, dtype: int64

In [5]:
df.drop(['spid'], axis=1, inplace=True)  # The 'spid' is an identifier given to each sample, and has no use for finding patterns, hence we drop it

X = df.loc[:, df.columns != 'rating']  # Selects all columns apart from 'rating' as X, the explanatory variables
y = df.loc[:, df.columns == 'rating']  # Selects rating as y, the response variable

In [6]:
# Split X and y into training and test datasets and convert to numpy

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [7]:
# Import the relevant modules from Concrete ML

from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier
from concrete.ml.sklearn import RandomForestClassifier as ConcreteRandomForestClassifier
from xgboost.sklearn import XGBClassifier as SklearnXGBClassifier
from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit

XGBoost

In [8]:
# XGBoost requires the labels to be in the range [0, 9]
# This should only be ran once

y_train, y_test = y_train - 1, y_test - 1

Below is the concise training pipeline for Concrete ML's implemenmtation of XGBoost:

(includes hyperparameter tuning)

In [9]:
# Parameter grid used for GridSearchCV
  
param_grid = {
  
  "max_depth": list(range(1, 5)),
  
  "n_estimators": list(range(1, 201, 20)),  
  
  "learning_rate": [0.01, 0.1, 1],
  
  "n_bits": [3]  # 'n_bits' controls how many bits are used to quantise each value; generally more bits mean better accuracy, but slower to run   
} 

# We use shuffle split for cross validation
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

In [None]:
# Create a grid search variable and pass in the relevant arguments,
# then fit the variable with the training data

concrete_grid_search = GridSearchCV(
    ConcreteXGBClassifier(), param_grid, cv=cv, scoring='roc_auc'
)
concrete_grid_search.fit(X_train, y_train)

In [11]:
# Set of optimal parameters for 
concrete_best_params = concrete_grid_search.best_params_

In [12]:
# Now define the actual XGBoost model using optimised parameters
concrete_model = ConcreteXGBClassifier(**concrete_best_params)

In [13]:
# Training the actual model with the training data
concrete_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=1,
              min_child_weight=1, monotone_constraints='()', n_bits=3,
              n_estimators=1, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, subsample=1, tree_method='exact',
              validate_parameters=1)

In [14]:
# Compile the model to generate a FHE circuit

concrete_model.compile(X_train[100:])

In [15]:
# Generate an array containing the random permutation of integers in [0,49]
n_sample_to_test_fhe = 50
idx_test = np.random.choice(X_test.shape[0], n_sample_to_test_fhe, replace=False)

# This is so we can select a small random sample of size 'n_sample_to_test_fhe'
# in FHE, for a relatively quick test of model accuracy

X_test_fhe = X_test[idx_test]
y_test_fhe = y_test[idx_test]

In [None]:
# Train the same model from sklearn, and evaluate the accuracy

param_grid_sklearn = {
  
  "max_depth": list(range(1, 5)),
  
  "n_estimators": list(range(1, 201, 20)),  # Tune the number of decision trees used in XGBoost. Default is 100
  
  "learning_rate": [0.01, 0.1, 1],
  
  "eval_metric": ["logloss"]
} 

sklearn_grid_search = GridSearchCV(
    SklearnXGBClassifier(), param_grid_sklearn, cv=cv, scoring='roc_auc'
).fit(X_train, y_train)

sklearn_best_params = sklearn_grid_search.best_params_
sklearn_model = SklearnXGBClassifier(**sklearn_best_params)
sklearn_model.fit(X_train, y_train)

In [26]:
y_pred_clear = sklearn_model.predict(X_test_fhe)

In [27]:
y_pred_clear_q = concrete_model.predict(X_test_fhe)

In [28]:
y_preds_fhe = concrete_model.predict(X_test_fhe, execute_in_fhe=True)

In [30]:
from sklearn.metrics import accuracy_score

print(f'Accuracy score of clear model: {accuracy_score(y_test_fhe, y_pred_clear)}')
print(f'Accuracy score of clear quantised model: {accuracy_score(y_test_fhe, y_pred_clear_q)}')
print(f'Accuracy score of FHE model: {accuracy_score(y_test_fhe, y_preds_fhe)}')

Accuracy score of clear model: 0.48
Accuracy score of clear quantised model: 0.32
Accuracy score of FHE model: 0.32


Random Forest