In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training and testing CSV files into pandas DataFrames

training, testing = (
    pd.read_csv(csv_path) for csv_path in ('Training Data.csv', 'Test Data.csv')
)

0    221004
1     30996
Name: Risk_Flag, dtype: int64


In [None]:
# 'Risk_Flag' is the response variable (1 = positive, 0 = negative).
# The remaining columns are the explanatory variables:
#     a mix of numerical and categorical features.
training.head(n=5)

In [None]:
# Number of samples per 'Risk_Flag' class; comparing the two counts shows
# that the dataset is imbalanced
counts = training.loc[:, 'Risk_Flag'].value_counts()
print(counts)

In [3]:
# Undersample the majority class: keep every 'Risk_Flag == 1' row and a random,
# equally sized subset of the 'Risk_Flag == 0' rows.
# NumPy generates the permutation because `torch` is never imported in this
# notebook (the previous `torch.randperm` call raised NameError on a fresh kernel).

default = training[training['Risk_Flag'] == 1]
non_default = training[training['Risk_Flag'] == 0]

# Fixed seed so that Restart & Run All selects the same rows every time
balance_rng = np.random.default_rng(42)
# Positional indices into `non_default`; the slice keeps as many rows as there are defaults
balance_idx = balance_rng.permutation(len(non_default))[:len(default)].tolist()

non_default = non_default.iloc[balance_idx]
training = pd.concat([default, non_default])

# Verify that the training dataset is now balanced
print(training['Risk_Flag'].value_counts())

1    30996
0    30996
Name: Risk_Flag, dtype: int64


In [4]:
# Encode all categorical features as integers, for both training and testing datasets.
# Both frames share ONE category list per column, so a given label gets the same
# integer code in training and in testing. Encoding each frame on its own would
# derive the codes from that frame's own set of values, which can differ between
# the two frames (e.g. a label present in only one of them).

categorical_features = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']

# Set all columns containing categorical features to a shared categorical dtype
for feature in categorical_features:
    # Sorted union of the labels seen in either frame (missing values excluded)
    shared_labels = sorted(set(training[feature].dropna()) | set(testing[feature].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=shared_labels)
    training[feature] = training[feature].astype(shared_dtype)
    testing[feature] = testing[feature].astype(shared_dtype)

# Define those columns as separate variables
cat_columns_train = training.select_dtypes(['category']).columns
cat_columns_test = testing.select_dtypes(['category']).columns

# Replace each categorical column by its integer codes, modifying both datasets
training[cat_columns_train] = training[cat_columns_train].apply(lambda column: column.cat.codes)
testing[cat_columns_test] = testing[cat_columns_test].apply(lambda column: column.cat.codes)

In [5]:
# Cast every column of both datasets to float
training = training.astype('float')
testing = testing.astype('float')

In [6]:
# Separate the response column ('Risk_Flag') from the explanatory variables
y_train = training['Risk_Flag']
X_train = training.drop(columns=['Risk_Flag'])

In [7]:
# For simplicity, drop the 'Id', 'CITY', 'STATE' and 'Experience' columns,
# then convert the features and the labels to float32 NumPy arrays

X_train = X_train.drop(columns=['Id', 'CITY', 'STATE', 'Experience'])
X_train = X_train.to_numpy().astype(np.float32)
y_train = y_train.to_numpy().astype(np.float32)

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features, then rescale those same features
scaler = MinMaxScaler()
X_train = scaler.fit(X_train).transform(X_train)

In [9]:
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from concrete.ml.sklearn import LogisticRegression as ConcreteLogisticRegression

In [10]:
# Plaintext scikit-learn logistic regression trained on X_train / y_train
logreg = SklearnLogisticRegression()
logreg.fit(X=X_train, y=y_train)

LogisticRegression()

In [11]:
# Cast the training labels to unsigned 8-bit integers
# (presumably what the Concrete ML fit in the next cell expects; confirm)
y_train = y_train.astype(dtype=np.uint8)

In [12]:
# Concrete ML logistic regression configured with n_bits of 5 for "inputs"
# and 2 for "weights"; fit it on the training data, then compile it with
# the same training features.
q_logreg = ConcreteLogisticRegression(n_bits=dict(inputs=5, weights=2))
q_logreg.fit(X_train, y_train)
q_logreg.compile(X_train)

In [13]:
# Load the CSV whose 'risk_flag' column is used below as the test-set labels
test_results = pd.read_csv(filepath_or_buffer='Sample Prediction Dataset.csv')

In [14]:
# Keep only the 'risk_flag' column (still as a DataFrame) ...
test_results = test_results.loc[:, ['risk_flag']]

# ... and place it next to the test features, column-wise
combined_testset = pd.concat(objs=[testing, test_results], axis='columns')

In [15]:
# Preview the first rows of the combined test features and labels
combined_testset.head(n=5)

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,risk_flag
0,1.0,7393090.0,59.0,19.0,1.0,2.0,0.0,26.0,181.0,28.0,4.0,13.0,0
1,2.0,1215004.0,25.0,5.0,1.0,2.0,0.0,24.0,131.0,14.0,5.0,10.0,0
2,3.0,8901342.0,50.0,12.0,1.0,2.0,0.0,30.0,290.0,14.0,9.0,14.0,1
3,4.0,1944421.0,49.0,9.0,0.0,2.0,1.0,1.0,171.0,14.0,3.0,12.0,0
4,5.0,13429.0,25.0,18.0,1.0,2.0,1.0,12.0,39.0,28.0,13.0,11.0,0


In [16]:
# Discard the less significant features to simplify the dataset.
# The supporting analysis is not part of this notebook; it comes from:
# https://www.kaggle.com/code/paallakchopraa/loan-prediction
combined_testset = combined_testset.drop(columns=['ID', 'CITY', 'STATE', 'Experience'])

In [17]:
# Undersample the test set: keep every 'risk_flag == 1' row and a random,
# equally sized subset of the 'risk_flag == 0' rows.
# NumPy generates the permutation because `torch` is never imported in this
# notebook (the previous `torch.randperm` call raised NameError on a fresh kernel).
testset_counts = combined_testset['risk_flag'].value_counts()

default_testing = combined_testset[combined_testset['risk_flag'] == 1]
non_default_testing = combined_testset[combined_testset['risk_flag'] == 0]

# Fixed seed so that Restart & Run All selects the same rows every time
balance_rng_test = np.random.default_rng(42)
# Positional indices into `non_default_testing`, one per positive row
balance_idx_test = balance_rng_test.permutation(len(non_default_testing))[:len(default_testing)].tolist()

non_default_testing = non_default_testing.iloc[balance_idx_test]

balanced_testing = pd.concat([default_testing, non_default_testing])

In [18]:
# Class counts of the undersampled test set
balanced_testing.loc[:, 'risk_flag'].value_counts()

1    3593
0    3593
Name: risk_flag, dtype: int64

In [30]:
# Separate the test features from the 'risk_flag' labels
balanced_testing_X = balanced_testing.drop(columns=['risk_flag'])
balanced_testing_y = balanced_testing['risk_flag']

In [31]:
# Convert the test features to a float32 NumPy array and rescale them with the
# scaler that was fitted on the training features. Both models were trained on
# scaled inputs, so feeding them raw test features would make the predictions
# meaningless.
# NOTE(review): assumes the remaining test columns are in the same order as the
# training feature columns; confirm against the two CSV headers.
balanced_testing_X = scaler.transform(balanced_testing_X.to_numpy().astype(np.float32))
balanced_testing_y = balanced_testing_y.to_numpy()

In [32]:
# Draw a random sample of rows from the WHOLE balanced test set.
# `balanced_testing` is built as [all positive rows, then the negative rows], so
# shuffling only the indices 0..99 (as before) picked positive rows exclusively.
sample_size = min(100, len(balanced_testing_X))  # never ask for more rows than exist
# Seeded draw without replacement so the sample is reproducible and has no duplicates
balanced_test_idx = np.random.default_rng(42).choice(
    len(balanced_testing_X), size=sample_size, replace=False
)

# Features and true labels of the sampled rows, in the same order
balanced_test_sample = balanced_testing_X[balanced_test_idx]

balanced_test_results = balanced_testing_y[balanced_test_idx]

In [None]:
# Predict on the small sample only: it is unknown how long testing all
# 7000 samples of the balanced test set would take.
# Three prediction sets are produced: the scikit-learn model, the Concrete ML
# model, and the Concrete ML model called with execute_in_fhe=True.

y_pred_test_clear = np.asarray(logreg.predict(X=balanced_test_sample))
y_pred_test_q = q_logreg.predict(balanced_test_sample)
y_pred_test_fhe = q_logreg.predict(balanced_test_sample, execute_in_fhe=True)

In [None]:
# Accuracy (in %) of each prediction set against the true labels of the sample.
# Each model's own predictions are compared with `balanced_test_results`; the
# previous version referenced an undefined `y_test` and never used the quantised
# or FHE predictions. `np.ravel` flattens the predictions so the comparison is
# element-wise even if a predict call returns a column vector.
sklearn_accuracy = np.mean(np.ravel(y_pred_test_clear) == balanced_test_results) * 100
quantised_accuracy = np.mean(np.ravel(y_pred_test_q) == balanced_test_results) * 100
fhe_accuracy = np.mean(np.ravel(y_pred_test_fhe) == balanced_test_results) * 100

print(f"Sklearn accuracy: {sklearn_accuracy:.4f}")
print(f"Non Homomorphic Accuracy: {quantised_accuracy:.4f}")
print(f"Homomorphic Accuracy: {fhe_accuracy:.4f}")