Importing packages

In [1]:
import pandas as pd

from pgmpy.estimators import HillClimbSearch, BicScore, MmhcEstimator, BDeuScore, PC, K2Score, BDsScore, BayesianEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from pgmpy.readwrite.BIF import BIFWriter
from pgmpy.readwrite.BIF import BIFReader
from pgmpy.metrics import correlation_score

  from .autonotebook import tqdm as notebook_tqdm


Dataset load

In [2]:
credit_dataset = pd.read_csv('src/credit_risk_dataset.csv')

print(credit_dataset.shape)

(32581, 12)


Dataset cleanup

In [3]:
credit_dataset = credit_dataset.drop(credit_dataset[credit_dataset['person_age'] > 122].index)
credit_dataset = credit_dataset.drop(credit_dataset[credit_dataset['person_emp_length'] > 122].index)

In [4]:
for i in range (12):
    print(credit_dataset.columns[i])
    print(credit_dataset.iloc[:,i].isna().sum())

person_age
0
person_income
0
person_home_ownership
0
person_emp_length
895
loan_intent
0
loan_grade
0
loan_amnt
0
loan_int_rate
3115
loan_status
0
loan_percent_income
0
cb_person_default_on_file
0
cb_person_cred_hist_length
0


In [5]:
credit_dataset['person_emp_length'].fillna(value=credit_dataset['person_emp_length'].mean(), inplace=True)
credit_dataset['loan_int_rate'].fillna(value=credit_dataset['loan_int_rate'].mean(), inplace=True)

In [6]:
for i in range (12):
    print(credit_dataset.columns[i])
    print(credit_dataset.iloc[:,i].isna().sum())

person_age
0
person_income
0
person_home_ownership
0
person_emp_length
0
loan_intent
0
loan_grade
0
loan_amnt
0
loan_int_rate
0
loan_status
0
loan_percent_income
0
cb_person_default_on_file
0
cb_person_cred_hist_length
0


In [7]:
credit_dataset.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32574.0,32574.0,32574.0,32574.0,32574.0,32574.0,32574.0,32574.0
mean,27.718426,65878.48,4.782064,9588.018051,11.011529,0.21818,0.170202,5.804108
std,6.204987,52531.94,3.979128,6320.249598,3.081657,0.413017,0.106755,4.053873
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,11.011529,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.11,0.0,0.23,8.0
max,94.0,2039784.0,41.0,35000.0,23.22,1.0,0.83,30.0


In [8]:
def save_as_csv(dataframe, filename, index=False):
    dataframe.to_csv(path_or_buf=filename, index=index)
    print(f'{filename} saved')

def save_as_excel(dataframe, filename, index=True):
    dataframe.to_excel(filename, index=index)
    print(f'{filename} saved')

In [9]:
save_as_csv(credit_dataset, 'src/credit_risk_dataset_clean.csv')

src/credit_risk_dataset_clean.csv saved


Discretizing the dataset

In [10]:
discretized_dataset = pd.DataFrame()
discretized_dataset['person_age'] = pd.cut(credit_dataset['person_age'],
                                    right=False, 
                                    bins=[20,22,25,30,40,100], 
                                    labels=['young', 'young_adult', 'adult', 'middle_aged', 'senior'])

discretized_dataset['person_income'] = pd.cut(credit_dataset['person_income'],
                                    right=False, 
                                    bins=[0,10000,40000,80000,300000,2500000], 
                                    labels=['low', 'lower_middle', 'middle', 'upper_middle', 'high'])

discretized_dataset['person_emp_length'] = pd.cut(credit_dataset['person_emp_length'],
                                    right=False,
                                    bins=[0,2,5,10,20,42],
                                    labels=['unemployed', 'entry_level', 'mid_level', 'senior_level', 'retired'])

discretized_dataset['loan_amnt'] = pd.cut(credit_dataset['loan_amnt'],
                                    right=False,
                                    bins=[0,5000,10000,15000,25000,36000],
                                    labels=['low', 'lower_middle', 'middle', 'upper_middle', 'high'])

discretized_dataset['loan_int_rate'] = pd.cut(credit_dataset['loan_int_rate'],
                                    right=False,
                                    bins=[5,10,17.5,25],
                                    labels=['low', 'middle', 'high'])

discretized_dataset['loan_percent_income'] = pd.cut(credit_dataset['loan_percent_income'],
                                    right=False,
                                    bins=[0,0.1,0.2,0.4,0.85],
                                    labels=['low', 'lower_middle', 'middle', 'high'])

discretized_dataset['cb_person_cred_hist_length'] = pd.cut(credit_dataset['cb_person_cred_hist_length'],
                                    right=False,
                                    bins=[0,5,10,20,35],
                                    labels=['low', 'lower_middle', 'middle', 'high'])

discretized_dataset['person_home_ownership'] = credit_dataset['person_home_ownership']
discretized_dataset['loan_intent'] = credit_dataset['loan_intent']
discretized_dataset['loan_grade'] = credit_dataset['loan_grade']
discretized_dataset['cb_person_default_on_file'] = credit_dataset['cb_person_default_on_file']
discretized_dataset['loan_status'] = credit_dataset['loan_status']

In [11]:
save_as_csv(discretized_dataset, 'discretized_dataset.csv')

discretized_dataset.csv saved


In [12]:
dataset = pd.read_csv('src/discretized_dataset.csv')
dataset.shape

(32409, 12)

In [13]:
dataset['loan_status'].value_counts(normalize=True)

0    0.781295
1    0.218705
Name: loan_status, dtype: float64

Learning the a Bayesian Network

Blacklisting some edges

In [14]:
nodes = ['person_age','person_income','person_home_ownership','person_emp_length','loan_intent','loan_grade','loan_amnt','loan_int_rate','loan_status','loan_percent_income','cb_person_default_on_file','cb_person_cred_hist_length']

input_ = ['person_age']
age_dependent = ['person_emp_length']
person_dependent = ['person_income', 'person_home_ownership']
earlier_loan_dependent = ['cb_person_default_on_file']
loan_dependent = ['loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
output_ = ['loan_status']

restricted_edges = []

In [15]:
restricted_edges += [(a, b) for a in age_dependent for b in input_]
restricted_edges += [(a, b) for a in person_dependent for b in input_ + age_dependent]
restricted_edges += [(a, b) for a in earlier_loan_dependent for b in input_ + age_dependent + person_dependent]
restricted_edges += [(a, b) for a in loan_dependent for b in input_ + age_dependent + person_dependent + earlier_loan_dependent]
restricted_edges += [(a, b) for a in output_ for b in input_ + age_dependent + person_dependent + earlier_loan_dependent + loan_dependent]

restricted_edges += [('loan_percent_income', 'loan_amnt'), ('loan_percent_income', 'person_income')]

restricted_edges

[('person_emp_length', 'person_age'),
 ('person_income', 'person_age'),
 ('person_income', 'person_emp_length'),
 ('person_home_ownership', 'person_age'),
 ('person_home_ownership', 'person_emp_length'),
 ('cb_person_default_on_file', 'person_age'),
 ('cb_person_default_on_file', 'person_emp_length'),
 ('cb_person_default_on_file', 'person_income'),
 ('cb_person_default_on_file', 'person_home_ownership'),
 ('loan_intent', 'person_age'),
 ('loan_intent', 'person_emp_length'),
 ('loan_intent', 'person_income'),
 ('loan_intent', 'person_home_ownership'),
 ('loan_intent', 'cb_person_default_on_file'),
 ('loan_grade', 'person_age'),
 ('loan_grade', 'person_emp_length'),
 ('loan_grade', 'person_income'),
 ('loan_grade', 'person_home_ownership'),
 ('loan_grade', 'cb_person_default_on_file'),
 ('loan_amnt', 'person_age'),
 ('loan_amnt', 'person_emp_length'),
 ('loan_amnt', 'person_income'),
 ('loan_amnt', 'person_home_ownership'),
 ('loan_amnt', 'cb_person_default_on_file'),
 ('loan_int_rate',

Structure learning

In [16]:
hc = HillClimbSearch(dataset)
best_model = hc.estimate(black_list=restricted_edges, scoring_method=K2Score(dataset))
print(best_model.edges())

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 23/1000000 [00:12<154:46:05,  1.79it/s]

[('person_age', 'cb_person_cred_hist_length'), ('person_age', 'person_emp_length'), ('person_age', 'loan_intent'), ('person_age', 'person_income'), ('person_income', 'loan_percent_income'), ('person_income', 'loan_amnt'), ('person_income', 'person_home_ownership'), ('person_emp_length', 'person_income'), ('person_emp_length', 'person_home_ownership'), ('person_emp_length', 'cb_person_cred_hist_length'), ('loan_amnt', 'loan_percent_income'), ('loan_percent_income', 'loan_status'), ('cb_person_cred_hist_length', 'loan_intent'), ('person_home_ownership', 'loan_status'), ('person_home_ownership', 'loan_intent'), ('person_home_ownership', 'loan_grade'), ('person_home_ownership', 'cb_person_default_on_file'), ('loan_intent', 'loan_status'), ('loan_grade', 'loan_int_rate'), ('loan_grade', 'loan_status'), ('loan_grade', 'loan_amnt'), ('cb_person_default_on_file', 'loan_grade')]





In [17]:
def string_output(best_model_edges):
    result = 'digraph G \n {\n'
    for arc in best_model_edges:
        result += '\t' + arc[0] + ' -> ' + arc[1] + ';\n'
    result += '}'
    return result

In [18]:
print(f'A graph obtained by HC:\n{string_output(best_model.edges())}')

A graph obtained by HC:
digraph G 
 {
	person_age -> cb_person_cred_hist_length;
	person_age -> person_emp_length;
	person_age -> loan_intent;
	person_age -> person_income;
	person_income -> loan_percent_income;
	person_income -> loan_amnt;
	person_income -> person_home_ownership;
	person_emp_length -> person_income;
	person_emp_length -> person_home_ownership;
	person_emp_length -> cb_person_cred_hist_length;
	loan_amnt -> loan_percent_income;
	loan_percent_income -> loan_status;
	cb_person_cred_hist_length -> loan_intent;
	person_home_ownership -> loan_status;
	person_home_ownership -> loan_intent;
	person_home_ownership -> loan_grade;
	person_home_ownership -> cb_person_default_on_file;
	loan_intent -> loan_status;
	loan_grade -> loan_int_rate;
	loan_grade -> loan_status;
	loan_grade -> loan_amnt;
	cb_person_default_on_file -> loan_grade;
}


Parameter learning

In [19]:
from pgmpy.estimators import ExpectationMaximization as EM

In [27]:
bn = BayesianNetwork(best_model.edges())
state_names = {'person_age': ['young', 'young_adult', 'adult', 'middle_aged', 'senior'],
                'person_income': ['low', 'lower_middle', 'middle', 'upper_middle', 'high'],
                'person_emp_length': ['unemployed', 'entry_level', 'mid_level', 'senior_level', 'retired'],
                'loan_amnt': ['low', 'lower_middle', 'middle', 'upper_middle', 'high'],
                'loan_int_rate': ['low', 'middle', 'high'],
                'loan_percent_income': ['low', 'lower_middle', 'middle', 'high'],
                'cb_person_cred_hist_length': ['low', 'lower_middle', 'middle', 'high'],
                'person_home_ownership': ['MORTGAGE', 'OTHER', 'OWN', 'RENT'],
                'loan_intent': ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE'],
                'loan_grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
                'cb_person_default_on_file': ['Y', 'N'],
                'loan_status': [0, 1]}
bn.fit(dataset, estimator=BayesianEstimator, prior_type="BDeu", state_names=state_names)

print(bn.states)

{'person_age': ['young', 'young_adult', 'adult', 'middle_aged', 'senior'], 'cb_person_cred_hist_length': ['low', 'lower_middle', 'middle', 'high'], 'person_emp_length': ['unemployed', 'entry_level', 'mid_level', 'senior_level', 'retired'], 'loan_intent': ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE'], 'person_home_ownership': ['MORTGAGE', 'OTHER', 'OWN', 'RENT'], 'person_income': ['low', 'lower_middle', 'middle', 'upper_middle', 'high'], 'loan_percent_income': ['low', 'lower_middle', 'middle', 'high'], 'loan_amnt': ['low', 'lower_middle', 'middle', 'upper_middle', 'high'], 'loan_grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'], 'loan_status': [0, 1], 'cb_person_default_on_file': ['Y', 'N'], 'loan_int_rate': ['low', 'middle', 'high']}


In [28]:
cpd = bn.get_cpds("loan_status")

from pgmpy.factors.discrete.CPD import TabularCPD

def print_full(cpd):
    backup = TabularCPD._truncate_strtable
    TabularCPD._truncate_strtable = lambda self, x: x
    print(cpd)
    TabularCPD._truncate_strtable = backup

print_full(cpd)

+-----------------------+---------------------------------+--------------------------------+--------------------------------+--------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+---------------------------------+--------------------------------+--------------------------------+--------------------------------+---------------------------------+--------------------------------+--------------------------------+--------------------------------+---------------------------------+------------------------------+----------------------------+-----------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+---------------------------------+------------------------------+-----------------------------+-----------------------------+---------------------------------+-----

Inference

In [29]:
inference = VariableElimination(bn)

In [30]:
phi_query = inference.query(['loan_status'])
print(phi_query)

+----------------+--------------------+
| loan_status    |   phi(loan_status) |
| loan_status(0) |             0.7803 |
+----------------+--------------------+
| loan_status(1) |             0.2197 |
+----------------+--------------------+


In [31]:
phi_query = inference.query(['loan_status'], evidence={'person_home_ownership': 'RENT'})
print(phi_query)

+----------------+--------------------+
| loan_status    |   phi(loan_status) |
| loan_status(0) |             0.6828 |
+----------------+--------------------+
| loan_status(1) |             0.3172 |
+----------------+--------------------+


In [32]:
inference = VariableElimination(bn)
phi_query = inference.query(['loan_status'], evidence={'person_home_ownership': 'RENT', 'loan_grade': 'A'})
print(phi_query)

+----------------+--------------------+
| loan_status    |   phi(loan_status) |
| loan_status(0) |             0.8246 |
+----------------+--------------------+
| loan_status(1) |             0.1754 |
+----------------+--------------------+


In [33]:
inference = VariableElimination(bn)
phi_query = inference.query(['loan_status'], evidence={'person_home_ownership': 'RENT', 'loan_grade': 'A', 'loan_percent_income': 'high'})
print(phi_query)

+----------------+--------------------+
| loan_status    |   phi(loan_status) |
| loan_status(0) |             0.0001 |
+----------------+--------------------+
| loan_status(1) |             0.9999 |
+----------------+--------------------+


In [34]:
inference = VariableElimination(bn)
phi_query = inference.query(['loan_grade'], evidence={'loan_status': 0})
print(phi_query)

+---------------+-------------------+
| loan_grade    |   phi(loan_grade) |
| loan_grade(A) |            0.3806 |
+---------------+-------------------+
| loan_grade(B) |            0.3421 |
+---------------+-------------------+
| loan_grade(C) |            0.2021 |
+---------------+-------------------+
| loan_grade(D) |            0.0584 |
+---------------+-------------------+
| loan_grade(E) |            0.0133 |
+---------------+-------------------+
| loan_grade(F) |            0.0032 |
+---------------+-------------------+
| loan_grade(G) |            0.0003 |
+---------------+-------------------+


In [35]:
inference = VariableElimination(bn)
phi_query = inference.query(['loan_grade'], evidence={'loan_status': 1})
print(phi_query)

+---------------+-------------------+
| loan_grade    |   phi(loan_grade) |
| loan_grade(A) |            0.1511 |
+---------------+-------------------+
| loan_grade(B) |            0.2433 |
+---------------+-------------------+
| loan_grade(C) |            0.1860 |
+---------------+-------------------+
| loan_grade(D) |            0.3008 |
+---------------+-------------------+
| loan_grade(E) |            0.0881 |
+---------------+-------------------+
| loan_grade(F) |            0.0227 |
+---------------+-------------------+
| loan_grade(G) |            0.0080 |
+---------------+-------------------+


In [36]:
inference = VariableElimination(bn)
phi_query = inference.query(['loan_amnt'], evidence={'loan_status': 0, 'person_income': 'lower_middle'})
print(phi_query)

+-------------------------+------------------+
| loan_amnt               |   phi(loan_amnt) |
| loan_amnt(low)          |           0.4470 |
+-------------------------+------------------+
| loan_amnt(lower_middle) |           0.4092 |
+-------------------------+------------------+
| loan_amnt(middle)       |           0.1284 |
+-------------------------+------------------+
| loan_amnt(upper_middle) |           0.0153 |
+-------------------------+------------------+
| loan_amnt(high)         |           0.0001 |
+-------------------------+------------------+


I/O for the built model

In [37]:
writer = BIFWriter(model = bn)
writer.write_bif('bayes_credit_k2_hill_climb_with_restricted.bif')

In [38]:
reader = BIFReader('bayes_credit_k2_hill_climb_with_restricted.bif')
print('model loaded from bif file')

bn_read = reader.get_model()
print('model deserialized')

model loaded from bif file
model deserialized


In [39]:
inference_read = VariableElimination(bn_read)
q = inference_read.query(['loan_status'])
dictionary = dict(zip(q.state_names['loan_status'], q.values))
print(dictionary)

{'0': 0.7803377939523946, '1': 0.21966220604760545}
