# Applying Categorical Machine Learning Techniques

In [1]:
import itertools
import pandas as pd
import numpy as np
from financial_data import *
from misc import *
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt

from dataclasses import dataclass

from sklearn import svm
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import graphviz
%matplotlib inline

In [2]:
# Load the combined loan dataset. load_combined is not defined in this notebook, so it
# presumably comes from one of the star imports (financial_data or misc) — TODO confirm.
combined = load_combined()

In [3]:
# Preview the first rows to sanity-check the loaded columns.
combined.head()

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,duration,payments,status,district_id,frequency,account_date,...,no_of_municipalities_with_inhabitants_between_1000_to_9999,no_of_municipalities_with_inhabitants_greater_than_10000,no_of_cities,ratio_of_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,no_of_enterpreneurs_per_1000_inhabitants,no_of_commited_crimes_95,no_of_commited_crimes_96
0,4959,2,1994-01-05,0.12968,0.25,0.319488,A,1,monthly,1993-02-26,...,0.0,0.2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
1,4961,19,1996-04-29,0.043179,0.0,0.231001,B,21,monthly,1995-04-07,...,0.35,0.2,0.6,0.500756,0.224329,0.183099,0.182832,0.488372,0.017452,0.014926
2,4962,25,1997-12-08,0.043179,0.0,0.231001,A,68,monthly,1996-07-28,...,0.9,0.4,0.5,0.352496,0.402392,0.535211,0.478261,0.174419,0.056623,0.050896
3,4967,37,1998-10-14,0.535129,1.0,0.520924,D,20,monthly,1997-08-18,...,0.15,0.2,0.3,0.370651,0.098623,0.338028,0.35786,0.453488,0.008779,0.006659
4,4968,38,1998-04-19,0.18052,0.75,0.208516,C,19,weekly,1997-08-08,...,0.15,0.2,0.3,0.284418,0.065899,0.408451,0.395764,0.453488,0.002133,0.002148


First we will define our target variable (status) and convert A-D into numeric form (0-3) for use in our learning algorithms

We will also remove the numerical columns that are only used for identification/processing (loan_id, account_id, district_id), since it is impossible for these to impact the status of a loan (the district information is retained through the dummy variables for district name).

In [4]:
# Pull out the loan status column as the prediction target (still the letters A-D here).
target = combined['status']

In [5]:
def stat_number(val):
    """Translate a loan status letter into its integer class code.

    'A' -> 0, 'B' -> 1, 'C' -> 2; every other value (including 'D') -> 3.
    """
    for code, letter in enumerate(('A', 'B', 'C')):
        if val == letter:
            return code
    # Fallback branch: anything that is not A, B or C is treated as class 3.
    return 3

In [6]:
# Encode the status letters as integers 0-3.
# NOTE: not idempotent — running this cell a second time feeds the already-numeric
# values back through stat_number, whose fallback branch turns every row into 3.
target = target.apply(stat_number)

In [7]:
# Remove the label column from the feature frame so it cannot leak into the model inputs.
# Rebinding (instead of inplace=True) avoids mutating the frame object in place, and
# errors='ignore' makes the cell safe to re-run once the column is already gone.
combined = combined.drop(columns='status', errors='ignore')

In [8]:
# Identifier columns to exclude from the features; district information is kept through
# the district_name dummy variables produced by the one-hot encoding step.
dropped_cols = ['loan_id', 'account_id', 'district_id']
# Rebind rather than drop in place, and ignore already-removed columns so that
# re-running this cell does not raise a KeyError.
combined = combined.drop(columns=dropped_cols, errors='ignore')

As we have some categorical variables, we will have to convert them into dummy (one-hot) variables so that every feature is numerical.

In [9]:
# One-hot encode the categorical columns (e.g. the district_name_* and region_* indicator
# columns visible in the preview further down).
combined_num = pd.get_dummies(combined)

We will also have to convert our datetime values to ordinal values

In [10]:
# Date columns that must become plain integers before they can be fed to the models.
datevals = ['loan_date', 'account_date', 'card_issued', 'owner_birth']
for val in datevals:
    # Convert only columns that are still datetimes, so re-running this cell is a no-op
    # instead of raising on the already-converted integer ordinals.
    if pd.api.types.is_datetime64_any_dtype(combined_num[val]):
        # toordinal gives the proleptic Gregorian day number of each timestamp.
        combined_num[val] = combined_num[val].map(dt.datetime.toordinal)

In [11]:
# Print the dtype of every column, one per line, to check that only numeric dtypes remain.
for x in combined_num.dtypes: print(x)

int64
float64
float64
float64
int64
int64
int64
int64
int64
int64
float64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8


In [12]:
# Preview the numeric feature frame (ordinal dates plus one-hot indicator columns).
combined_num.head()

Unnamed: 0,loan_date,loan_amount,duration,payments,account_date,clients,num_cards,card_issued,owner_birth,num_credits,...,district_name_Zlin,district_name_Znojmo,region_Prague,region_central Bohemia,region_east Bohemia,region_north Bohemia,region_north Moravia,region_south Bohemia,region_south Moravia,region_west Bohemia
0,727933,0.12968,0.25,0.319488,727620,2,0,693596,710066,152,...,0,0,1,0,0,0,0,0,0,0
1,728778,0.043179,0.0,0.231001,728390,1,0,693596,707952,152,...,0,0,0,0,0,0,0,1,0,0
2,729366,0.043179,0.0,0.231001,728868,1,0,693596,716280,82,...,0,0,0,0,0,0,1,0,0,0
3,729676,0.535129,1.0,0.520924,729254,1,0,693596,712826,36,...,0,0,0,0,0,0,0,1,0,0
4,729498,0.18052,0.75,0.208516,729244,1,0,693596,708234,48,...,0,0,0,0,0,0,0,1,0,0


## Creating a first-look decision tree classifier

We now fit a basic decision tree classifier limited to a depth of 3 (note that Class 0 = A, Class 1 = B, Class 2 = C, Class 3 = D).

In [13]:
@dataclass
class Hparams:
    """Hyperparameters for our models"""
    # Maximum tree depth; passed as max_depth to DecisionTreeClassifier.
    max_depth: int = 3 
    # Minimum number of samples per leaf; passed as min_samples_leaf to DecisionTreeClassifier.
    min_samples_leaf: int = 1
    # Not used in the cells shown here; presumably the number of boosting rounds for an
    # ensemble such as the imported AdaBoostClassifier — TODO confirm.
    n_estimators: int = 400
    # Not used in the cells shown here; presumably the ensemble learning rate — TODO confirm.
    learning_rate: float = 1.0

In [14]:
# Instantiate the hyperparameters with their default values.
hparams = Hparams()

In [15]:

# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    combined_num, target, test_size=0.33, random_state=42
)

In [16]:
# Fit a shallow decision tree as a first-look baseline classifier.
# random_state is fixed because scikit-learn randomly permutes the features at each split,
# so without a seed, ties between equally good splits can change the tree from run to run.
# The value matches the seed used for the train/test split.
dt_stump = DecisionTreeClassifier(
    max_depth=hparams.max_depth,
    min_samples_leaf=hparams.min_samples_leaf,
    random_state=42,
)

dt_stump.fit(x_train, y_train)
# Misclassification rate on the held-out set (score() returns the mean accuracy).
dt_stump_err = 1.0 - dt_stump.score(x_test, y_test)

In [17]:
# Label tree nodes with the classes the model actually saw during fitting, so the names
# stay aligned with dt_stump.classes_ even if a class is missing from the training split
# (with all four statuses present this is ['0', '1', '2', '3']).
class_names = [str(label) for label in dt_stump.classes_]

dot_data = tree.export_graphviz(
    dt_stump,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(combined_num.columns),
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Displaying the Source object invokes the Graphviz `dot` executable with -Tsvg.
# NOTE(review): the recorded output shows that call failing with
# 'Format: "svg" not recognized', which looks like a problem with the local Graphviz
# installation rather than with this code — confirm the install before relying on the plot.
graph = graphviz.Source(dot_data)
graph

Format: "svg" not recognized. Use one of:


CalledProcessError: Command '['dot', '-Tsvg']' returned non-zero exit status 1. [stderr: b'Format: "svg" not recognized. Use one of:\r\n']

<graphviz.files.Source at 0x2a054bb1160>