# Import Modules

This notebook is based on the home loan default data set from [kaggle](https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction). To run it and obtain meaningful results, please download the data from kaggle first.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging as logging

from sklearn import tree
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score

import pailab
from pylab import rcParams

%matplotlib inline

# Configure the root logger so that only records at level FATAL are emitted,
# keeping the notebook output free of library log messages.
logging.basicConfig(level=logging.FATAL)

# Helper functions

In [2]:
#
import graphviz 
def plot_tree(tree_clf, feature_names = None):
    """Build a graphviz rendering of a decision tree.

    Parameters
    ----------
    tree_clf
        Tree estimator handed to ``tree.export_graphviz``.
    feature_names
        Optional names used to label the features in the nodes.

    Returns
    -------
    The ``graphviz.Source`` object created from the exported DOT description.
    """
    export_options = dict(
        out_file=None,
        feature_names=feature_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    return graphviz.Source(tree.export_graphviz(tree_clf, **export_options))

def plot_roc(tree_clf, x, y):
    """Plot one ROC curve per classifier together with the chance diagonal.

    Parameters
    ----------
    tree_clf
        Dict mapping a legend label (str) to a classifier that provides
        ``predict_proba``.
    x
        Feature data passed to each classifier's ``predict_proba``.
    y
        True labels; sliced with ``y[:]`` before being passed to the metrics.

    Returns
    -------
    The ``plt`` module, so that the caller can further adjust or show the figure.
    """
    # The line width is defined before the loop: it is also needed for the
    # diagonal below, which must be drawable even when tree_clf is empty.
    lw = 2
    for label, classifier in tree_clf.items():
        # Column 1 of predict_proba is used as the score of the positive class.
        positive_score = classifier.predict_proba(x)[:, 1]
        fpr, tpr, _ = roc_curve(y[:], positive_score)
        roc_auc = roc_auc_score(y[:], positive_score)
        plt.plot(fpr, tpr, '-x',
                 lw=lw, label=label + ' (area = %0.3f)' % roc_auc)
    # Reference line of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    return plt

def plot_probs(tree_clf, x, y):
    """Draw a histogram of the column-1 ``predict_proba`` output per classifier.

    Parameters
    ----------
    tree_clf
        Dict mapping a legend label to a classifier providing ``predict_proba``.
    x
        Feature data passed to ``predict_proba``.
    y
        Not used by this function.

    Returns
    -------
    The ``plt`` module.
    """
    # With several classifiers the histograms overlap, so they are drawn
    # semi-transparent; a single histogram stays opaque.
    alpha = 0.5 if len(tree_clf) > 1 else 1.0
    for label, classifier in tree_clf.items():
        positive_proba = classifier.predict_proba(x)[:, 1]
        plt.hist(positive_proba, alpha=alpha, label=label, bins=50)
    plt.legend()
    return plt

def plot_feature_importance(clf, feature_names):
    """Show the feature importances of several models as a grouped bar chart.

    Parameters
    ----------
    clf
        Dict mapping a label to a model exposing ``feature_importances_``.
    feature_names
        Names of the features, used as the x axis of the chart; one entry per
        importance value.
    """
    # One column per model, one row per feature.
    importance_table = pd.DataFrame(
        {label: model.feature_importances_ for label, model in clf.items()}
    )
    importance_table['FEATURE_NAME'] = feature_names
    importance_table.plot.bar(x='FEATURE_NAME')
    plt.title('Feature Importance')

    
def tree_statistics(clf, x, y):
    """Summarise depth, leaf count and ROC AUC for a set of decision trees.

    Parameters
    ----------
    clf
        Dict mapping a name to a tree providing ``get_n_leaves``,
        ``get_depth`` and ``predict_proba``.
    x
        Feature data passed to ``predict_proba``.
    y
        True labels used for the ROC AUC.

    Returns
    -------
    DataFrame with one row per tree and the columns
    ``name``, ``depth``, ``num_leaves`` and ``roc_auc_score``.
    """
    column_order = ['name', 'depth', 'num_leaves', 'roc_auc_score']
    stats = {column: [] for column in column_order}
    for label, model in clf.items():
        stats['name'].append(label)
        stats['num_leaves'].append(model.get_n_leaves())
        stats['depth'].append(model.get_depth())
        # Column 1 of predict_proba is scored against the true labels.
        positive_proba = model.predict_proba(x)[:, 1]
        stats['roc_auc_score'].append(roc_auc_score(y[:], positive_proba))
    return pd.DataFrame(stats)[column_order]

def forest_statistics(clf, x, y):
    """Summarise the member trees and the ROC AUC of a set of tree ensembles.

    Parameters
    ----------
    clf
        Dict mapping a name to an ensemble exposing ``estimators_`` (whose
        members provide ``get_n_leaves`` and ``get_depth``) and
        ``predict_proba``.
    x
        Feature data passed to ``predict_proba``.
    y
        True labels used for the ROC AUC.

    Returns
    -------
    DataFrame with one row per ensemble holding the minimum, maximum and mean
    depth and leaf count over its members, plus ``roc_auc_score``.
    """
    column_order = ['name', 'min_depth', 'max_depth', 'mean_depth',
                    'min_leaves', 'max_leaves', 'mean_leaves', 'roc_auc_score']
    stats = {column: [] for column in column_order}
    for label, forest in clf.items():
        stats['name'].append(label)

        leaf_counts = [member.get_n_leaves() for member in forest.estimators_]
        stats['min_leaves'].append(np.min(leaf_counts))
        stats['mean_leaves'].append(np.mean(leaf_counts))
        stats['max_leaves'].append(np.max(leaf_counts))

        member_depths = [member.get_depth() for member in forest.estimators_]
        stats['min_depth'].append(np.min(member_depths))
        stats['max_depth'].append(np.max(member_depths))
        stats['mean_depth'].append(np.mean(member_depths))

        # Column 1 of predict_proba is scored against the true labels.
        positive_proba = forest.predict_proba(x)[:, 1]
        stats['roc_auc_score'].append(roc_auc_score(y[:], positive_proba))
    return pd.DataFrame(stats)[column_order]

def plot_pruning(pruning_path, start = 0, end = -1):
    """Plot the total leaf impurity against the effective alpha of a pruning path.

    Parameters
    ----------
    pruning_path
        Object exposing the attributes ``ccp_alphas`` and ``impurities``
        (presumably the result of a tree's ``cost_complexity_pruning_path``
        -- confirm against the caller).
    start, end
        Slice bounds applied to both sequences before plotting. With the
        default ``end=-1`` the last entry of the path is not plotted.
    """
    # Both sequences are read from the argument; the previous version took the
    # impurities from an unrelated global named `path`.
    ccp_alphas, impurities = pruning_path.ccp_alphas, pruning_path.impurities
    plt.plot(ccp_alphas[start:end], impurities[start:end], marker='o', drawstyle="steps-post")
    plt.xlabel("effective alpha")
    plt.ylabel("total impurity of leaves")
    plt.title("Total Impurity vs effective alpha")
    
def replace_type(column):
    """Map each distinct value of a column to an integer rank.

    The values of ``full_data[column]`` (the module-level data frame) are
    ranked by the sum of ``TARGET`` over the rows carrying that value, in
    ascending order; ties are ordered by the value itself.

    Parameters
    ----------
    column
        Name of the column of ``full_data`` to encode.

    Returns
    -------
    Dict mapping each distinct column value to its rank (0 = smallest sum).
    """
    target_totals = []
    for category in full_data[column].unique():
        rows_of_category = full_data[column] == category
        target_totals.append((full_data[rows_of_category]['TARGET'].sum(), category,))
    ranked = sorted(target_totals)
    return {category: rank for rank, (_, category) in enumerate(ranked)}


# Setup Initial Repository

In [3]:
from pailab import MLRepo

In [4]:
# setting up the repository
# Set use_disk_repo to True to use the explicit configuration below (a
# git-handled object store plus a versioned HDF store for numpy data, both
# under c:/temp). With False, config stays None and MLRepo uses its defaults.
use_disk_repo = False
config = None
if use_disk_repo:
    config = {
        'user': 'test_user',
        'workspace': 'c:/temp',
        'repo_store': {
            'type': 'git_handler',
            'config': {
                'folder': 'c:/temp',
                'file_format': 'pck'
            }
        },
        'numpy_store': {
            'type': 'hdf_handler',
            'config': {
                'folder': 'c:/temp/hdf',
                'version_files': True
            }
        }
    }
# Pass the config variable itself (previously None was hard-coded here, so the
# dictionary above could never take effect).
ml_repo = MLRepo(user='test_user', config=config)
from pailab.tools.tree import MLTree
# Attach the tree helper that provides the ml_repo.tree accessors used below.
MLTree.add_tree(ml_repo)

## Add Raw-Data to repository

In [5]:
full_data = pd.read_csv('application_train.zip')

# Derived ratio features.
# NOTE(review): 'AGE' and 'YEARS_EMPLOYED' are read but never created in this
# notebook; the stock Kaggle file appears to ship DAYS_BIRTH / DAYS_EMPLOYED
# instead -- confirm that the zip file was pre-processed accordingly.
full_data['CREDIT_INCOME_RATIO'] = full_data['AMT_CREDIT'] / full_data['AMT_INCOME_TOTAL']
full_data['ANNUITY_INCOME_RATIO'] = full_data['AMT_ANNUITY'] / full_data['AMT_INCOME_TOTAL']
full_data['CREDIT_TERM'] = full_data['AMT_ANNUITY'] / full_data['AMT_CREDIT']
full_data['YEARS_EMPLOYED_AGE_RATIO'] = full_data['YEARS_EMPLOYED']/full_data['AGE']
full_data['GOODS_PRICE_CREDIT_RATIO'] = full_data['AMT_GOODS_PRICE'] / full_data['AMT_CREDIT']
# Flag rows with a negative number of employment years.
full_data['YEARS_EMPLOYED_ERROR'] = full_data['YEARS_EMPLOYED']<0

# Convert string categories to numbers.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is a chained in-place update, which does not modify
# full_data when pandas Copy-on-Write is active.
full_data['NAME_EDUCATION_TYPE'] = full_data['NAME_EDUCATION_TYPE'].replace(replace_type('NAME_EDUCATION_TYPE'))
full_data['CODE_GENDER'] = full_data['CODE_GENDER'].replace({'M':0.0, 'F':1.0, 'XNA':2.0})
full_data['FLAG_OWN_CAR'] = full_data['FLAG_OWN_CAR'].replace({'N':0.0, 'Y':1.0})
full_data['FLAG_OWN_REALTY'] = full_data['FLAG_OWN_REALTY'].replace({'N':0.0, 'Y':1.0})
full_data['OCCUPATION_TYPE'] = full_data['OCCUPATION_TYPE'].replace(replace_type('OCCUPATION_TYPE'))

# Drop rows with NaNs in columns that are not handled by imputers.
full_data = full_data.dropna(subset=[
    'CREDIT_INCOME_RATIO', 'ANNUITY_INCOME_RATIO', 'CREDIT_TERM', 'YEARS_EMPLOYED_AGE_RATIO', 'GOODS_PRICE_CREDIT_RATIO',
    'AGE', 'YEARS_EMPLOYED', 'YEARS_EMPLOYED_ERROR', 'NAME_EDUCATION_TYPE', 'CNT_CHILDREN', 'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY', 'REGION_POPULATION_RELATIVE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'OCCUPATION_TYPE'])

In [6]:
# Every column except the target is a model input. The comparison uses != :
# `is not` tests object identity, which is not a reliable test for string
# equality, so the filter previously had to be patched up by a separate
# remove('TARGET') call.
input_names = [x for x in full_data.columns if x != 'TARGET']

In [7]:
# Register the data frame in the repository under the name 'data', with all
# columns in input_names as inputs and 'TARGET' as the target variable.
ml_repo.tree.raw_data.add('data', full_data, input_variables=input_names, target_variables='TARGET')

## Add Preprocessors to the repository

### Select subset of columns as input
In case we do not need all columns of the data for one of our models, we may add a preprocessor that selects only certain columns.

In [8]:
import pailab.externals.numpy_interface as numpy_interface
# Columns fed to the model. Each column is listed exactly once
# ('REGION_RATING_CLIENT_W_CITY' used to appear twice, which would select the
# same feature twice).
select_columns = ['CREDIT_INCOME_RATIO', 'ANNUITY_INCOME_RATIO', 'CREDIT_TERM', 'YEARS_EMPLOYED_AGE_RATIO', 'GOODS_PRICE_CREDIT_RATIO',
                  'AGE', 'YEARS_EMPLOYED', 'YEARS_EMPLOYED_ERROR', 'NAME_EDUCATION_TYPE', 'CNT_CHILDREN', 'REGION_RATING_CLIENT',
                  'REGION_RATING_CLIENT_W_CITY', 'REGION_POPULATION_RELATIVE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                  'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
# Register the column-selecting preprocessor under the name 'column_selector_1'.
numpy_interface.add_preprocessor_select_columns(ml_repo, preprocessor_name='column_selector_1',
                                                preprocessor_param={'columns': select_columns})

### Impute Missing Data

To fill in missing values, we add scikit-learn's SimpleImputer, configured to replace each missing value with the median of its column.

In [9]:
import pailab.externals.sklearn_interface as sklearn_interface
from sklearn.preprocessing import StandardScaler
# Register a median imputer for the three EXT_SOURCE columns under the name
# 'SKLSimpleImputer'. add_indicator=True makes the imputer append
# missing-value indicator columns, which are named *_MISSING via output_columns.
# NOTE(review): SimpleImputer only emits an indicator for features that had
# missing values during fit, so the six output names assume that all three
# columns contain NaNs -- confirm for the data used.
sklearn_interface.add_preprocessor(ml_repo, SimpleImputer(strategy = 'median', add_indicator=True), 
                                   preprocessor_name='SKLSimpleImputer', columns =  ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], 
                                  output_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 
                                                  'EXT_SOURCE_1_MISSING', 'EXT_SOURCE_2_MISSING', 'EXT_SOURCE_3_MISSING'])

### Remove NaN

In [10]:
# Register pailab's remove-rows-with-NaN preprocessor under the name 'RemoveNaN'.
numpy_interface.add_preprocessor_remove_rows_nan(ml_repo, preprocessor_name='RemoveNaN')

### Add Polynomial Features

In [11]:
from sklearn.preprocessing import PolynomialFeatures
# Register a degree-3 PolynomialFeatures transformer for the three EXT_SOURCE
# columns under the name 'PolynomialFeatures'.
sklearn_interface.add_preprocessor(ml_repo,PolynomialFeatures(degree = 3), preprocessor_name='PolynomialFeatures', 
                                   columns =  ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])

## Add Training and Test Data

In [12]:
# Split 'raw_data/data' by row position: the training set ends at 80% of the
# rows, the test set starts one row after that boundary.
# NOTE(review): if pailab treats end_index as exclusive, the row at
# int(0.8*n) belongs to neither set -- confirm the index convention.
ml_repo.tree.training_data.add('training_data', 'raw_data/data', end_index = int(0.8*full_data.shape[0]))
ml_repo.tree.test_data.add('test_data', 'raw_data/data', start_index = int(0.8*full_data.shape[0])+1)

## Add DecisionTree

To estimate the default of the home loans we start with a simple decision tree classifier from scikit-learn.

In [13]:
import pailab.externals.sklearn_interface as sklearn_interface
# Register a depth-3 gini decision tree (fixed random_state for
# reproducibility) together with the preprocessors to use with it, listed in
# the order: column selection, imputation, NaN removal, polynomial features.
sklearn_interface.add_model(ml_repo, tree.DecisionTreeClassifier(max_depth=3, min_samples_split = 2, criterion = 'gini', random_state=42), 
                           preprocessors=['column_selector_1', 'SKLSimpleImputer', 'RemoveNaN', 'PolynomialFeatures'])

In [14]:
# Train the registered model; the displayed result is the training job's name
# and id.
ml_repo.run_training()

('DecisionTreeClassifier/jobs/training',
 'd0187314-5fdc-11ea-a241-b88a609550eb')

## Add Measures

Now that we have run the training, we would like to measure the quality of the results. For this, we can add measures to the repository. Here, we use the Area Under the Receiver Operating Characteristic Curve (ROC AUC or simply AUC), see e.g. [scikit-learn](https://scikit-learn.org/stable/modules/model_evaluation.html#roc-metrics) for more details.

In [15]:
# Register ROC AUC as a measure in the repository.
ml_repo.add_measure(pailab.MeasureConfiguration.ROC_AUC)

To compute the AUC we need to call run_evaluation.

In [16]:
# Evaluate the model on the training and test data. With run_descendants=True
# the dependent steps (here: computing the measures) are run as well; without
# it only the predictions would be computed and run_measures would have to be
# called separately.
ml_repo.run_evaluation(run_descendants=True) 
#ml_repo.run_measures() #uncomment this line if you do not use run_descendants=True above

[('DecisionTreeClassifier/model/jobs/eval_job/training_data',
  'd2680d78-5fdc-11ea-bea2-b88a609550eb'),
 ('DecisionTreeClassifier/model/jobs/eval_job/test_data',
  'd31c7c42-5fdc-11ea-99b6-b88a609550eb')]

In [17]:
# List the names of all measure objects in the repository; there are two of
# them, the AUC on the training data and the AUC on the test data.
ml_repo.get_names(pailab.MLObjectType.MEASURE)

['DecisionTreeClassifier/measure/training_data/roc_auc',
 'DecisionTreeClassifier/measure/test_data/roc_auc']

In [18]:
# Each measure is stored as a repository object; fetch it by name and print
# the score held in its `value` attribute.
for measure_name, message_prefix in (
    ('DecisionTreeClassifier/measure/training_data/roc_auc', 'AUC on training set: '),
    ('DecisionTreeClassifier/measure/test_data/roc_auc', 'AUC on test set: '),
):
    measure = ml_repo.get(measure_name)
    print(message_prefix + str(measure.value))


AUC on training set: 0.5
AUC on test set: 0.5


As we can see, this is a very poor result (indeed, the max_depth of the tree was set to 3, which is very low for the amount of data). We now repeat the training for different values of the depth parameter. To do so, we get the current model parameters, modify the depth parameter and call *run_training* again. Note that the sklearn interface stores the sklearn-specific parameters in a dictionary named *sklearn_params*.

In [19]:
# Parameter sweep over max_depth. It is disabled by the constant False
# condition; change the condition to True to run it.
if False:
    for i in [2,4, 8, 12, 16, 20, 24, 32, 48, 64, 96, 128]:
        # Fetch the current parameter object, change the depth and store it
        # again in the repository.
        param =  ml_repo.get('DecisionTreeClassifier/model_param')
        param.sklearn_params['max_depth'] = i
        ml_repo.add(param)
        # Retrain and run the dependent steps as well.
        ml_repo.run_training(run_descendants=True)

## Analysis

In [20]:
from pailab.analysis import plot
from pailab.analysis import tools_jupyter

In [21]:
# Point the Jupyter widgets at the repository built above.
tools_jupyter.widget_repo.set_repo(ml_repo)

In [22]:
# Create the interactive "measure vs parameter" widget and show it.
measure_vs_param = tools_jupyter.PlotMeasureVsParameter()
display(measure_vs_param.get_widget())

VBox(children=(HTML(value='<h3 style="Color: white; background-color:#d1d1e0; text-align: center"> Measure vs …

{'name': '_property_lock', 'old': traitlets.Undefined, 'new': {'index': 0}, 'owner': Dropdown(options=('DecisionTreeClassifier/model',), value=None), 'type': 'change'}
{'name': 'label', 'old': None, 'new': 'DecisionTreeClassifier/model', 'owner': Dropdown(options=('DecisionTreeClassifier/model',), value=None), 'type': 'change'}
{'name': 'value', 'old': None, 'new': 'DecisionTreeClassifier/model', 'owner': Dropdown(options=('DecisionTreeClassifier/model',), value='DecisionTreeClassifier/model'), 'type': 'change'}
DecisionTreeClassifier/model
{'name': 'index', 'old': None, 'new': 0, 'owner': Dropdown(options=('DecisionTreeClassifier/model',), value='DecisionTreeClassifier/model'), 'type': 'change'}
DecisionTreeClassifier/model
{'name': '_property_lock', 'old': {'index': 0}, 'new': {}, 'owner': Dropdown(options=('DecisionTreeClassifier/model',), value='DecisionTreeClassifier/model'), 'type': 'change'}
DecisionTreeClassifier/model


In [23]:
# Plot the training-data ROC AUC against the model parameter 'max_depth'.
plot.measure_by_parameter(ml_repo, ['DecisionTreeClassifier/measure/training_data/roc_auc'], 'max_depth')

Figure({
    'data': [{'mode': 'markers',
              'name': ('DecisionTreeClassifier/measure' ... '40-5fdc-11ea-baae-b88a609550eb'),
              'text': [model version:
                       d2566040-5fdc-11ea-87f3-b88a609550eb<br>training_data:
                       d00eb240-5fdc-11ea-baae-b88a609550eb<br>train_data:
                       d00eb240-5fdc-11ea-baae-b88a609550eb],
              'type': 'scatter',
              'uid': '2e31b098-ff83-4b33-a516-e54aa0875d72',
              'x': array([3], dtype=int64),
              'y': array([0.5])}],
    'layout': {'title': {'text': 'measure by parameter'},
               'xaxis': {'title': {'text': 'max_depth'}},
               'yaxis': {'title': {'text': 'roc_auc'}}}
})

In [24]:
# Fetch the first model-parameter object registered in the repository and
# display its parameters.
model_param = ml_repo.get(
    ml_repo.get_names(pailab.MLObjectType.MODEL_PARAM)[0]
)
model_param.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 42,
 'splitter': 'best'}

In [25]:
from pailab.ml_repo.repo import NamingConventions
# Bare attribute reference: the function is not called, the cell only displays
# its representation.
NamingConventions.get_model_param_name

<function pailab.ml_repo.repo.NamingConventions.get_model_param_name>

In [26]:
# Create the object overview widget for the repository and show it.
repo_overview = tools_jupyter.ObjectOverviewList(ml_repo)
display(repo_overview.get_widget())

VBox(children=(HTML(value='<h3 style="Color: white; background-color:#d1d1e0; text-align: center"> Object Over…