# Module importieren

Home loan default data from [kaggle](https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging as logging

from sklearn import tree
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score

import pailab
from pylab import rcParams

%matplotlib inline

logging.basicConfig(level=logging.FATAL)

# Initialisiere Repository

In [2]:
from pailab import MLRepo

In [3]:
# Set up the repository.
# Set USE_DISK_STORAGE to True to use the on-disk configuration below
# (a 'git_handler' object store and an 'hdf_handler' numerical store under c:/temp).
# With the default (False) no config is passed and MLRepo uses its defaults.
USE_DISK_STORAGE = False

config = None
if USE_DISK_STORAGE:
    config = {
        'user': 'test_user',
        'workspace': 'c:/temp',
        'repo_store': {
            'type': 'git_handler',
            'config': {
                'folder': 'c:/temp',
                'file_format': 'pck'
            }
        },
        'numpy_store': {
            'type': 'hdf_handler',
            'config': {
                'folder': 'c:/temp/hdf',
                'version_files': True
            }
        }
    }

# Pass the config built above. It used to be hard-coded to None, so enabling
# the on-disk branch had no effect on the repository that was created.
ml_repo = MLRepo(user='test_user', config=config)
from pailab.tools.tree import MLTree
# Attach the tree-style accessor used below as `ml_repo.tree.<...>`.
MLTree.add_tree(ml_repo)

INFO:root:Get mapping.
INFO:root:No mapping found, creating new mapping.
DEBUG:root:Initializing map with kwargs: {}
DEBUG:pailab.ml_repo.memory_handler:repo_mapping added with version d620ab48-48c8-11ea-858e-fc084a6691eb, category: MAPPING


# Hilfsfunktionen

In [4]:
import graphviz 
def plot_tree(tree_clf, feature_names = None):
    """Build a Graphviz graph object for a fitted decision tree.

    Args:
        tree_clf: Fitted tree estimator handed to ``tree.export_graphviz``.
        feature_names: Optional feature names forwarded to the exporter.

    Returns:
        A ``graphviz.Source`` wrapping the exporter's output.
    """
    export_options = dict(
        out_file=None,
        feature_names=feature_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    return graphviz.Source(tree.export_graphviz(tree_clf, **export_options))

def plot_roc(tree_clf, x, y):
    """Draw one ROC curve per classifier, plus the chance diagonal, via pyplot.

    Args:
        tree_clf: Mapping of display name -> fitted classifier exposing
            ``predict_proba``.
        x: Features passed to each classifier's ``predict_proba``.
        y: True labels; column 1 of ``predict_proba`` is used as the score.

    Returns:
        The ``plt`` module, so callers can continue customising the figure.
    """
    # Bound before the loop: the diagonal below needs it even when `tree_clf`
    # is empty (binding it inside the loop raised NameError in that case).
    lw = 2
    for name, clf in tree_clf.items():
        positive_score = clf.predict_proba(x)[:, 1]
        fpr, tpr, _ = roc_curve(y[:], positive_score)
        roc_auc = roc_auc_score(y[:], positive_score)
        # %-formatting (not `+`) so non-string keys can be used as labels too.
        plt.plot(fpr, tpr, '-x', lw=lw,
                 label='%s (area = %0.3f)' % (name, roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    return plt

def plot_probs(tree_clf, x, y):
    """Plot a 50-bin histogram of column 1 of ``predict_proba`` per classifier.

    Histograms are drawn semi-transparent (alpha 0.5) when more than one
    classifier is given, opaque otherwise. ``y`` is accepted but not used.

    Returns:
        The ``plt`` module.
    """
    alpha = 0.5 if len(tree_clf) > 1 else 1.0
    for name, clf in tree_clf.items():
        positive_proba = clf.predict_proba(x)[:, 1]
        plt.hist(positive_proba, alpha=alpha, label=name, bins=50)
    plt.legend()
    return plt

def plot_feature_importance(clf, feature_names):
    """Bar-plot ``feature_importances_`` of several fitted models side by side.

    Args:
        clf: Mapping of display name -> fitted model exposing
            ``feature_importances_``.
        feature_names: Labels for the features, stored in the 'FEATURE_NAME'
            column and used as the x axis.
    """
    importance_table = pd.DataFrame(
        {model_name: model.feature_importances_ for model_name, model in clf.items()}
    )
    importance_table['FEATURE_NAME'] = feature_names
    importance_table.plot.bar(x='FEATURE_NAME')
    plt.title('Feature Importance')

    
def tree_statistics(clf, x, y):
    """Summarise depth, leaf count and ROC-AUC for each fitted tree.

    Args:
        clf: Mapping of display name -> fitted tree exposing ``get_n_leaves``,
            ``get_depth`` and ``predict_proba``.
        x: Features used for scoring.
        y: True labels used for scoring.

    Returns:
        DataFrame with columns 'name', 'depth', 'num_leaves', 'roc_auc_score',
        one row per entry of ``clf``.
    """
    labels = list(clf.keys())
    fitted = list(clf.values())
    summary = pd.DataFrame({
        'name': labels,
        'num_leaves': [model.get_n_leaves() for model in fitted],
        'depth': [model.get_depth() for model in fitted],
        'roc_auc_score': [roc_auc_score(y[:], model.predict_proba(x)[:, 1])
                          for model in fitted],
    })
    return summary[['name', 'depth', 'num_leaves', 'roc_auc_score']]

def forest_statistics(clf, x, y):
    """Summarise per-tree depth/leaf statistics and ROC-AUC for each ensemble.

    Args:
        clf: Mapping of display name -> fitted ensemble exposing
            ``estimators_`` (trees with ``get_n_leaves``/``get_depth``) and
            ``predict_proba``.
        x: Features used for scoring.
        y: True labels used for scoring.

    Returns:
        DataFrame with one row per ensemble holding min/max/mean tree depth,
        min/max/mean leaf count and the ROC-AUC score.
    """
    columns = ['name', 'min_depth', 'max_depth', 'mean_depth',
               'min_leaves', 'max_leaves', 'mean_leaves', 'roc_auc_score']
    stats = {column: [] for column in columns}
    for label, forest in clf.items():
        leaf_counts = [estimator.get_n_leaves() for estimator in forest.estimators_]
        depths = [estimator.get_depth() for estimator in forest.estimators_]
        positive_proba = forest.predict_proba(x)[:, 1]

        stats['name'].append(label)
        stats['min_leaves'].append(np.min(leaf_counts))
        stats['mean_leaves'].append(np.mean(leaf_counts))
        stats['max_leaves'].append(np.max(leaf_counts))
        stats['min_depth'].append(np.min(depths))
        stats['max_depth'].append(np.max(depths))
        stats['mean_depth'].append(np.mean(depths))
        stats['roc_auc_score'].append(roc_auc_score(y[:], positive_proba))
    return pd.DataFrame(stats)[columns]

def plot_pruning(pruning_path, start = 0, end = -1):
    """Plot total leaf impurity against effective alpha for a pruning path.

    Args:
        pruning_path: Object with ``ccp_alphas`` and ``impurities`` sequences
            (e.g. the result of ``cost_complexity_pruning_path``).
        start: First index of the slice to plot.
        end: Slice end (exclusive); the default -1 leaves out the last entry.
    """
    # Read both series from the argument. `impurities` used to come from a
    # global named `path`, which broke for any other pruning path.
    ccp_alphas, impurities = pruning_path.ccp_alphas, pruning_path.impurities
    plt.plot(ccp_alphas[start:end], impurities[start:end], marker='o', drawstyle="steps-post")
    plt.xlabel("effective alpha")
    plt.ylabel("total impurity of leaves")
    plt.title("Total Impurity vs effective alpha")
    
def replace_type(column):
    """Map each distinct value of ``full_data[column]`` to an integer rank.

    Values are ranked in ascending order of the summed 'TARGET' over the rows
    holding that value; ties are ordered by the value itself. Reads the
    module-level ``full_data`` DataFrame.

    Returns:
        dict mapping category value -> rank (0 = smallest 'TARGET' sum).
    """
    categories = full_data[column].unique()
    ranked = sorted(
        (full_data[full_data[column] == category]['TARGET'].sum(), category)
        for category in categories
    )
    return {category: rank for rank, (_, category) in enumerate(ranked)}


# Add Raw-Data to repository

In [5]:
# Load the application data and derive ratio features.
full_data = pd.read_csv('application_train.csv')
full_data['CREDIT_INCOME_RATIO'] = full_data['AMT_CREDIT'] / full_data['AMT_INCOME_TOTAL']
full_data['ANNUITY_INCOME_RATIO'] = full_data['AMT_ANNUITY'] / full_data['AMT_INCOME_TOTAL']
full_data['CREDIT_TERM'] = full_data['AMT_ANNUITY'] / full_data['AMT_CREDIT']
full_data['YEARS_EMPLOYED_AGE_RATIO'] = full_data['YEARS_EMPLOYED']/full_data['AGE']
full_data['GOODS_PRICE_CREDIT_RATIO'] = full_data['AMT_GOODS_PRICE'] / full_data['AMT_CREDIT']
# Flag rows whose employment duration is negative.
full_data['YEARS_EMPLOYED_ERROR'] = full_data['YEARS_EMPLOYED']<0

# Convert string categories to numbers.
# Assign the result back instead of `full_data[col].replace(..., inplace=True)`:
# in-place replacement on a selected column is deprecated chained assignment
# and does not modify `full_data` under pandas Copy-on-Write.
full_data['NAME_EDUCATION_TYPE'] = full_data['NAME_EDUCATION_TYPE'].replace(replace_type('NAME_EDUCATION_TYPE'))
full_data['CODE_GENDER'] = full_data['CODE_GENDER'].replace({'M':0.0, 'F':1.0, 'XNA':2.0})
full_data['FLAG_OWN_CAR'] = full_data['FLAG_OWN_CAR'].replace({'N':0.0, 'Y':1.0})
full_data['FLAG_OWN_REALTY'] = full_data['FLAG_OWN_REALTY'].replace({'N':0.0, 'Y':1.0})
full_data['OCCUPATION_TYPE'] = full_data['OCCUPATION_TYPE'].replace(replace_type('OCCUPATION_TYPE'))

# Drop rows with NaNs in columns that are not handled by imputers.
# ('REGION_RATING_CLIENT_W_CITY' was listed twice; once is enough.)
required_columns = [
    'CREDIT_INCOME_RATIO', 'ANNUITY_INCOME_RATIO', 'CREDIT_TERM', 'YEARS_EMPLOYED_AGE_RATIO', 'GOODS_PRICE_CREDIT_RATIO',
    'AGE', 'YEARS_EMPLOYED', 'YEARS_EMPLOYED_ERROR', 'NAME_EDUCATION_TYPE', 'CNT_CHILDREN', 'REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY', 'REGION_POPULATION_RELATIVE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'OCCUPATION_TYPE',
]
full_data = full_data.dropna(subset=required_columns)

In [7]:
# Every column except the target is a model input.
# Compare by value (`!=`): `is not` tests object identity, which does not
# reliably match equal strings and triggers a SyntaxWarning on a literal.
# The follow-up `.remove('TARGET')` is no longer needed.
input_names = [x for x in full_data.columns if x != 'TARGET']

In [8]:
# Register the prepared DataFrame in the repository under the name 'data',
# with `input_names` as input variables and 'TARGET' as the target variable.
ml_repo.tree.raw_data.add('data', full_data, input_variables=input_names, target_variables='TARGET')

DEBUG:pailab.ml_repo.memory_handler:raw_data/data added with version db2a5592-48c8-11ea-a82b-fc084a6691eb, category: RAW_DATA
DEBUG:pailab.ml_repo.memory_handler:Adding data for raw_data/data and version db2a5592-48c8-11ea-a82b-fc084a6691eb
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version db2a5592-48c8-11ea-a82b-fc084a6691eb, category: COMMIT_INFO
DEBUG:root:Getting raw_data/data, version db2a5592-48c8-11ea-a82b-fc084a6691eb


# Add Preprocessors to the repository

## Select subset of columns as input
In case we do not need all columns of the data for one of our models, we may add a preprocessor selecting only certain columns.

In [9]:
import pailab.externals.numpy_interface as numpy_interface
# Columns fed to the model. 'REGION_RATING_CLIENT_W_CITY' was listed twice,
# which would select the same feature two times; it now appears once.
select_columns = ['CREDIT_INCOME_RATIO', 'ANNUITY_INCOME_RATIO', 'CREDIT_TERM', 'YEARS_EMPLOYED_AGE_RATIO', 'GOODS_PRICE_CREDIT_RATIO',
                  'AGE', 'YEARS_EMPLOYED', 'YEARS_EMPLOYED_ERROR', 'NAME_EDUCATION_TYPE', 'CNT_CHILDREN', 'REGION_RATING_CLIENT',
                  'REGION_RATING_CLIENT_W_CITY', 'REGION_POPULATION_RELATIVE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                  'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
# Register a preprocessor named 'column_selector_1' that keeps only these columns.
numpy_interface.add_preprocessor_select_columns(ml_repo, preprocessor_name='column_selector_1',
                                                preprocessor_param={'columns': select_columns})

DEBUG:pailab.ml_repo.memory_handler:column_selector_1/select_columns added with version db2e99d8-48c8-11ea-82a8-fc084a6691eb, category: PREPROCESSING_TRANSFORMING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version db2e99d8-48c8-11ea-82a8-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:column_selector_1/preprocessor_param added with version db2f35f4-48c8-11ea-9cc9-fc084a6691eb, category: PREPROCESSOR_PARAM
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version db2f35f4-48c8-11ea-9cc9-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:column_selector_1 added with version db2fd1fe-48c8-11ea-a934-fc084a6691eb, category: PREPROCESSOR
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version db2fd1fe-48c8-11ea-a934-fc084a6691eb, category: COMMIT_INFO


## Impute Missing Data

To fill in missing data, we add scikit-learn's SimpleImputer with the median strategy to impute missing values.

In [19]:
import pailab.externals.sklearn_interface as sklearn_interface
from sklearn.preprocessing import StandardScaler
# Register a median imputer for the three EXT_SOURCE columns under the name
# 'SKLSimpleImputer'. `add_indicator=True` appends missing-value indicator
# columns, named here via `output_columns`.
# NOTE(review): six output names are declared; this assumes an indicator column
# is produced for each of the three inputs — confirm against the fitted imputer.
sklearn_interface.add_preprocessor(ml_repo, SimpleImputer(strategy = 'median', add_indicator=True), 
                                   preprocessor_name='SKLSimpleImputer', columns =  ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], 
                                  output_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 
                                                  'EXT_SOURCE_1_MISSING', 'EXT_SOURCE_2_MISSING', 'EXT_SOURCE_3_MISSING'])

DEBUG:pailab.ml_repo.memory_handler:SKLSimpleImputer/transform_sklearn added with version 76b1c482-48c9-11ea-be65-fc084a6691eb, category: PREPROCESSING_TRANSFORMING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version 76b1c482-48c9-11ea-be65-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:SKLSimpleImputer/fit_sklearn added with version 76b26080-48c9-11ea-bc21-fc084a6691eb, category: PREPROCESSING_FITTING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version 76b26080-48c9-11ea-bc21-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:SKLSimpleImputer/preprocessor_param added with version 76b2fc8c-48c9-11ea-abed-fc084a6691eb, category: PREPROCESSOR_PARAM
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version 76b2fc8c-48c9-11ea-abed-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:SKLSimpleImputer added with version 76b3989c-48c9-11ea-a8f0-fc084a6691eb, category: PREPROC

## Remove NaN

In [12]:
# Register a preprocessor named 'RemoveNaN' (presumably drops rows that still
# contain NaN values — confirm against pailab's numpy_interface).
numpy_interface.add_preprocessor_remove_rows_nan(ml_repo, preprocessor_name='RemoveNaN')

DEBUG:pailab.ml_repo.memory_handler:RemoveNaN/select_columns added with version f7cde5d2-48c8-11ea-8d2e-fc084a6691eb, category: PREPROCESSING_TRANSFORMING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f7cde5d2-48c8-11ea-8d2e-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:RemoveNaN/preprocessor_param added with version f7ce81e4-48c8-11ea-8ed7-fc084a6691eb, category: PREPROCESSOR_PARAM
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f7ce81e4-48c8-11ea-8ed7-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:RemoveNaN added with version f7cf1dfe-48c8-11ea-82ae-fc084a6691eb, category: PREPROCESSOR
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f7cf1dfe-48c8-11ea-82ae-fc084a6691eb, category: COMMIT_INFO


## Add Polynomial Features

In [13]:
from sklearn.preprocessing import PolynomialFeatures
# Register a degree-3 PolynomialFeatures transformer, applied to the three
# EXT_SOURCE columns, under the name 'PolynomialFeatures'.
sklearn_interface.add_preprocessor(ml_repo,PolynomialFeatures(degree = 3), preprocessor_name='PolynomialFeatures', 
                                   columns =  ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])

DEBUG:pailab.ml_repo.memory_handler:PolynomialFeatures/transform_sklearn added with version f89104d0-48c8-11ea-b676-fc084a6691eb, category: PREPROCESSING_TRANSFORMING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f89104d0-48c8-11ea-b676-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:PolynomialFeatures/fit_sklearn added with version f891a0e2-48c8-11ea-b8fb-fc084a6691eb, category: PREPROCESSING_FITTING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f891a0e2-48c8-11ea-b8fb-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:PolynomialFeatures/preprocessor_param added with version f8923ce8-48c8-11ea-b87e-fc084a6691eb, category: PREPROCESSOR_PARAM
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f8923ce8-48c8-11ea-b87e-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:PolynomialFeatures added with version f892d8ee-48c8-11ea-b9f0-fc084a6691eb, category:

# Add Training and Test Data

In [14]:
# Split 'raw_data/data' by row position: rows up to 80% of the data form the
# training set, the rest forms the test set.
# NOTE(review): the test set starts at split+1. If pailab treats `end_index`
# as exclusive, the row at index `split` ends up in neither set — confirm.
ml_repo.tree.training_data.add('training_data', 'raw_data/data', end_index = int(0.8*full_data.shape[0]))
ml_repo.tree.test_data.add('test', 'raw_data/data', start_index = int(0.8*full_data.shape[0])+1)

DEBUG:pailab.ml_repo.memory_handler:training_data added with version f93359b8-48c8-11ea-9436-fc084a6691eb, category: TRAINING_DATA
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f93359b8-48c8-11ea-9436-fc084a6691eb, category: COMMIT_INFO
DEBUG:root:Getting training_data, version f93359b8-48c8-11ea-9436-fc084a6691eb
DEBUG:root:Getting raw_data/data, version last
DEBUG:pailab.ml_repo.memory_handler:test added with version f93491c8-48c8-11ea-bdf8-fc084a6691eb, category: TEST_DATA
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version f93491c8-48c8-11ea-bdf8-fc084a6691eb, category: COMMIT_INFO
DEBUG:root:Getting test, version f93491c8-48c8-11ea-bdf8-fc084a6691eb
DEBUG:root:Getting raw_data/data, version last


# Add DecisionTree

In [15]:
import pailab.externals.sklearn_interface as sklearn_interface
# Register a depth-3 gini decision tree as the model, together with the list
# of preprocessors named below (all registered in earlier cells).
# NOTE(review): presumably the preprocessors run in list order — confirm with pailab.
sklearn_interface.add_model(ml_repo, tree.DecisionTreeClassifier( max_depth=3, min_samples_split = 2, criterion = 'gini', random_state=42), 
                           preprocessors=['column_selector_1', 'SKLSimpleImputer', 'RemoveNaN', 'PolynomialFeatures'])

DEBUG:pailab.ml_repo.memory_handler:eval_sklearn added with version fa6d5a90-48c8-11ea-8e57-fc084a6691eb, category: MODEL_EVAL_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version fa6d5a90-48c8-11ea-8e57-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:train_sklearn added with version fa6df694-48c8-11ea-a162-fc084a6691eb, category: TRAINING_FUNCTION
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version fa6df694-48c8-11ea-a162-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:DecisionTreeClassifier/model_param added with version fa6e92a4-48c8-11ea-ab77-fc084a6691eb, category: MODEL_PARAM
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version fa6e92a4-48c8-11ea-ab77-fc084a6691eb, category: COMMIT_INFO
DEBUG:pailab.ml_repo.memory_handler:DecisionTreeClassifier added with version fa7066c0-48c8-11ea-9edf-fc084a6691eb, category: MODEL
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version fa7

In [18]:
# Run the training job for the registered model.
# NOTE(review): the saved output of this cell ends with
# "ValueError: 'EXT_SOURCE_1' is not in list". The imputer cell carries a later
# execution count than this one, so the failure may come from out-of-order
# execution — re-check after Restart & Run All.
ml_repo.run_training()

DEBUG:root:Getting DecisionTreeClassifier, version last
DEBUG:root:Getting training_data, version last
DEBUG:root:Getting raw_data/data, version last
DEBUG:root:Getting column_selector_1, version last
DEBUG:root:Getting SKLSimpleImputer, version last
DEBUG:root:Getting RemoveNaN, version last
DEBUG:root:Getting PolynomialFeatures, version last
DEBUG:root:Getting DecisionTreeClassifier/jobs/training, version is None.
DEBUG:pailab.ml_repo.memory_handler:DecisionTreeClassifier/jobs/training added with version 5df7092e-48c9-11ea-8099-fc084a6691eb, category: JOB
DEBUG:pailab.ml_repo.memory_handler:CommitInfo added with version 5df7092e-48c9-11ea-8099-fc084a6691eb, category: COMMIT_INFO
DEBUG:root:Getting DecisionTreeClassifier/jobs/training, version 5df7092e-48c9-11ea-8099-fc084a6691eb
DEBUG:root:Getting DecisionTreeClassifier, version last
DEBUG:root:Getting training_data, version last
DEBUG:root:Getting raw_data/data, version last
DEBUG:root:Getting training_data, version last
DEBUG:root:

ValueError: 'EXT_SOURCE_1' is not in list

In [16]:
# Scratch cell: default SimpleImputer instance; not referenced by later visible cells.
imputer = SimpleImputer()


In [21]:
# Scratch cell: storing a bool into an `np.empty` array keeps the array's
# float64 dtype (see the cell output).
a=np.empty([10,1])
a[0,0] = True
a.dtype

dtype('float64')

In [None]:
# Debug cell: display the input column names stored on the training data object
# (presumably to investigate the "'EXT_SOURCE_1' is not in list" error).
ml_repo.tree.training_data.training_data.obj.x_coord_names

## Pruning

In [None]:
# Compute the cost-complexity pruning path of a deep (max_depth=60) gini tree.
# NOTE(review): `x_train` / `y_train` are not defined in any visible cell; this
# cell fails on a fresh kernel unless they are created elsewhere.
tree_clf = tree.DecisionTreeClassifier(max_depth=60, min_samples_split = 2, criterion = 'gini', random_state=42)
path = tree_clf.cost_complexity_pruning_path(x_train, y_train)

In [None]:
# Two panels side by side: the pruning path with default slicing (left) and a
# zoom on indices 3..19 (right).
rcParams['figure.figsize'] = 15, 10
plt.subplot(1,2,1)
plot_pruning(path)
plt.subplot(1,2,2)
plot_pruning(path, 3, 20)

## Verschiedene Parametrisierungen

In [None]:
trees = {}

# Depth-limited trees without pruning. Every fit gets its own key: previously
# depths 7, 8 and 9 were all stored as 'example_4', so only the last survived.
for example_no, depth in enumerate([4, 5, 6, 7, 8, 9], start=1):
    tree_clf = tree.DecisionTreeClassifier(max_depth=depth, min_samples_split=2, criterion='gini', random_state=42)
    trees['example_%d' % example_no] = tree_clf.fit(x_train, y_train)

# Deep trees (max_depth=50) regularised by cost-complexity pruning instead.
for example_no, ccp_alpha in enumerate([0.0000021, 0.0000022], start=len(trees) + 1):
    tree_clf = tree.DecisionTreeClassifier(max_depth=50, min_samples_split=2, criterion='gini', random_state=42, ccp_alpha=ccp_alpha)
    trees['example_%d' % example_no] = tree_clf.fit(x_train, y_train)

## ROC und Verteilung der Ausfallwahrscheinlichkeit

In [None]:
# 2x2 figure for the decision trees. Top row: ROC curves on training (left) and
# test data (right). Bottom row: histograms of predicted probabilities on
# training (left) and test data (right).
rcParams['figure.figsize'] = 15, 10
plt.subplot(2,2,1)
# `_ =` swallows the return value so it is not echoed as cell output.
_ = plot_roc(trees, x_train, y_train)
# Override the generic title set inside plot_roc.
plt.title('Receiver operating characteristic, training-data')
plt.subplot(2,2,2)
plot_roc(trees, x_test, y_test)
_ = plt.title('Receiver operating characteristic, test-data')
plt.subplot(2,2,3)
plot_probs(trees, x_train, y_train)

plt.subplot(2,2,4)
plot_probs(trees, x_test, y_test)


## Baum-Statistiken

In [None]:
# Table of depth, leaf count and test-set ROC-AUC per tree.
tree_statistics(trees, x_test, y_test)

## Feature Importance

In [None]:
# Bar chart of feature importances for all trees.
# NOTE(review): `input_data_x` is not defined in any visible cell.
plot_feature_importance(trees, input_data_x.columns)

## Cross-Validation

In [None]:
# 10-fold cross-validated ROC-AUC for several depth limits (fixed ccp_alpha).
for i in [3, 9, 15, 20]:
    t = tree.DecisionTreeClassifier(max_depth=i, min_samples_split = 2, criterion = 'gini', random_state=42, ccp_alpha=0.00005)
    scores = cross_val_score(t, x_train, y_train,cv=10, scoring='roc_auc')
    # The metric is ROC-AUC (scoring='roc_auc'), not accuracy; label it as such.
    print('max_depth: ' + str(i) + ", roc_auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Disabled: 10-fold cross-validated ROC-AUC for several pruning strengths.
if False:
    for alpha in [0.0000021, 0.0000022, 0.0000023, 0.0000024]:
        # Use the loop variable. ccp_alpha used to be hard-coded to 0.0000022,
        # so all four iterations evaluated the same model.
        t = tree.DecisionTreeClassifier(max_depth=50, min_samples_split = 10, criterion = 'gini', random_state=42, ccp_alpha=alpha)
        scores = cross_val_score(t, x_train, y_train,cv=10, scoring='roc_auc')
        print("ccp_alpha: %g, roc_auc: %0.3f (+/- %0.3f)" % (alpha, scores.mean(), scores.std() * 2))

# Random Forests

In [None]:
# Fitted random forests, keyed by display name; filled by the cells below.
forests = {}

In [None]:
# forest_1: 120 trees, max_depth 20, min_samples_split 20, bootstrap sampling.
forest_clf = RandomForestClassifier(max_depth=20, min_samples_split = 20, criterion = 'gini', random_state=42, n_estimators=120, n_jobs = 2, bootstrap = True)
forests['forest_1'] = forest_clf.fit(x_train, y_train)

In [None]:
# forest_2: 80 trees, max_depth 30, min_samples_split 2.
forest_clf = RandomForestClassifier(max_depth=30, min_samples_split = 2, criterion = 'gini', random_state=42, n_estimators=80, n_jobs = 2)
forests['forest_2'] = forest_clf.fit(x_train, y_train)

In [None]:
# forest_3: same settings as forest_2 but with 240 trees.
forest_clf = RandomForestClassifier(max_depth=30, min_samples_split = 2, criterion = 'gini', random_state=42, n_estimators=240, n_jobs = 2)#, max_features=None)
forests['forest_3'] = forest_clf.fit(x_train, y_train)

In [None]:
# forest_4: 500 trees, each fitted on a 30% sample of the training data.
# Stored under its own key; it used to overwrite 'forest_3', which dropped the
# 240-tree forest from every comparison below.
forest_clf = RandomForestClassifier(max_depth=30, min_samples_split = 2, criterion = 'gini', random_state=42, n_estimators=500, n_jobs = 2, max_samples=0.3)#, max_features=None)
forests['forest_4'] = forest_clf.fit(x_train, y_train)

## ROC und Verteilung der Ausfallwahrscheinlichkeit

In [None]:
# 2x2 figure for the random forests. Top row: ROC curves on training (left) and
# test data (right). Bottom row: histograms of predicted probabilities on
# training (left) and test data (right).
rcParams['figure.figsize'] = 15, 10
plt.subplot(2,2,1)
# `_ =` swallows the return value so it is not echoed as cell output.
_ = plot_roc(forests, x_train, y_train)
# Override the generic title set inside plot_roc.
plt.title('Receiver operating characteristic, training-data')
plt.subplot(2,2,2)
plot_roc(forests, x_test, y_test)
_ = plt.title('Receiver operating characteristic, test-data')
plt.subplot(2,2,3)
plot_probs(forests, x_train, y_train)

plt.subplot(2,2,4)
plot_probs(forests, x_test, y_test)

## Forest-Statistiken

In [None]:
# Table of per-tree depth/leaf statistics and test-set ROC-AUC per forest.
forest_statistics(forests, x_test, y_test)

## Feature Importance

In [None]:
# Bar chart of feature importances for all forests.
# NOTE(review): `input_data_x` is not defined in any visible cell.
plot_feature_importance(forests, input_data_x.columns)

# Randomized Trees

In [None]:
extra_tree_clf = {}
extra = ExtraTreesClassifier(max_depth=30, min_samples_split = 2, criterion = 'gini', random_state=42, n_estimators=1000, n_jobs = 2, max_samples=0.1)#, max_features=None)
extra_tree_clf['example_1'] = extra.fit(x_train, y_train)

In [None]:
# 2x2 figure for the extra-trees ensemble. Top row: ROC curves on training
# (left) and test data (right). Bottom row: histograms of predicted
# probabilities on training (left) and test data (right).
rcParams['figure.figsize'] = 15, 10
plt.subplot(2,2,1)
# `_ =` swallows the return value so it is not echoed as cell output.
_ = plot_roc(extra_tree_clf, x_train, y_train)
# Override the generic title set inside plot_roc.
plt.title('Receiver operating characteristic, training-data')
plt.subplot(2,2,2)
plot_roc(extra_tree_clf, x_test, y_test)
_ = plt.title('Receiver operating characteristic, test-data')
plt.subplot(2,2,3)
plot_probs(extra_tree_clf, x_train, y_train)

plt.subplot(2,2,4)
plot_probs(extra_tree_clf, x_test, y_test)

In [None]:
# Table of per-tree depth/leaf statistics and test-set ROC-AUC for the extra-trees ensemble.
forest_statistics(extra_tree_clf, x_test, y_test)

In [None]:
# 10-fold cross-validated ROC-AUC of the extra-trees configuration.
# bootstrap=True is required for `max_samples` to take effect (ExtraTrees
# defaults to bootstrap=False, which current scikit-learn rejects with max_samples).
extra = ExtraTreesClassifier(max_depth=30, min_samples_split = 2, criterion = 'gini', random_state=42, n_estimators=1000, n_jobs = 2, bootstrap=True, max_samples=0.1)#, max_features=None)
scores = cross_val_score(extra, x_train, y_train,cv=10, scoring='roc_auc')
# Report the estimator's own depth. The old message printed `i`, a leftover
# loop variable from the decision-tree cross-validation cell, and called the
# ROC-AUC score "accuracy".
print('max_depth: ' + str(extra.max_depth) + ", roc_auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

In [15]:
# Scratch cell: uninitialised 10x3 array, used only for the dtype check below.
a = np.empty((10,3))

In [16]:
# Scratch cell: display the array's dtype (float64 per the cell output).
a.dtype

dtype('float64')