# 1. 导入包

In [1]:
# -*- coding: utf-8 -*-
"""
Python 3.7.7
sklearn 0.23.1
使用决策树预测德国人信贷风险
"""

# 导入包
import numpy as np 
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 2. 导入数据集

In [2]:
# Load the credit dataset from a CSV file resolved relative to the current
# working directory.
data = pd.read_csv(filepath_or_buffer="german_credit_data.csv")

# 3. 数据预处理

## 3.1 检测并处理缺失值

In [3]:
# Count the missing values in every column (column-wise sum of the null mask)
# and show the result as the cell output.
null_df = data.isnull().sum(axis=0)
null_df

NO.                   0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

#### 需要处理Saving accounts 和 Checking account 这2个字段

In [4]:

# Replace the missing values of the two account columns with the placeholder
# category 'none'. The original author treats a missing entry as "this person
# has no such bank account" -- NOTE(review): confirm against the data source.
for col in ['Saving accounts', 'Checking account']:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection `data[col]`: the in-place form operates on an intermediate
    # object, raises a FutureWarning on pandas 2.x and no longer updates `data`
    # once Copy-on-Write is active, which would silently leave the NaNs in place.
    data[col] = data[col].fillna('none')

In [5]:
# Re-count the missing values per column to verify that the fill step left
# no nulls behind; the result is shown as the cell output.
null_df = data.isnull().sum(axis=0)
null_df

NO.                 0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

## 3.2 处理类别型变量

In [6]:
# Convert the integer-coded Job column to object dtype. The dtypes are printed
# before and after the cast so the change is visible in the cell output.
# Why: the later pd.get_dummies call only encodes object/category columns by
# default, so Job must not stay int64 if it is to be one-hot encoded.
print(data.dtypes)
data['Job'] = data['Job'].astype(object)
print(data.dtypes)

NO.                  int64
Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object
NO.                  int64
Age                  int64
Sex                 object
Job                 object
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object


In [7]:
# One-hot encode the categorical columns. drop_first=True removes the first
# level of each encoded column, so k categories become k-1 indicator columns.
data = pd.get_dummies(
    data=data,
    drop_first=True,
)

## 3.3 得到自变量和因变量

In [8]:
# Separate the target from the features.
# The target is the 'Risk_good' indicator column produced by the one-hot
# encoding step; every remaining column becomes a feature.
# NOTE(review): the 'NO.' column is still part of the feature matrix. It looks
# like a row identifier -- confirm whether it should be excluded from x.
y = data['Risk_good'].values
data = data.drop(columns=['Risk_good'])
x = data.values

## 3.4 拆分训练集和测试集

In [9]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.2,
)

## 3.5 特征缩放

In [10]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted statistics are then applied to both splits.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

# 4. 使用不同的参数构建决策树模型

## 4.1 模型1：构建决策树模型

### 4.1.1 构建模型

In [11]:
# Build decision trees with different hyperparameters.
# Model 1: entropy criterion, depth capped at 3, at least 50 samples per leaf.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=50,
    random_state=0,
)
# fit() is the last expression so the fitted estimator is shown as the cell output.
classifier.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=50,
                       random_state=0)

### 4.1.2 测试集做预测

In [12]:
# Predict the class labels of the held-out test set with the fitted model.
y_pred = classifier.predict(X=x_test)

### 4.1.3 评估模型性能

In [13]:
# Evaluate the model on the test set: accuracy, confusion matrix and the
# per-class precision/recall/F1 report, one per line.
# The recorded run printed an accuracy of 0.705.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.705
[[  9  50]
 [  9 132]]
              precision    recall  f1-score   support

           0       0.50      0.15      0.23        59
           1       0.73      0.94      0.82       141

    accuracy                           0.70       200
   macro avg       0.61      0.54      0.53       200
weighted avg       0.66      0.70      0.65       200



### 4.1.4 画出树形结构

#### 准备工作：
1. 安装graphviz库，使用命令 pip install graphviz
2. 安装graphviz应用程序，并添加环境变量

##### 注：下面代码在Jupyter中无法运行，因此用三引号包裹成字符串（单元格不会执行其中的代码）；需要时请在控制台中运行

In [14]:
# Disabled tree-plotting snippet: the code below is wrapped in a string literal,
# so the cell only evaluates (and echoes) the string and never runs the code.
# It is meant to be executed from a console with the graphviz package and the
# Graphviz `dot` executable installed.
# NOTE(review): export_graphviz is called with out_file='output.dot'. According
# to the scikit-learn docs the dot source is only returned when out_file is
# None, so graphviz.Source(...) presumably receives None here -- confirm before
# re-enabling the snippet.
'''
# 画出树结构
from sklearn import tree
import graphviz
graphviz.Source(tree.export_graphviz(classifier, out_file='output.dot'))

# 将 dot 文件转换成其它文件
import os
os.system('dot -Tpng output.dot -o output.png') # 转换成png文件
'''

"\n# 画出树结构\nfrom sklearn import tree\nimport graphviz\ngraphviz.Source(tree.export_graphviz(classifier, out_file='output.dot'))\n\n# 将 dot 文件转换成其它文件\nimport os\nos.system('dot -Tpng output.dot -o output.png') # 转换成png文件\n"

## 4.2 模型2：构建决策树模型

In [15]:
# Model 2: gini criterion, depth capped at 9, at least 10 samples per leaf.
# The name `classifier` is rebound, so the first model is no longer reachable
# through it after this cell runs.
classifier = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    min_samples_leaf=10,
    random_state=0,
)
# fit() is the last expression so the fitted estimator is shown as the cell output.
classifier.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=9, min_samples_leaf=10, random_state=0)

In [16]:
# Predict the class labels of the held-out test set with the second model.
y_pred = classifier.predict(X=x_test)

In [17]:
# Evaluate the second model on the test set: accuracy, confusion matrix and
# the per-class precision/recall/F1 report, one per line.
# The recorded run printed an accuracy of 0.715.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.715
[[ 27  32]
 [ 25 116]]
              precision    recall  f1-score   support

           0       0.52      0.46      0.49        59
           1       0.78      0.82      0.80       141

    accuracy                           0.71       200
   macro avg       0.65      0.64      0.64       200
weighted avg       0.71      0.71      0.71       200



#### 结论：
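由上面2个模型可见，超参数的取值会影响模型性能：模型2（criterion = 'gini', max_depth = 9, min_samples_leaf = 10）在测试集上的准确率为0.715，略高于模型1（criterion = 'entropy', max_depth = 3, min_samples_leaf = 50）的0.705；同时类别0的召回率从0.15提高到0.46，而类别1的召回率从0.94下降到0.82。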