### Python金融风控策略实战（基于Jupyter Notebook）
## <center>德国信贷风控数据建模（步骤2：对离散数据进行编码）</center>
### <center>策略制定及验证：张君颖  ； 报告日期：2020.11.5</center>
  <font color=blue><center>本报告不构成投资建议，转载需注明作者且不得删改</center></font>
  <font color=blue><center>作者邮箱：zhang.jun.ying@outlook.com</center></font>

### 第一步：导入需要使用的python库，并进行数据描述

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore") # 忽略警告

### 数据概况
数据来源：加州大学（UCI）欧文机器学习库    
http://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29  

数据概况：“德国信用数据”将申请贷款的人分为“好信用”和“坏信用”，样本数据1000人，数据维度20维   

特征向量：1.支票账户状态；2.借款周期；3.历史信用；4.借款目的；5.信用额度；6.储蓄账户状态；7.当前就业状态；8.分期付款占可支配收入百分比；9.性别与婚姻状态；10.他人担保信息；11.现居住地；12.财产状态；13.年龄；14.其他分期情况；15.房产状态；16.信用卡数量；17.工作状态；18.赡养人数；19.电话号码注册情况；20.是否有海外工作经历

特征向量对应英文：1.status_account, 2.duration, 3.credit_history, 4,purpose, 5.amount, 6.svaing_account, 7.present_emp, 8.income_rate, 9.personal_status, 10.other_debtors, 11.residence_info, 12.property, 13.age, 14.inst_plans, 15.housing, 16.num_credits, 17.job, 18.dependents, 19.telephone, 20.foreign_worker    

输出结果：好信用（0），坏信用（1）   

数据收录时间：1994-11-17    

数据上传者：Professor Dr. Hans Hofmann 汉斯霍夫曼博士（德国汉堡大学）  

### 第二步：下载数据至本地，保存成csv格式，使用pandas导入数据

In [2]:
df = pd.read_csv('C:\\Users\\lotbear\\Desktop\\Data-science！\\金融风控实战数据\\german.csv',
                     delim_whitespace = True, header = None )
# 变量重命名
columns = ['status_account','duration','credit_history','purpose', 'amount',
            'svaing_account', 'present_emp', 'income_rate', 'personal_status',
            'other_debtors', 'residence_info', 'property', 'age',
            'inst_plans', 'housing', 'num_credits',
            'job', 'dependents', 'telephone', 'foreign_worker', 'target']
df.columns = columns

# 将标签变量由状态1,2转为0,1; 0表示好用户，1表示坏用户
df.target = df.target - 1

### 查看数据信息

In [3]:
df.head()

Unnamed: 0,status_account,duration,credit_history,purpose,amount,svaing_account,present_emp,income_rate,personal_status,other_debtors,...,property,age,inst_plans,housing,num_credits,job,dependents,telephone,foreign_worker,target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   status_account   1000 non-null   object
 1   duration         1000 non-null   int64 
 2   credit_history   1000 non-null   object
 3   purpose          1000 non-null   object
 4   amount           1000 non-null   int64 
 5   svaing_account   1000 non-null   object
 6   present_emp      1000 non-null   object
 7   income_rate      1000 non-null   int64 
 8   personal_status  1000 non-null   object
 9   other_debtors    1000 non-null   object
 10  residence_info   1000 non-null   int64 
 11  property         1000 non-null   object
 12  age              1000 non-null   int64 
 13  inst_plans       1000 non-null   object
 14  housing          1000 non-null   object
 15  num_credits      1000 non-null   int64 
 16  job              1000 non-null   object
 17  dependents       1000 non-null   i

In [5]:
df.describe()

Unnamed: 0,duration,amount,income_rate,residence_info,age,num_credits,dependents,target
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155,0.3
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086,0.458487
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0,0.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0,0.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0,0.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0,1.0


### 将数据分为data_train（训练集）和 data_test（测试集）两部分

In [6]:
# 方便后续自定义函数调用/读取数据
    
def data_read(data_path,file_name):
    df = pd.read_csv( os.path.join(data_path, file_name), delim_whitespace = True, header = None )
    columns = ['status_account','duration','credit_history','purpose', 'amount',
               'svaing_account', 'present_emp', 'income_rate', 'personal_status',
               'other_debtors', 'residence_info', 'property', 'age',
               'inst_plans', 'housing', 'num_credits',
               'job', 'dependents', 'telephone', 'foreign_worker', 'target']
    df.columns = columns
    df.target = df.target - 1
    data_train, data_test = train_test_split(df, test_size=0.2, random_state=0,stratify=df.target)
    return data_train, data_test

if __name__ == '__main__':
    data_path = os.path.join('C:\\Users\\lotbear\\Desktop\\Data-science！','金融风控实战数据')
    file_name = 'german.csv'
    ##读取数据
    data_train, data_test = data_read(data_path,file_name)
    ##不可排序变量
    var_no_order = ['credit_history','purpose', 'personal_status', 'other_debtors',
                  'inst_plans', 'housing', 'job','telephone', 'foreign_worker']

### 第三步：使用 one—hot 编码

In [7]:
def onehot_encode(df,data_path_1,flag='train'):
    df = df.reset_index(drop=True)
    # 判断数据集是否存在缺失值
    if sum(df.isnull().any()) > 0 :
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        var_numerics = df.select_dtypes(include=numerics).columns
        var_str = [ i for i in df.columns if i not in  var_numerics ]
        # 数据类型的缺失值用-77777填补
        if len(var_numerics) > 0:
            df.loc[:,var_numerics] = df[var_numerics].fillna(-7777)
        # 字符串类型的缺失值用NA填补
        if len(var_str) > 0:
            df.loc[:,var_str] = df[var_str].fillna('NA')
            
    if flag == 'train':
        enc = OneHotEncoder(dtype='int').fit(df)
        # 保存编码模型
        save_model = open(os.path.join(data_path_1 ,'onehot.pkl'), 'wb')
        pickle.dump(enc, save_model, 0)
        save_model.close()
        df_return = pd.DataFrame( enc.transform(df).toarray())
        df_return.columns = enc.get_feature_names(df.columns)
        
    elif flag =='test':
        # 测试数据编码
        read_model = open(os.path.join(data_path_1 ,'onehot.pkl'),'rb')
        onehot_model = pickle.load(read_model)
        read_model.close()
        # 如果训练集无缺失值，测试集有缺失值则将该样本删除
        var_range = onehot_model.categories_
        var_name = df.columns
        del_index = []
        for i in range(len(var_range)):
            if 'NA' not in var_range[i]and 'NA' in df[var_name[i]].unique():
                index = np.where( df[var_name[i]] == 'NA')
                del_index.append(index)
            elif -7777 not in var_range[i] and -7777 in df[var_name[i]].unique():
                index = np.where( df[var_name[i]] == -7777)
                del_index.append(index)
        # 删除样本
        if len(del_index) > 0:
            del_index = np.unique(del_index)
            df = df.drop(del_index)
            print('训练集无缺失值，但测试集有缺失值，第{0}条样本被删除'.format(del_index))
        df_return = pd.DataFrame(onehot_model.transform( df).toarray())
        df_return.columns = onehot_model.get_feature_names(df.columns)
        
    elif flag == 'transform':
        # 编码数据值转化为原始变量
        read_model = open(os.path.join(data_path_1,'onehot.pkl'),'rb')
        onehot_model = pickle.load(read_model)
        read_model.close()
        # 逆变换
        df_return = pd.DataFrame( onehot_model.inverse_transform(df) )
        df_return.columns  = np.unique( ['_'.join(i.rsplit('_')[:-1] ) for i in df.columns])
    return df_return

In [8]:
# 训练数据编码
data_train.credit_history[882] = np.nan
data_train_encode = onehot_encode(data_train[var_no_order],data_path,flag='train')

In [9]:
# 测试集数据编码
data_test.credit_history[529] = np.nan
data_test.purpose[355] = np.nan
data_test_encode = onehot_encode(data_test[var_no_order],data_path,flag='test')

训练集无缺失值，但测试集有缺失值，第[1]条样本被删除


In [10]:
data_test_encode

Unnamed: 0,credit_history_A30,credit_history_A31,credit_history_A32,credit_history_A33,credit_history_A34,credit_history_NA,purpose_A40,purpose_A41,purpose_A410,purpose_A42,...,housing_A152,housing_A153,job_A171,job_A172,job_A173,job_A174,telephone_A191,telephone_A192,foreign_worker_A201,foreign_worker_A202
0,0,0,0,0,0,1,1,0,0,0,...,1,0,0,1,0,0,0,1,1,0
1,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
2,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,1,0,1,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
195,0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,1,1,0
196,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,1,1,0,1,0
197,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0


In [11]:
# 改回
data_inverse = onehot_encode(data_test_encode,data_path,flag='transform')
data_inverse

Unnamed: 0,credit_history,foreign_worker,housing,inst_plans,job,other_debtors,personal_status,purpose,telephone
0,,A40,A93,A101,A143,A152,A172,A192,A201
1,A32,A40,A93,A101,A143,A152,A173,A191,A201
2,A32,A41,A93,A101,A143,A152,A173,A191,A201
3,A32,A46,A93,A101,A143,A153,A173,A191,A201
4,A34,A40,A93,A101,A143,A152,A173,A191,A201
...,...,...,...,...,...,...,...,...,...
194,A32,A40,A93,A101,A143,A152,A173,A191,A201
195,A32,A40,A93,A101,A143,A152,A173,A192,A201
196,A32,A410,A91,A102,A143,A152,A174,A191,A201
197,A34,A49,A91,A101,A143,A152,A173,A192,A201


In [12]:
# 哑变量编码
data_train_dummies =  pd.get_dummies(data_train[var_no_order])
data_test_dummies =  pd.get_dummies(data_test[var_no_order])
data_train_dummies.columns

Index(['credit_history_A30', 'credit_history_A31', 'credit_history_A32',
       'credit_history_A33', 'credit_history_A34', 'purpose_A40',
       'purpose_A41', 'purpose_A410', 'purpose_A42', 'purpose_A43',
       'purpose_A44', 'purpose_A45', 'purpose_A46', 'purpose_A48',
       'purpose_A49', 'personal_status_A91', 'personal_status_A92',
       'personal_status_A93', 'personal_status_A94', 'other_debtors_A101',
       'other_debtors_A102', 'other_debtors_A103', 'inst_plans_A141',
       'inst_plans_A142', 'inst_plans_A143', 'housing_A151', 'housing_A152',
       'housing_A153', 'job_A171', 'job_A172', 'job_A173', 'job_A174',
       'telephone_A191', 'telephone_A192', 'foreign_worker_A201',
       'foreign_worker_A202'],
      dtype='object')

### 第四步：使用 Label 标签编码

In [13]:
def label_encode(df,data_path_1,flag='train'):
    if flag == 'train':
        enc = LabelEncoder().fit( df )
        # 保存编码模型
        save_model = open(os.path.join(data_path_1 ,'labelcode.pkl'), 'wb')
        pickle.dump(enc, save_model, 0)
        save_model.close()
        df_return = pd.DataFrame( enc.transform(df))
        df_return.name = df.name
        
    elif flag =='test':
        # 测试数据编码
        read_model = open(os.path.join(data_path_1 ,'labelcode.pkl'),'rb')
        label_model = pickle.load(read_model)
        read_model.close()
        df_return = pd.DataFrame(label_model.transform( df))
        df_return.name = df.name

    elif flag == 'transform':
        # 编码数据值转化为原始变量
        read_model = open(os.path.join(data_path_1 ,'labelcode.pkl'),'rb')
        label_model = pickle.load(read_model)
        read_model.close()
        # 逆变换
        df_return = pd.DataFrame( label_model.inverse_transform(df) )
    return df_return

In [14]:
# 可排序变量
var_order = ['status_account','svaing_account', 'present_emp', 'property']

# 训练数据编码
data_train_encode = label_encode(data_train[var_order[1]],data_path,flag='train')

In [15]:
# 验证集数据编码
data_test_encode = label_encode(data_test[var_order[1]],data_path,flag='test')
data_test_encode 

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
195,0
196,1
197,0
198,3


In [16]:
#改回
data_inverse = label_encode(data_test_encode,data_path,flag='transform')
data_inverse 

Unnamed: 0,0
0,A61
1,A61
2,A61
3,A61
4,A61
...,...
195,A61
196,A62
197,A61
198,A64


### 第五步：使用 dict 自定义映射编码

In [17]:
def dict_encode(df,data_path_1):
    # 自定义映射
    embarked_mapping = {}
    embarked_mapping['status_account'] = {'NA': 1, 'A14': 2, 'A11':3,'A12': 4,'A13':5}  
    embarked_mapping['svaing_account'] = {'NA': 1, 'A65': 1, 'A61':3,'A62': 5,'A63':6,'A64':8}  
    embarked_mapping['present_emp'] = {'NA': 1, 'A71': 2, 'A72':5,'A73': 6,'A74':8,'A75':10}  
    embarked_mapping['property'] = {'NA': 1, 'A124': 1, 'A123':4,'A122': 6, 'A121':9 } 

    df = df.reset_index(drop=True)
    # 判断数据集是否存在缺失值
    if sum(df.isnull().any()) > 0 :
        df = df.fillna('NA')
    # 字典映射
    var_dictEncode = []        
    for i in df.columns:
        col = i + '_dictEncode'
        df[col] = df[i].map(embarked_mapping[i])
        var_dictEncode.append(col)
    return df[var_dictEncode]

In [18]:
# 训练数据编码
data_train.credit_history[882] = np.nan
data_train_encode = dict_encode(data_train[var_order],data_path)

In [19]:
# 测试集数据编码
data_test.status_account[529] = np.nan
data_test_encode = dict_encode(data_test[var_order],data_path)

In [20]:
data_test_encode

Unnamed: 0,status_account_dictEncode,svaing_account_dictEncode,present_emp_dictEncode,property_dictEncode
0,1,3,5,9
1,4,3,5,9
2,3,3,5,9
3,3,3,8,6
4,4,3,6,1
...,...,...,...,...
195,4,3,8,4
196,4,5,6,4
197,3,3,2,4
198,5,8,6,9


### 第六步：使用 WOE 编码

In [21]:
def woe_cal_trans(x, y, target=1):
    # 计算总体的正负样本数
    p_total = sum(y == target)
    n_total = len(x)-p_total
    value_num = list(x.unique())
    woe_map = {}
    iv_value = 0
    for i in value_num:
        # 计算该变量取值箱内的正负样本总数    
        y1 = y[np.where(x == i)[0]]
        p_num_1 = sum(y1 == target)
        n_num_1 = len(y1) - p_num_1
        # 计算占比
        bad_1 = p_num_1 / p_total
        good_1 =  n_num_1 / n_total
        if bad_1 == 0:
            bad_1 = 1e-5
        elif good_1 == 0:
            good_1 = 1e-5
        woe_map[i] = np.log(bad_1 / good_1)
        iv_value += (bad_1 - good_1) * woe_map[i]
    x_woe_trans = x.map(woe_map)
    x_woe_trans.name = x.name + "_woe"
    return x_woe_trans, woe_map, iv_value

In [22]:
def woe_encode(df,data_path_1,varnames, y, filename,flag='train'):
    """
    WOE编码映射
    ---------------------------------------
    Param
    df: pandas dataframe,待编码数据
    data_path_1 :存取文件路径
    varnames: 变量列表
    y:  目标变量
    filename:编码存取的文件名
    flag: 选择训练还是测试
    ---------------------------------------
    Return
    df: pandas dataframe, 编码后的数据，包含了原始数据
    woe_maps: dict,woe编码字典
    iv_values: dict, 每个变量的IV值
    """  
    df = df.reset_index(drop=True)
    # 判断数据集是否存在缺失值
    if sum(df.isnull().any()) > 0 :
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        var_numerics = df.select_dtypes(include=numerics).columns
        var_str = [ i for i in df.columns if i not in  var_numerics ]
        # 数据类型的缺失值用-77777填补
        if len(var_numerics) > 0:
            df.loc[:,var_numerics] = df[var_numerics].fillna(-7777)
        # 字符串类型的缺失值用NA填补
        if len(var_str) > 0:
            df.loc[:,var_str] = df[var_str].fillna('NA')
    if flag == 'train':
        iv_values = {}
        woe_maps = {}
        var_woe_name = []
        for var in varnames:
 #           var = 'loan_amnt_BIN'
            x = df[var]
            # 变量映射
            x_woe_trans, woe_map, info_value = woe_cal_trans(x, y)
            var_woe_name.append(x_woe_trans.name)
            df = pd.concat([df, x_woe_trans], axis=1)
            woe_maps[var] = woe_map
            iv_values[var] = info_value
        # 保存woe映射字典
        save_woe_dict = open(os.path.join(data_path_1 ,filename+'.pkl'), 'wb')
        pickle.dump(woe_maps, save_woe_dict, 0)
        save_woe_dict.close()
        return df, woe_maps, iv_values ,var_woe_name
    elif flag == 'test':
         ##测试数据编码
        read_woe_dict = open(os.path.join(data_path_1 ,filename+'.pkl'),'rb')
        woe_dict = pickle.load(read_woe_dict)
        read_woe_dict.close()
        ##如果训练集无缺失值，测试集有缺失值则将该样本删除
        woe_dict.keys()
        del_index = []
        for key,value in woe_dict.items():
            if 'NA' not in value.keys() and 'NA' in df[key].unique():
                index = np.where(df[key] == 'NA')
                del_index.append(index)
            elif -7777 not in value.keys() and -7777 in df[key].unique():
                index = np.where(df[key] == -7777)
                del_index.append(index)
        # 删除样本
        if len(del_index) > 0:
            del_index = np.unique(del_index)
            df = df.drop(del_index)
            print('训练集无缺失值，但测试集有缺失值，该样本{0}删除'.format(del_index))
        # WOE编码映射
        var_woe_name = []
        for key,value in woe_dict.items():
            val_name =  key+ "_woe"
            df[val_name] = df[key].map(value)
            var_woe_name.append(val_name)
        return df, var_woe_name

In [23]:
# 训练集WOE编码
# df_train_woe, dict_woe_map, dict_iv_values ,var_woe_name = woe_encode(data_train,data_path,var_no_order, data_train.target, 'dict_woe_map',flag='train')

# 测试集WOE编码
# df_test_woe, var_woe_name = woe_encode(data_test,data_path,var_no_order, data_train.target, 'dict_woe_map',flag='test')