# 数据挖掘实践任务

## 任务描述

- 特征衍生
- 特征挑选：分别用IV值和随机森林等进行特征选择
- ……以及你能想到的其他特征工程处理

In [1]:
# Load the dataset produced by task 1, then split it into the label column
# ('status') and the remaining feature columns.
import pandas as pd
import numpy as np

data = pd.read_csv('./dataset/task1_proc.csv', encoding='gbk')
label = data['status']
features = data.drop(columns=['status'])

# 查看特征属性与label的相关性

In [7]:
# Correlation of every column with the label, sorted from highest to lowest.
corr_matrix = data.corr()
corr_label = corr_matrix['status'].sort_values(ascending=False)

# Thresholds on positive and negative correlation used to pick features:
# only columns correlated more strongly than these are printed.
p_threshold = 0.05
n_threshold = -0.05
for key, val in corr_label.items():
    if not (val > p_threshold or val < n_threshold):
        continue
    print(key, val)

status 1.0
trans_fail_top_count_enum_last_1_month 0.33041964251993755
history_fail_fee 0.32006559669820817
loans_overdue_count 0.2792316878855482
latest_one_month_fail 0.257207311270889
rank_trad_1_month 0.14707676638259518
trans_fail_top_count_enum_last_6_month 0.13094562903201026
top_trans_count_last_1_month 0.1167819185019159
trans_fail_top_count_enum_last_12_month 0.1165483946924599
trans_day_last_12_month 0.07401728393870509
avg_price_top_last_12_valid_month 0.06992973143813404
latest_six_month_loan 0.06297144629542477
latest_query_day 0.059355339697781825
query_cash_count 0.0544344914448976
latest_three_month_loan 0.052952972369405306
low_volume_percent 0.05175474335192565
consfin_max_limit -0.06330561083001175
consume_top_time_last_1_month -0.06464396328738047
trans_top_time_last_1_month -0.0678142396120604
consfin_credit_limit -0.08389678368330399
consfin_avg_limit -0.09114810916531138
latest_one_month_suc -0.1383301312643708
apply_score -0.24448675882345297
loans_score -0.2593

# 使用随机森林进行特征选择

In [12]:
# Train the model.
# Hyper-parameter tuning is deliberately left out for now.
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=500,
    oob_score=True,
    random_state=2018,
    verbose=2,
)
rf_clf.fit(features, label)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500
building tree 385 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    5.8s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=True, random_state=2018,
                       verbose=2, warm_start=False)

In [36]:
# Out-of-bag evaluation: print the OOB score stored on the fitted forest.
print('oob_score:',rf_clf.oob_score_)

oob_score: 0.797979797979798


In [28]:
# Pick features whose score is above a given threshold.
def select_features(scores,attributes, threshold = 0.01):
    """Rank features by score and keep those strictly above ``threshold``.

    scores: iterable of importance scores, one per feature.
    attributes: iterable of feature names, in the same order as ``scores``.
    threshold: a feature is kept only when its score is greater than this.

    Returns a pair ``(selected, ranked)``: ``selected`` is the list of kept
    feature names in descending score order, and ``ranked`` is the full list
    of ``(score, feature)`` tuples sorted in descending order.
    """
    ranked = list(zip(scores, attributes))
    ranked.sort(reverse=True)
    chosen = [name for value, name in ranked if value > threshold]
    return chosen, ranked

In [32]:
# Rank the features by the forest's importance scores and keep those above
# select_features' default threshold.
features_seleted,feature_score_sorted = select_features(rf_clf.feature_importances_, features.columns)

In [37]:
# Display the (score, feature) pairs sorted by importance score, highest first.
feature_score_sorted

[(0.050186041151736645, 'history_fail_fee'),
 (0.048278592621966, 'trans_fail_top_count_enum_last_1_month'),
 (0.03605750268922589, 'loans_score'),
 (0.031413192669585026, 'apply_score'),
 (0.02652091779889111, 'latest_one_month_fail'),
 (0.023750925766497052, 'loans_overdue_count'),
 (0.017363105309311653, 'max_cumulative_consume_later_1_month'),
 (0.017311195149274296, 'trans_amount_3_month'),
 (0.016070385890153256, 'repayment_capability'),
 (0.01604477137541347, 'latest_query_day'),
 (0.01564516187535364, 'consfin_avg_limit'),
 (0.01547998017197939, 'abs'),
 (0.015433046287849469, 'avg_price_last_12_month'),
 (0.015291669220331984, 'historical_trans_amount'),
 (0.0152324129006679, 'loans_latest_day'),
 (0.01481948451986697, 'trans_amount_increase_rate_lately'),
 (0.014554273701466138, 'trans_activity_day'),
 (0.014350566351837749, 'first_transaction_day'),
 (0.014182978413727659, 'pawns_auctions_trusts_consume_last_6_month'),
 (0.014046196403715284, 'historical_trans_day'),
 (0.014

In [39]:
# Final feature set: only the columns picked by the random-forest filter,
# followed by a report of how many were kept.
new_features = features.loc[:, features_seleted]
print(
    '经过随机森林特征选择后，当阈值为{}时，最后保留的特征数为{}个'.format(
        0.01, len(new_features.columns)
    )
)

经过随机森林特征选择后，当阈值为0.01时，最后保留的特征数为50个


# 使用IV值进行特征选择

In [5]:
# Binning.
def binning(feature,label,p_nums,n_nums,threshold,split_num = 10):
    """Count label==1 and label==0 samples per bin, shrinking the bin count as needed.

    ``feature`` is cut into ``split_num`` bins with ``pd.cut``. For each bin
    the number of samples whose label is 1 and whose label is 0 is counted.
    As soon as a bin's share of either class (count / ``p_nums`` or
    count / ``n_nums``) is below ``threshold``, the attempt is abandoned and
    the whole procedure restarts with one bin fewer.

    feature: values to bin (indexed like ``label``).
    label: 0/1 labels aligned with ``feature``.
    p_nums: total number of label==1 samples.
    n_nums: total number of label==0 samples.
    threshold: minimum per-bin share required for each class.
    split_num: initial number of bins.

    Returns ``(group_p_nums, group_n_nums)``, two arrays with one count per
    bin, or ``(None, None)`` when the bin count has been reduced to zero.
    """
    while True:
        # First cut only yields the bin edges; the second maps samples to bin ids.
        edges = pd.cut(feature, split_num, retbins=True)[1]
        bin_of_sample = pd.cut(feature, edges, labels=False)
        pos_counts = np.zeros(split_num)
        neg_counts = np.zeros(split_num)
        for idx in range(split_num):
            in_bin = label[bin_of_sample == idx]
            pos_counts[idx] = (in_bin == 1).sum()
            neg_counts[idx] = (in_bin == 0).sum()
            if (pos_counts[idx] / p_nums < threshold) or (neg_counts[idx] / n_nums < threshold):
                # This bin is too sparse for one class: retry with fewer bins.
                split_num -= 1
                break
        else:
            # Every bin passed the check.
            return pos_counts, neg_counts
        if split_num <= 0:
            return None, None

In [9]:
# Compute the IV (information value) of one feature.
def cal_iv(feature,label,p_nums,n_nums,threshold=0.005):
    """Return the information value of ``feature`` with respect to ``label``.

    The per-bin counts come from ``binning``. For each bin the label==1 share
    (count / ``p_nums``) and the label==0 share (count / ``n_nums``) are
    computed; the IV is the sum over bins of
    ``(share_1 - share_0) * log(share_1 / share_0)``.

    feature: values of a single feature.
    label: 0/1 labels aligned with ``feature``.
    p_nums: total number of label==1 samples.
    n_nums: total number of label==0 samples.
    threshold: minimum per-bin class share, forwarded to ``binning``.
    """
    pos_counts, neg_counts = binning(feature, label, p_nums, n_nums, threshold)
    pos_share = pos_counts / p_nums
    neg_share = neg_counts / n_nums
    log_ratio = np.log((pos_counts / p_nums) / (neg_counts / n_nums))
    return np.sum((pos_share - neg_share) * log_ratio)

In [10]:
# Build the table mapping every feature to its IV.
def get_feature_iv_table(features,label):
    """Return a DataFrame of all features and their IV, sorted by IV descending.

    features: DataFrame with one column per feature.
    label: 0/1 labels aligned with the rows of ``features``.

    The result has the columns ``'features'`` and ``'IV'``.
    """
    pos_total = np.sum(label == 1)  # number of samples with label 1
    neg_total = np.sum(label == 0)  # number of samples with label 0
    iv_values = [
        cal_iv(features[name], label, pos_total, neg_total)
        for name in features.columns
    ]
    table = pd.DataFrame(
        {'features': features.columns, 'IV': iv_values},
        columns=['features', 'IV'],
    )
    return table.sort_values('IV', ascending=False)

In [11]:
# Min-max scale every feature column, then compute the IV table on the
# scaled features.
# NOTE(review): a column whose max equals its min makes the denominator zero,
# so that column would become NaN; confirm no constant columns are present.
featrues_standard = (features - features.min()) / (features.max() - features.min())
iv_df = get_feature_iv_table(featrues_standard,label)

In [360]:
# Keep the features whose IV is above the cut-off and report how many remain.
threshold = 0.002
new_features = iv_df.loc[iv_df['IV'] > threshold, 'features']
print(new_features)
print(
    '经过IV值进行特征选择后，当阈值为{}时，最后保留的特征数为{}个'.format(
        threshold, len(new_features)
    )
)

50                          loans_score
41                          apply_score
16                    rank_trad_1_month
24          trans_top_time_last_1_month
26        consume_top_time_last_1_month
20         top_trans_count_last_1_month
54                  loans_overdue_count
22    avg_price_top_last_12_valid_month
39                first_transaction_day
13               first_transaction_time
1                 middle_volume_percent
52                          loans_count
60                latest_six_month_loan
9                     regional_mobility
75                consfin_product_count
55             loans_org_count_behavior
70                      loans_max_limit
78                     latest_query_day
57                     loans_cash_count
68              loans_org_count_current
45                     query_cash_count
5                    trans_activity_day
18      avg_consume_less_12_valid_month
32       consume_mini_time_last_1_month
11                         is_high_user
