# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [26]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Set data_path: directory that holds the input CSV files
dir_data = './data/'

### 之前做過的處理

In [27]:
# Read the data file: application_train.csv located under dir_data
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
# Last expression of the cell, so the notebook displays (rows, columns)
app_train.shape

(307511, 122)

In [28]:
# Label-encode every object-dtype column that holds at most two distinct
# values, so that these columns can be included when correlation
# coefficients are computed.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Go through the columns one at a time
for col in app_train.columns:
    # Anything that is not an object (string-like) column is left untouched
    if app_train[col].dtype != 'object':
        continue
    # Skip categorical columns with more than two distinct values;
    # dropna=False makes NaN count as a value, exactly like len(unique())
    if app_train[col].nunique(dropna=False) > 2:
        continue
    # Two-valued categorical column: replace the labels with integer codes
    app_train[col] = le.fit_transform(app_train[col])
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Rows whose DAYS_EMPLOYED equals the anomalous value 365243 are flagged in a
# separate boolean column, then the anomalous value is turned into a missing
# value (np.nan).
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form raises a FutureWarning on
# pandas 2.x and no longer updates app_train once Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Take the absolute value of the days-of-birth column (DAYS_BIRTH)
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [30]:
###----用 DAYS_BIRTH  與 TARGET的corr 繪製目標值平均與分組組別的長條圖-----##

In [46]:
# Correlation coefficient between DAYS_BIRTH and TARGET
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

-0.07823930830982712

In [47]:
# NOTE: the triple-quoted string below is disabled code (an alternative way to
# read the DAYS_BIRTH / TARGET correlation from the full correlation matrix).
# It is a plain string expression, so the cell only echoes the string itself.
'''
#DAYS_BIRTH 與 TARGET 的相關係數
app_train.corr()['TARGET']['DAYS_BIRTH']
'''

"\n#DAYS_BIRTH 與 TARGET 的相關係數\napp_train.corr()['TARGET']['DAYS_BIRTH']\n"

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of DAYS_BIRTH
app_train['DAYS_BIRTH'].describe()

count    307511.000000
mean      16036.995067
std        4363.988632
min        7489.000000
25%       12413.000000
50%       15750.000000
75%       19682.000000
max       25229.000000
Name: DAYS_BIRTH, dtype: float64

In [50]:
# Age in years: DAYS_BIRTH divided by 365, stored as a new column
app_train['DAYS_BIRTH_Year'] = app_train['DAYS_BIRTH']/365

In [51]:
# Summary statistics of the age-in-years column, used to pick bin edges
app_train['DAYS_BIRTH_Year'].describe()

count    307511.000000
mean         43.936973
std          11.956133
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: DAYS_BIRTH_Year, dtype: float64

In [55]:
# Equal-width binning (noted as "useless" by the author): split the age in
# years into 7 intervals of equal width and store the interval labels.
app_train['DAYS_BIRTH_Year_width'] = pd.cut(app_train['DAYS_BIRTH_Year'], bins=7)
# Show the first rows, including the new column
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,AMT_INCOME_TOTAL_width,AMT_INCOME_TOTAL_freq,DAYS_BIRTH_width,DAYS_BIRTH_Year,DAYS_BIRTH_Year_width
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,1.0,False,"(-91324.35, 5874367.5]","(147150.0, 202500.0]","(7471.26, 10023.286]",25.920548,"(20.469, 27.461]"
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,False,"(-91324.35, 5874367.5]","(202500.0, 117000000.0]","(15091.857, 17626.143]",45.931507,"(41.348, 48.291]"
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,False,"(-91324.35, 5874367.5]","(25649.999, 112500.0]","(17626.143, 20160.429]",52.180822,"(48.291, 55.234]"
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,,,,,False,"(-91324.35, 5874367.5]","(112500.0, 147150.0]","(17626.143, 20160.429]",52.068493,"(48.291, 55.234]"
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,False,"(-91324.35, 5874367.5]","(112500.0, 147150.0]","(17626.143, 20160.429]",54.608219,"(48.291, 55.234]"


In [60]:
# Explicit 10-year bin edges: 20, 30, 40, 50, 60, 70
bin_10y = list(range(20, 71, 10))
# Discretize the age in years with those edges (right-closed intervals)
app_train['DAYS_BIRTH_Year_width'] = pd.cut(
    app_train['DAYS_BIRTH_Year'],
    bins=bin_10y,
)
# Number of rows that fall into each age interval
app_train['DAYS_BIRTH_Year_width'].value_counts()

(30, 40]    82308
(40, 50]    76541
(50, 60]    68062
(20, 30]    45021
(60, 70]    35579
Name: DAYS_BIRTH_Year_width, dtype: int64

In [61]:
# Equal-frequency binning (noted as "useless" by the author): split the age in
# years into 4 quantile-based intervals.
app_train['DAYS_BIRTH_Year_freq'] = pd.qcut(app_train['DAYS_BIRTH_Year'], q=4)
# Display the resulting interval label of every row
app_train['DAYS_BIRTH_Year_freq']

0         (20.517, 34.008]
1         (43.151, 53.923]
2         (43.151, 53.923]
3         (43.151, 53.923]
4         (53.923, 69.121]
                ...       
307506    (20.517, 34.008]
307507    (53.923, 69.121]
307508    (34.008, 43.151]
307509    (20.517, 34.008]
307510    (43.151, 53.923]
Name: DAYS_BIRTH_Year_freq, Length: 307511, dtype: category
Categories (4, interval[float64]): [(20.517, 34.008] < (34.008, 43.151] < (43.151, 53.923] < (53.923, 69.121]]

In [32]:
# Equal-width binning (noted as "useless" by the author): split
# AMT_INCOME_TOTAL into 20 intervals of equal width.
app_train['AMT_INCOME_TOTAL_width'] = pd.cut(app_train['AMT_INCOME_TOTAL'], bins=20)
# Display the resulting interval label of every row
app_train['AMT_INCOME_TOTAL_width']

0         (-91324.35, 5874367.5]
1         (-91324.35, 5874367.5]
2         (-91324.35, 5874367.5]
3         (-91324.35, 5874367.5]
4         (-91324.35, 5874367.5]
                   ...          
307506    (-91324.35, 5874367.5]
307507    (-91324.35, 5874367.5]
307508    (-91324.35, 5874367.5]
307509    (-91324.35, 5874367.5]
307510    (-91324.35, 5874367.5]
Name: AMT_INCOME_TOTAL_width, Length: 307511, dtype: category
Categories (20, interval[float64]): [(-91324.35, 5874367.5] < (5874367.5, 11723085.0] < (11723085.0, 17571802.5] < (17571802.5, 23420520.0] ... (93605130.0, 99453847.5] < (99453847.5, 105302565.0] < (105302565.0, 111151282.5] < (111151282.5, 117000000.0]]

In [33]:
# Equal-frequency binning (noted as "useless" by the author): split
# AMT_INCOME_TOTAL into 4 quantile-based intervals.
app_train['AMT_INCOME_TOTAL_freq'] = pd.qcut(app_train['AMT_INCOME_TOTAL'], q=4)
# Display the resulting interval label of every row
app_train['AMT_INCOME_TOTAL_freq']

0            (147150.0, 202500.0]
1         (202500.0, 117000000.0]
2           (25649.999, 112500.0]
3            (112500.0, 147150.0]
4            (112500.0, 147150.0]
                   ...           
307506       (147150.0, 202500.0]
307507      (25649.999, 112500.0]
307508       (147150.0, 202500.0]
307509       (147150.0, 202500.0]
307510       (147150.0, 202500.0]
Name: AMT_INCOME_TOTAL_freq, Length: 307511, dtype: category
Categories (4, interval[float64]): [(25649.999, 112500.0] < (112500.0, 147150.0] < (147150.0, 202500.0] < (202500.0, 117000000.0]]