# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [6]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Base directory for the data files; joined with file names when loading
dir_data = './data/'

### 之前做過的處理

In [7]:
# Read the data file: build the CSV path under dir_data and load it into a DataFrame
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
# Trailing expression so the notebook displays the (rows, columns) of the table
app_train.shape

(307511, 122)

In [8]:
# Label-encode the categorical (object-typed) columns that hold at most two
# distinct values, so they can be included when correlations are computed.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# First pick out the qualifying columns. NaN counts as a distinct value here
# (dropna=False), which matches counting the entries returned by unique().
two_valued_cols = [
    name for name in app_train.columns
    if app_train[name].dtype == 'object'
    and app_train[name].nunique(dropna=False) <= 2
]

# Then replace each of those columns with its integer-encoded version
for col in two_valued_cols:
    app_train[col] = le.fit_transform(app_train[col])

print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Rows whose DAYS_EMPLOYED equals the sentinel 365243 are treated as anomalous:
# record them in a separate boolean column, then turn the sentinel into a
# missing value (np.nan).
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning on
# pandas 2.x and does not update app_train once Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Take the absolute value of the days-of-birth column (DAYS_BIRTH)
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [None]:
# Equal-width binning
# Turn "DAYS_BIRTH" into whole years of age (floor division by 365)
app_train['YEARS_BIRTH'] = app_train['DAYS_BIRTH'].floordiv(365)

# New column "equal_width_YEARS_BIRTH": cut the age range into 10 equal-width bins
app_train["equal_width_YEARS_BIRTH"] = pd.cut(x=app_train["YEARS_BIRTH"], bins=10)

# Count the rows falling into each bin; every bin spans a value range of the same size
app_train["equal_width_YEARS_BIRTH"].value_counts()

equal_width_YEARS_BIRTH
(34.7, 39.6]      42860
(39.6, 44.5]      41416
(29.8, 34.7]      39439
(44.5, 49.4]      35127
(49.4, 54.3]      34942
(54.3, 59.2]      33132
(24.9, 29.8]      32850
(59.2, 64.1]      27473
(19.951, 24.9]    12150
(64.1, 69.0]       8122
Name: count, dtype: int64

In [26]:
# Show the first rows of the age column next to its equal-width bin label
# print(app_train.shape)
app_train[['YEARS_BIRTH','equal_width_YEARS_BIRTH']].head()


Unnamed: 0,YEARS_BIRTH,equal_width_YEARS_BIRTH
0,25,"(24.9, 29.8]"
1,45,"(44.5, 49.4]"
2,52,"(49.4, 54.3]"
3,52,"(49.4, 54.3]"
4,54,"(49.4, 54.3]"


In [28]:
# Equal-frequency binning
# New column "equal_freq_YEARS_BIRTH": split the ages at their deciles (10 quantile bins)
app_train["equal_freq_YEARS_BIRTH"] = pd.qcut(x=app_train["YEARS_BIRTH"], q=10)
# Count the rows falling into each bin. The counts are only roughly equal:
# ages are whole years, so many rows tie on a bin edge and cannot be split evenly.
app_train["equal_freq_YEARS_BIRTH"].value_counts()

equal_freq_YEARS_BIRTH
(19.999, 28.0]    37330
(51.0, 56.0]      35236
(39.0, 43.0]      33597
(28.0, 32.0]      31764
(32.0, 36.0]      31763
(60.0, 69.0]      29368
(43.0, 47.0]      29335
(47.0, 51.0]      26782
(36.0, 39.0]      26442
(56.0, 60.0]      25894
Name: count, dtype: int64

In [29]:
# Show the first rows of the age column with both its equal-width and equal-frequency bin labels
# print(app_train.shape)
app_train[['YEARS_BIRTH','equal_width_YEARS_BIRTH','equal_freq_YEARS_BIRTH']].head()


Unnamed: 0,YEARS_BIRTH,equal_width_YEARS_BIRTH,equal_freq_YEARS_BIRTH
0,25,"(24.9, 29.8]","(19.999, 28.0]"
1,45,"(44.5, 49.4]","(43.0, 47.0]"
2,52,"(49.4, 54.3]","(51.0, 56.0]"
3,52,"(49.4, 54.3]","(51.0, 56.0]"
4,54,"(49.4, 54.3]","(51.0, 56.0]"
