# 特徴量エンジニアリング（カテゴリ特徴量）

In [None]:
!pip install -U pip
!pip install scikit-learn==0.20.0
!pip install category_encoders
!git clone https://github.com/nejumi/fe_workshop.git

In [4]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
pd.set_option('display.max_columns', 100)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import category_encoders as ce
from google.colab import files

In [None]:
# ColaboratoryにLendingClub50000.csvをアップロードする。
#uploaded = files.upload() # ローカルからアップロードできるが今回は使用しない。

In [7]:
# Load the LendingClub sample CSV from the fe_workshop directory.
csv_path = 'fe_workshop/dataset/LendingClub50000.csv'
df = pd.read_csv(csv_path)

In [8]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,ID,member_id,year,month,loan_amount,purpose_type,purpose_detail,title,emp_length,home_ownership,annual_inc,zip_code,addr_state,dti,grade,sub_grade,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,bad_loan
0,1529851,1793711,2012,9,16000.0,debt_consolidation,Debt Consolidation / Final Wedding Exp,PwC,3.0,RENT,90400.0,080xx,NJ,23.72,A,A4,0.0,Oct-2003,0.0,,,21.0,0.0,3946.0,22.4,42.0,f,0.0,,INDIVIDUAL,0.0,0.0,164787.0,False
1,1824764,2126933,2012,11,3600.0,debt_consolidation,Crush Credit Cards,Morgan Stanley Smith Barney,2.0,RENT,42500.0,217xx,MD,17.34,B,B3,0.0,Dec-2004,0.0,,,16.0,0.0,10927.0,63.5,30.0,f,0.0,,INDIVIDUAL,0.0,8806.0,96627.0,False
2,403548,442721,2009,5,19200.0,wedding,Consolidate debt and pay for wedding,Aggregate Knowledge,1.0,RENT,95000.0,940xx,CA,7.58,B,B4,0.0,Mar-1999,0.0,,,11.0,0.0,14006.0,20.3,26.0,f,0.0,,INDIVIDUAL,0.0,,,False
3,646411,799671,2011,1,21000.0,credit_card,Wells BofA Credit Card Refi,Emerson Process Management,1.0,RENT,85000.0,926xx,CA,17.07,B,B5,0.0,Jun-1999,1.0,,,12.0,0.0,36280.0,47.6,25.0,f,0.0,,INDIVIDUAL,0.0,,,False
4,552526,711946,2010,7,4000.0,other,Debt Consolidation,Home Depot,1.0,RENT,63500.0,303xx,GA,19.5,C,C4,1.0,Oct-2002,2.0,23.0,,16.0,0.0,595.0,11.4,24.0,f,0.0,,INDIVIDUAL,0.0,,,False


## One-hotエンコーディング

In [9]:
# One-hot encode the 'grade' column with pandas.get_dummies and preview it.
grade_dummies = pd.get_dummies(df['grade'])
grade_dummies.head()

Unnamed: 0,A,B,C,D,E,F,G
0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0


In [12]:
# One-hot encode 'grade' with scikit-learn's OneHotEncoder.
# fit_transform returns a SciPy sparse matrix by default, so slice the first
# five rows and densify only those with .toarray() (the explicit spelling of
# the `.A` shorthand) instead of converting the whole matrix to a NumPy array.
ohe = OneHotEncoder()
ohe.fit_transform(df[['grade']])[:5].toarray()

array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.]])

In [6]:
# One-hot encode 'grade' with category_encoders' OneHotEncoder.
# Unlike pd.get_dummies, the output carries an extra "grade_-1" column that
# corresponds to unknown labels.
ohe = ce.OneHotEncoder()
grade_onehot = ohe.fit_transform(df[['grade']])
grade_onehot.head()

Unnamed: 0,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,grade_7,grade_-1
0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0


In [7]:
# Can 'addr_state' be handled the same way? Look at the raw values first.
state_frame = df[['addr_state']]
state_frame.head()

Unnamed: 0,addr_state
0,NJ
1,MD
2,CA
3,CA
4,GA


In [8]:
# One-hot encoding a column with as many distinct values as 'addr_state'
# makes the number of columns balloon.
ohe = ce.OneHotEncoder()
state_onehot = ohe.fit_transform(df[['addr_state']])
state_onehot.head()

Unnamed: 0,addr_state_1,addr_state_2,addr_state_3,addr_state_4,addr_state_5,addr_state_6,addr_state_7,addr_state_8,addr_state_9,addr_state_10,addr_state_11,addr_state_12,addr_state_13,addr_state_14,addr_state_15,addr_state_16,addr_state_17,addr_state_18,addr_state_19,addr_state_20,addr_state_21,addr_state_22,addr_state_23,addr_state_24,addr_state_25,addr_state_26,addr_state_27,addr_state_28,addr_state_29,addr_state_30,addr_state_31,addr_state_32,addr_state_33,addr_state_34,addr_state_35,addr_state_36,addr_state_37,addr_state_38,addr_state_39,addr_state_40,addr_state_41,addr_state_42,addr_state_43,addr_state_44,addr_state_45,addr_state_46,addr_state_47,addr_state_48,addr_state_49,addr_state_50,addr_state_-1
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Labelエンコーディング

In [9]:
# Label-encode 'addr_state' with scikit-learn's LabelEncoder.
# LabelEncoder works on a 1-D array of labels, so pass the column as a Series;
# a one-column DataFrame is 2-D and triggers the column_or_1d conversion
# warning.
le = LabelEncoder()
le.fit_transform(df['addr_state'])[:5]

  y = column_or_1d(y, warn=True)


array([30, 20,  4,  4, 10])

In [10]:
# Encode 'addr_state' with category_encoders' OrdinalEncoder.
# The labels assigned differ from LabelEncoder's, but the idea is basically
# the same.
# Note, however, that this differs considerably from DataRobot's
# OrdinalEncoding.
oe = ce.OrdinalEncoder()
state_ordinal = oe.fit_transform(df[['addr_state']])
state_ordinal.head()

Unnamed: 0,addr_state
0,1
1,2
2,3
3,3
4,4


## Frequency/Countエンコーディング

In [11]:
# Count the number of observations per state.
by_state = df.groupby(['addr_state'])
summary = by_state[['addr_state']].count()
summary.head()

Unnamed: 0_level_0,addr_state
addr_state,Unnamed: 1_level_1
AK,152
AL,617
AR,346
AZ,1097
CA,8707


In [12]:
# Replace each row's state with that state's observation count
# (count/frequency encoding).
# Series.map looks the values up against the count column in a single call,
# instead of invoking summary.loc once per row through apply; to_frame()
# keeps the result a one-column DataFrame as before.
df['addr_state'].map(summary['addr_state']).to_frame().head()

Unnamed: 0,addr_state
0,2182
1,1196
2,8707
3,8707
4,1620


## Targetエンコーディング

In [13]:
# Check how many observations each 'addr_state' category has, to see how
# common the rarely observed categories are.
state_counts = df['addr_state'].value_counts()
state_counts

CA    8707
NY    4722
TX    3773
FL    3701
NJ    2182
IL    1933
PA    1796
VA    1630
GA    1620
OH    1523
MA    1424
NC    1273
MD    1196
AZ    1097
WA    1093
MI    1070
CO     986
CT     888
MO     834
MN     793
NV     685
AL     617
WI     597
LA     577
OR     575
SC     566
KS     415
KY     411
OK     411
AR     346
UT     338
NM     256
HI     255
RI     250
NH     234
DC     213
WV     208
AK     152
MT     139
DE     135
WY     114
VT      98
SD      92
IN      28
MS      15
TN      11
IA       9
ID       7
NE       3
ME       2
Name: addr_state, dtype: int64

In [14]:
# Target-encode 'addr_state' with category_encoders' TargetEncoder.
# min_samples_leaf is set to 100, chosen with the value_counts result as a
# guide.
# This amounts to replacing each state code with that state's average
# bad-loan rate.
te = ce.TargetEncoder(min_samples_leaf=100)
state_features = df[['addr_state']]
bad_loan_target = df[['bad_loan']]
te.fit_transform(state_features, bad_loan_target).head()

Unnamed: 0,addr_state
0,0.156737
1,0.155518
2,0.155737
3,0.155737
4,0.166049


In [15]:
# The value used by TargetEncoder is not limited to the prediction target.
# Here 'annual_inc' is chosen, so each state code is replaced with that
# state's average annual income.
#
# This is useful when a particularly impactful feature exists, and because
# the prediction target is not used there is almost no concern about target
# leakage.
te = ce.TargetEncoder(min_samples_leaf=50)
state_features = df[['addr_state']]
income_target = df[['annual_inc']]
result = te.fit_transform(state_features, income_target)
result.head()

Unnamed: 0,addr_state
0,75177.647562
1,79444.082366
2,72415.453038
3,72415.453038
4,69727.534377


In [16]:
# Adopt the per-state average annual income in place of the state code.
df['addr_state'] = result['addr_state']

In [17]:
# Save the result to CSV, leaving out the index.
output_csv = 'LendingClub50000_ave_income_state.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Download the saved file, then try predicting again in DataRobot.
encoded_csv = 'LendingClub50000_ave_income_state.csv'
files.download(encoded_csv)