In [3]:
# Ch4 データの表現と特徴量エンジニアリング
#
# 今まではデータが2次元の浮動小数点配列(連続値特徴量)として得られることを仮定していた
#   -> カテゴリ特徴量のような、離散値特徴量を見ていく
#
# 特徴量エンジニアリング: 特定のアプリケーションに対して、最良のデータ表現を模索すること
#   データを正しく表現することはめちゃ大事
#
#
# 4.1 カテゴリ変数
#
# 4.1.1 ワンホットエンコーディング（ダミー変数）
#
# * one-hot-encoding / one-out-of-N encoding
# * カテゴリ変数を1つ以上の0と1を持つ新しい特徴量で置き換える
# * pandas を使ってやる方が楽

import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

%matplotlib inline

adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income']
data = pd.read_csv(
    adult_path, header=None, index_col=False, names=names)
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']]
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [4]:
print(data.gender.value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


In [5]:
print("Original features:\n", list(data.columns), "\n")
data_dummies =- pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))

Original features:
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

Features after get_dummies:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-i

In [6]:
data_dummies.head()

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,-39,-40,0,0,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,-1,0
1,-50,-13,0,0,0,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,-1,0
2,-38,-40,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,-1,0
3,-53,-40,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,-1,0
4,-28,-40,0,0,0,0,-1,0,0,0,...,0,0,0,-1,0,0,0,0,-1,0


In [8]:
# features = data_dummies.ix[:, 'age': 'occupation_ Transport-moving']
features = data_dummies.loc[:, 'age': 'occupation_ Transport-moving']

# Pick up NumPy array
X = features.values
y = data_dummies['income_ >50K'].values
print(f"X.shape: {X.shape} y.shape: {y.shape}")

X.shape: (32561, 44) y.shape: (32561,)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))

Test score: 0.81


In [11]:
# 連続値として扱っていいのか、離散値として扱わないといけないのか、データを把握しておく
# pandas の get_dummies は数値データは数値データのまま扱うので、例えば workingclass が 0~8 として格納されていたら、
# dummy data を用意しない

demo_df = pd.DataFrame({'Integer Feature': [0, 1, 2, 1], 'Categorical Feature': ['socks', 'fox', 'socks', 'box']})
display(demo_df)

Unnamed: 0,Categorical Feature,Integer Feature
0,socks,0
1,fox,1
2,socks,2
3,box,1


In [12]:
pd.get_dummies(demo_df)

Unnamed: 0,Integer Feature,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,0,0,0,1
1,1,0,1,0
2,2,0,0,1
3,1,1,0,0


In [13]:
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature'])

Unnamed: 0,Integer Feature_0,Integer Feature_1,Integer Feature_2,Categorical Feature_box,Categorical Feature_fox,Categorical Feature_socks
0,1,0,0,0,0,1
1,0,1,0,0,1,0
2,0,0,1,0,0,1
3,0,1,0,1,0,0
