In [1]:
import pandas as pd
import numpy as np

# The raw file ships without a header row, and missing values are written as '?'.
# Read with the defaults, pandas promotes the first record to column names.
df = pd.read_csv(
    filepath_or_buffer='http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data'
)
df.head(n=5)

Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [2]:
# The raw file has no header row, so the column names are supplied by hand.
headers = [
    'symboling', 'normalized_losses', 'make',
    'fuel_type', 'aspiration', 'num_doors',
    'body_style', 'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'engine_type', 'num_cylinders', 'engine_size', 'fuel_system',
    'bore', 'stroke', 'compression_ratio',
    'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg', 'price',
]

# Load the file again: label the columns with the list above and
# turn every question mark ('?') into NaN while parsing.
df = pd.read_csv(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data',
    names=headers,
    header=None,
    na_values='?',
)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
# Copy the object-dtype (string) columns, then keep only the three needed below.
wanted_columns = ['num_doors', 'body_style', 'drive_wheels']
obj_df = df.select_dtypes(include=['object']).copy()[wanted_columns]
obj_df.head()

Unnamed: 0,num_doors,body_style,drive_wheels
0,two,convertible,rwd
1,two,convertible,rwd
2,two,hatchback,rwd
3,four,sedan,fwd
4,four,sedan,4wd


In [4]:
# Inspect the dtype of each selected column.
print(obj_df.dtypes)

num_doors       object
body_style      object
drive_wheels    object
dtype: object


In [5]:
# .any() is True when at least one element is True; .all() needs every element True.
# With axis=1 the reduction runs across the columns, giving one flag per row;
# axis=0 would instead give one flag per column.
# Show every row that has a NaN in at least one column.
nan_row_mask = obj_df.isna().any(axis=1)
obj_df.loc[nan_row_mask]

Unnamed: 0,num_doors,body_style,drive_wheels
27,,sedan,fwd
63,,sedan,fwd


In [6]:
# value_counts() tallies each distinct value; NaN entries are left out.
obj_df['num_doors'].value_counts(dropna=True)

four    114
two      89
Name: num_doors, dtype: int64

In [7]:
# .size is the total number of entries in the column, NaN included.
obj_df.loc[:, 'num_doors'].size

205

In [8]:
# .count() only counts the non-NaN entries of the column.
obj_df.loc[:, 'num_doors'].count()

203

In [9]:
# Fill the missing door counts with 'four', the more frequent of the two values.
fill_values = {'num_doors': 'four'}
obj_df = obj_df.fillna(value=fill_values)

In [10]:
# Tally again after the fill: the formerly missing rows now count as 'four'.
obj_df['num_doors'].value_counts(dropna=True)

four    116
two      89
Name: num_doors, dtype: int64

## Find and Replace

In [11]:
# Word -> number lookup tables, keyed by column name.
to_replace = {
    'num_doors': {'four': 4, 'two': 2},
    'num_cylinders': {
        'four': 4, 'six': 6, 'five': 5, 'eight': 8,
        'two': 2, 'twelve': 12, 'three': 3,
    },
}

# obj_df was narrowed to num_doors / body_style / drive_wheels earlier, so the
# 'num_cylinders' table has no column to act on. DataFrame.replace would skip it
# without any message; filtering here makes that explicit.
applicable = {
    column: mapping
    for column, mapping in to_replace.items()
    if column in obj_df.columns
}

# Rebind instead of using inplace=True so the cell stays a plain
# "inputs -> new frame" step, and cast the replaced columns explicitly rather
# than relying on replace() to downcast an object column to integers for us.
obj_df = (
    obj_df
    .replace(to_replace=applicable)
    .astype({column: 'int64' for column in applicable})
)
obj_df.head()

Unnamed: 0,num_doors,body_style,drive_wheels
0,2,convertible,rwd
1,2,convertible,rwd
2,2,hatchback,rwd
3,4,sedan,fwd
4,4,sedan,4wd


In [12]:
# Re-check the column dtypes now that the number words have been replaced.
obj_df.dtypes

num_doors        int64
body_style      object
drive_wheels    object
dtype: object

## Label Encoding

### pandas Categorical

In [13]:
# a1 keeps the original object-dtype Series; a2 is the same column cast to category.
a1 = obj_df['body_style']
a2 = a1.astype('category')
print('ori data type(a1):', a1.dtype)
print('new data type(a2):', a2.dtype)

ori data type(a1): object
new data type(a2): category


In [14]:
# a1 is object dtype, so a1.cat.codes is not available.
# a2 is category dtype, so its integer codes can be read directly.
pd_cat = a2.cat.codes  # string labels -> integer codes
print('pd_cat:', pd_cat.head(5).tolist())  # first five codes, converted from Series to list

pd_cat: [0, 0, 2, 3, 3]


In [15]:
# The categories that the integer codes in pd_cat refer to.
body_categories = a2.cat.categories
print(body_categories)

Index(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'], dtype='object')


In [16]:
# Build a {code: category} lookup so the integer codes are easy to read back.
a2_classes = {code: category for code, category in enumerate(a2.cat.categories)}
print(a2_classes)

{0: 'convertible', 1: 'hardtop', 2: 'hatchback', 3: 'sedan', 4: 'wagon'}


### sklearn.preprocessing.LabelEncoder

In [17]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw string column, then encode that column in a second step.
le = LabelEncoder()
le.fit(a1)
sk_cat = le.transform(a1)
print('sk_cat:', sk_cat[:5].tolist())

sk_cat: [0, 0, 2, 3, 3]


In [18]:
# Print the first ten codes from each encoder on adjacent lines for comparison.
for label, codes in (('pandas :', pd_cat), ('sklearn:', sk_cat)):
    print(label, codes.tolist()[0:10])

pandas : [0, 0, 2, 3, 3, 3, 3, 4, 3, 2]
sklearn: [0, 0, 2, 3, 3, 3, 3, 4, 3, 2]


## One Hot Encoding

### pandas get_dummies()

In [19]:
# Work on just the two categorical columns used in the one-hot examples.
b = obj_df.loc[:, ['body_style', 'drive_wheels']]
b.head()

Unnamed: 0,body_style,drive_wheels
0,convertible,rwd
1,convertible,rwd
2,hatchback,rwd
3,sedan,fwd
4,sedan,4wd


In [20]:
# One-hot encode drive_wheels only; body_style is passed through unchanged.
# dtype=int pins the indicator columns to 0/1 integers. Without it, pandas >= 2.0
# produces boolean True/False columns instead.
pd.get_dummies(data=b, columns=['drive_wheels'], dtype=int).head()

Unnamed: 0,body_style,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,convertible,0,0,1
1,convertible,0,0,1
2,hatchback,0,0,1
3,sedan,0,1,0
4,sedan,1,0,0


#### The new data frame replaces `drive_wheels` with three indicator columns
* drive_wheels_4wd
* drive_wheels_fwd
* drive_wheels_rwd

#### You can label the new columns by `prefix`

In [21]:
# One-hot encode both columns; each entry in `prefix` names the new columns for
# the matching entry in `columns`.
# dtype=int pins the indicator columns to 0/1 integers. Without it, pandas >= 2.0
# produces boolean True/False columns instead.
pd.get_dummies(
    data=b,
    columns=['body_style', 'drive_wheels'],
    prefix=['body', 'drive'],
    dtype=int,
).head()

Unnamed: 0,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,1
3,0,0,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0


### sklearn.preprocessing.LabelBinarizer
#### 可直接將 string value 轉換成 one-hot 的 0/1 integer 陣列

In [22]:
from sklearn.preprocessing import LabelBinarizer

# Learn the body_style classes first, then expand each label into a 0/1 row.
lb_style = LabelBinarizer()
lb_style.fit(b['body_style'])
lb_results = lb_style.transform(b['body_style'])
lb_results

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ..., 
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

In [23]:
# The class labels learned by the binarizer; used below as the column names for lb_results.
lb_style.classes_

array(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'],
      dtype='<U11')

In [24]:
# Wrap the binarized array in a DataFrame, labelling each column with its class.
pd.DataFrame(data=lb_results, columns=lb_style.classes_).head(n=5)

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0


## Reference
[Guide to Encoding Categorical Values in Python](http://pbpython.com/categorical-encoding.html)