In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from io import StringIO

In [2]:
csv_data = '''A,B,C,D,E
5.0,2.0,3.0,,6
1.0,6.0,,8.0,5
0.0,11.0,12.0,4.0,5
3.0,,3.0,5.0,
5.0,1.0,4.0,2.0,4
'''

In [3]:
df = pd.read_csv(StringIO(csv_data))

In [4]:
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


## Missing Data (空值資料處理)

In [5]:
df.dropna()

Unnamed: 0,A,B,C,D,E
2,0.0,11.0,12.0,4.0,5.0
4,5.0,1.0,4.0,2.0,4.0


In [6]:
df.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [7]:
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [8]:
df.fillna(0)

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,0.0,6.0
1,1.0,6.0,0.0,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,0.0,3.0,5.0,0.0
4,5.0,1.0,4.0,2.0,4.0


In [9]:
df['B'] = df['B'].fillna(df['B'].mean())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [25]:
df['C'] = df['C'].fillna(df['C'].mode())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,4.0
4,5.0,1.0,4.0,2.0,4.0


In [11]:
df['D'] = df['D'].fillna(df['D'].median())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,
4,5.0,1.0,4.0,2.0,4.0


In [12]:
df['E'] = df['E'].fillna(df['E'].min())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,4.5,6.0
1,1.0,6.0,,8.0,5.0
2,0.0,11.0,12.0,4.0,5.0
3,3.0,5.0,3.0,5.0,4.0
4,5.0,1.0,4.0,2.0,4.0


In [14]:
df2 = pd.DataFrame([['green','M',10.1,1],
                   ['red','L',13.5,2],
                   ['blue','XL',15.3,1]],columns = ['color','size','price','classlabel'])
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,2
2,blue,XL,15.3,1


In [15]:
size_mapping = {
    'XL':3,
    'L':2,
    'M':1
}
df2['size'] = df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,2
2,blue,3,15.3,1


In [16]:
pd.get_dummies(df2['color'])

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [17]:
#加上 color 讓後續使用了解 one hot encoding 是從 color 欄位而來
onehot_encoding = pd.get_dummies(df2['color'], prefix = 'color')

In [18]:
#刪除 color 欄位
df2 = df2.drop('color',1)

In [19]:
#將 onehot_encoding 和 df2 合併
pd.concat([onehot_encoding, df2], axis = 1)

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.1,1
1,0,0,1,2,13.5,2
2,1,0,0,3,15.3,1


## 資料正規化 (Normalization)

In [27]:
iris = datasets.load_iris()
x = pd.DataFrame(iris['data'], columns=iris['feature_names'])
print('target_names:' + str(iris['target_names']))
y = pd.DataFrame(iris['target'], columns=['target_names'])
data = pd.concat([x,y], axis=1)
data.head(3)

target_names:['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [31]:
data['sepal length (cm)'] = (data['sepal length (cm)'] - data['sepal length (cm)'].min())/\
                            (data['sepal length (cm)'].max() - data['sepal length (cm)'].min())

In [32]:
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,3.5,1.4,0.2,0
1,0.166667,3.0,1.4,0.2,0
2,0.111111,3.2,1.3,0.2,0
3,0.083333,3.1,1.5,0.2,0
4,0.194444,3.6,1.4,0.2,0


## 資料標準化 (Standardization)

In [33]:
data['sepal width (cm)'] = (data['sepal width (cm)'] - data['sepal width (cm)'].mean())/\
                            data['sepal width (cm)'].std()

In [34]:
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,1.028611,1.4,0.2,0
1,0.166667,-0.12454,1.4,0.2,0
2,0.111111,0.33672,1.3,0.2,0
3,0.083333,0.10609,1.5,0.2,0
4,0.194444,1.259242,1.4,0.2,0
