### preprocessing (전처리)
- Data cleansing
- Data Encoding : 텍스트 데이터 -> 숫자로 변환 (범주형 데이터)
- Data Scaling : 숫자값 정규화
- Outlier : 이상치
- Feature Engineering : 속성 생성/수정/가공공

In [476]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Label Encoder
- 범주형 데이터에 대해 적절히 숫자로 변환하는 것

In [477]:
from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '세탁기', '컴퓨터', '전기난로', '컴퓨터',\
    'TV','믹서기','컴퓨터']

encoder = LabelEncoder()
encoded_items = encoder.fit_transform(items)
encoded_items


array([0, 1, 3, 5, 4, 5, 0, 2, 5])

In [478]:
encoder.classes_

array(['TV', '냉장고', '믹서기', '세탁기', '전기난로', '컴퓨터'], dtype='<U4')

#### One-hot Encoder
- 주어진 데이터를 희소배열로 변환 (One-vs-Reset 배열)
- 희소 배열이란 대부분이 0이고 특정 인덱스만 값을 가지고 있는 배열

In [479]:
from sklearn.preprocessing import OneHotEncoder
items = np.array(items).reshape(-1,1)
encoder = OneHotEncoder()
oh_items = encoder.fit_transform(items)
oh_items

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (9, 6)>

In [480]:
print(oh_items)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (9, 6)>
  Coords	Values
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 3)	1.0
  (3, 5)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 0)	1.0
  (7, 2)	1.0
  (8, 5)	1.0


In [481]:
print(oh_items.toarray())

[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [482]:
encoder.categories_

[array(['TV', '냉장고', '믹서기', '세탁기', '전기난로', '컴퓨터'], dtype='<U4')]

- DataFrame에서 One-hot encoding 하기

In [483]:
df = pd.DataFrame({
    'items' : ['TV', '냉장고', '세탁기', '컴퓨터', '전기난로'
               , '컴퓨터','TV','믹서기','컴퓨터']
})
# df

In [484]:
df_dummies=pd.get_dummies(df, dtype=int)
# 2차원 배열로 바꿔줌 np.array(df_dummies) 와 같음
nd_dummies = df_dummies.to_numpy()

### Data Scaling( Feature Scaling)

In [485]:
from sklearn.datasets import load_iris

iris_data = load_iris()
iris_data.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [486]:
from sklearn.preprocessing import StandardScaler
standard_sc = StandardScaler()
standard_sc.fit_transform(iris_data.data)


array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

##### 표준 정규화(StandardScaler)
- 평균이 0, 표준편차가 1인 값으로 변환
- 이상치에 덜 민감하고, 선형회귀 및 로지스틱 회귀 등의 알고리즘에 적합
- 데이터가 정규분포인 경우 더욱 적합함

#### 최소최대 정규화(MinMaxScaler)
- 0~1 사이의 값으로 변환
- SVM 및 KNN과 같은 거리 기반 모델에 적합
- 이상치에 민감하게 반응, 이상치가 있는경우 데이터 왜곡 가능성

In [487]:
from sklearn.preprocessing import MinMaxScaler
minmax_sc = MinMaxScaler()
minmax_sc.fit_transform(iris_data.data)

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

### 타이타닉 생존율 예측에 필요한 전처리 해보기

In [488]:
df = pd.read_csv('./data/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [489]:
# 전처리 -> 함수
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 

def fillna(df):
    '''
    결측치 처리 함수
    - Age : 평균치 대체
    - Cabin : 'N'이라는 기본값으로 대체
    - Embarked : 'N' 기본값으로 대체
    '''
    #Age
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    #Cabin
    df['Cabin'] = df['Cabin'].fillna('N')
    
    # Embarked
    df['Embarked'] = df['Embarked'].fillna('N')
    
    return df
        

def drop_feature(df):
    '''
    모델훈련과 관련 없는 속성 제거
    - PassengerId, Name, Ticket
    '''
    return df.drop(['PassengerId','Name','Ticket'], axis=1)

def encode_feature(df):
    '''
    범주형 데이터를 숫자로 인코딩
    - sex, Cabin, Embarked
    '''
    df['Cabin'] = df['Cabin'].str[:1]
    categories = ['Sex', 'Cabin', 'Embarked']
    for cate_item in categories:
        label_encoder = LabelEncoder()
        df[cate_item] = label_encoder.fit_transform(df[cate_item])

    return df

def scaling_feature(train_data, test_data):
    '''
    특성 스케일링
    '''
    standard = StandardScaler()
    train_scaled = standard.fit_transform(train_data)
    test_scaled = standard.fit_transform(test_data)
    
    return train_scaled, test_data

def preprocess_data(df):
    df = drop_feature(df)
    df = fillna(df)
    df = encode_feature(df)
    return df

In [490]:
# 전처리 함수 호출
df = preprocess_data(df)

In [491]:
# 훈련-테스트 데이터 분리
from sklearn.model_selection import train_test_split

# 입력-라벨 데이터 분리
titanic_input = df.drop(['Survived'],axis=1)
titanic_label = df['Survived']

X_train, X_test, y_train, y_test = \
    train_test_split(titanic_input,titanic_label, test_size=.2, random_state=0)

In [492]:
# 특성 스케일링
X_scaled_train, X_scaled_test = scaling_feature(X_train, X_test)

In [493]:
# LogisticRegression 훈련
from sklearn.linear_model import LogisticRegression
lr_classfier = LogisticRegression()
lr_classfier.fit(X_scaled_train, y_train)

In [494]:
# 평가
lr_classfier.score(X_scaled_train, y_train), \
    lr_classfier.score(X_scaled_test, y_test)



(0.7935393258426966, 0.6201117318435754)