# note2
> 머신러닝 정리, 특성공학
- toc: true
- branch: master
- badges: false
- comments: true
- author: pinkocto
- categories: [python]

## Machine Learning의 절차
1. 데이터의 결측치/이상치 제거, 처리 (시각화, 가설검정, ...)
2. $X$(설명변수), $Y$(목표변수)를 선언
3. 학습데이터와 검증데이터를 분할
4. 학습데이터를 가져와, 알고리즘을 이용해 학습 실시
5. 검증데이터를 이용하여, 평가작업 실시

## Load Dataset

In [2]:
import pandas as pd

In [3]:
#hide
# Read the raw dataset from the local data folder; the file is decoded as cp949.
df1 = pd.read_csv('./data/Data01.csv', encoding='cp949')

In [4]:
# Dimensions of the raw frame as (rows, columns).
df1.shape

(51304, 18)

In [5]:
# Per-column non-null counts and dtypes; a first look at where values are missing.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51304 entries, 0 to 51303
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         51304 non-null  int64  
 1   id                 51304 non-null  int64  
 2   type_of_contract   51300 non-null  object 
 3   type_of_contract2  51303 non-null  object 
 4   channel            51304 non-null  object 
 5   datetime           51304 non-null  object 
 6   Term               51304 non-null  int64  
 7   payment_type       51304 non-null  object 
 8   product            51303 non-null  object 
 9   amount             51304 non-null  int64  
 10  state              51304 non-null  object 
 11  overdue_count      51304 non-null  int64  
 12  overdue            51302 non-null  object 
 13  credit rating      42521 non-null  float64
 14  bank               48544 non-null  object 
 15  cancellation       51279 non-null  object 
 16  age                405

In [6]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into the CSV).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
df1.isnull().sum() # number of missing values per column

id                       0
type_of_contract         4
type_of_contract2        1
channel                  0
datetime                 0
Term                     0
payment_type             0
product                  1
amount                   0
state                    0
overdue_count            0
overdue                  2
credit rating         8783
bank                  2760
cancellation            25
age                  10795
Mileage              10795
dtype: int64

## 결측치 처리

### 결측값 제거

In [8]:
# Two ways to handle missing values: (1) impute with another value, (2) delete.

# Option 2: discard every row that has at least one missing value.
df2 = df1.dropna(axis=0, how='any')
print(df2.shape)

(40480, 17)


### 결측값 대치

In [9]:
# 결측치 대치
import numpy as np

In [10]:
# Inspect the 'age' column, which contains missing values (NaN).
df1['age']

0        43.0
1        62.0
2        60.0
3        60.0
4        51.0
         ... 
51299     NaN
51300    39.0
51301    51.0
51302    64.0
51303    53.0
Name: age, Length: 51304, dtype: float64

#### 1. Numpy 방법 [ replace() ]

In [11]:
# Approach 1: use numpy's NaN sentinel with Series.replace to turn missing ages into 0.
df1['age(clean)'] = df1['age'].replace(to_replace=np.nan, value=0)

#### 2. pandas [ fillna() ]

In [12]:
# Approach 2: fill missing ages with 0 through pandas' fillna.
df1['age(clean)'] = df1['age'].fillna(value=0)
# Missing-value count per column; the new 'age(clean)' column should show 0.
print(df1.isna().sum())

id                       0
type_of_contract         4
type_of_contract2        1
channel                  0
datetime                 0
Term                     0
payment_type             0
product                  1
amount                   0
state                    0
overdue_count            0
overdue                  2
credit rating         8783
bank                  2760
cancellation            25
age                  10795
Mileage              10795
age(clean)               0
dtype: int64


##### 평균값으로 변환

In [13]:
# Impute missing ages with the mean of the observed ages.
# The mean is computed once, so the printed number is exactly the fill value.
age_mean = df1['age'].mean()
df1['age(clean_mean)'] = df1['age'].fillna(age_mean)
print(age_mean)

46.60828457873559


##### 중앙값으로 변환

In [14]:
# Impute missing ages with the median of the observed ages.
# The median is computed once, so the printed number is exactly the fill value.
age_median = df1['age'].median()
df1['age(clean_median)'] = df1['age'].fillna(age_median)
print(age_median)

46.0


##### 위의 행의 값으로 대치

In [15]:
# Last 7 ages; this slice contains NaN rows, so it is reused to compare the fill strategies.
print(df1['age'].tail(7))

51297    70.0
51298     NaN
51299     NaN
51300    39.0
51301    51.0
51302    64.0
51303    53.0
Name: age, dtype: float64


In [16]:
# Forward fill: each missing age takes the value of the closest non-missing row above it.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of fillna is
# deprecated in recent pandas ('pad' was just an alias for the same fill).
df1['age(pad)'] = df1['age'].ffill()

In [17]:
# Same 7 trailing rows after the forward fill, for comparison with the raw 'age' values.
print(df1['age(pad)'].tail(7))

51297    70.0
51298    70.0
51299    70.0
51300    39.0
51301    51.0
51302    64.0
51303    53.0
Name: age(pad), dtype: float64


##### 아래의 행의 값으로 대치

In [18]:
# Backward fill: each missing age takes the value of the closest non-missing row below it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is deprecated
# in recent pandas.
df1['age(back)'] = df1['age'].bfill()
print(df1['age(back)'].tail(7))

51297    70.0
51298    39.0
51299    39.0
51300    39.0
51301    51.0
51302    64.0
51303    53.0
Name: age(back), dtype: float64


In [19]:
# Backward fill limited to one step: within a run of consecutive NaNs only the row
# directly above a valid value is filled; the rest stay NaN (limit caps the fill range).
# Series.bfill(limit=1) replaces the deprecated fillna(method='bfill', limit=1).
# NOTE(review): the column name 'aeg(back)' looks like a typo for 'age'; it is kept
# as-is because renaming it to 'age(back)' would overwrite the unlimited backward fill.
df1['aeg(back)'] = df1['age'].bfill(limit=1)
print(df1['aeg(back)'].tail(7))

51297    70.0
51298     NaN
51299    39.0
51300    39.0
51301    51.0
51302    64.0
51303    53.0
Name: aeg(back), dtype: float64


## Modeling

이제 본격적으로 분류 모델을 만들어보자.

In [26]:
# List the columns of the NaN-free frame.
# NOTE(review): this cell was executed out of order. The recorded output lists 23
# columns, which matches df2 as rebuilt by the dropna cell that follows; on a fresh
# top-to-bottom run df2 at this point is still the earlier 17-column frame.
print(df2.columns)

Index(['id', 'type_of_contract', 'type_of_contract2', 'channel', 'datetime',
       'Term', 'payment_type', 'product', 'amount', 'state', 'overdue_count',
       'overdue', 'credit rating', 'bank', 'cancellation', 'age', 'Mileage',
       'age(clean)', 'age(clean_mean)', 'age(clean_median)', 'age(pad)',
       'age(back)', 'aeg(back)'],
      dtype='object')


In [23]:
# Rebuild the NaN-free frame now that df1 also carries the imputed helper columns:
# any row with at least one missing value is discarded.
df2 = df1.dropna(axis=0, how='any')
print(df2.shape)

(40480, 23)


In [31]:
# Explanatory variables: contract type, term, product, amount, age and overdue status.
X = df2[['type_of_contract','Term','product','amount','age','overdue']]

# Binary target: '정상' (normal) -> 0, '해약' (cancelled) -> 1.
# A single map() replaces the chained replace() calls, which depend on pandas'
# deprecated implicit object -> int downcasting and would silently keep any
# unexpected label as a string.
Y = df2['cancellation'].map({'정상': 0, '해약': 1})
# map() turns unknown labels into NaN; fail loudly here rather than inside the model.
assert Y.notna().all(), "unexpected label found in 'cancellation'"

In [32]:
# Preview the first two rows of the explanatory variables before encoding.
X.head(2)

Unnamed: 0,type_of_contract,Term,product,amount,age,overdue
0,렌탈,60,K1,96900,43.0,없음
1,렌탈,60,K1,102900,62.0,없음


In [37]:
# One Hot Encoding
X1 = pd.get_dummies(X)
X1.head(2)

Unnamed: 0,Term,amount,age,type_of_contract_렌탈,type_of_contract_멤버십,product_K1,product_K2,product_K3,product_K4,product_K5,product_K6,overdue_없음,overdue_있음
0,60,96900,43.0,1,0,1,0,0,0,0,0,1,0
1,60,102900,62.0,1,0,1,0,0,0,0,0,1,0


In [33]:
# Display the encoded target (pandas abbreviates the long Series in the output).
Y

0        0
1        0
2        0
3        0
4        1
        ..
51295    1
51296    0
51297    1
51300    1
51303    1
Name: cancellation, Length: 40480, dtype: int64

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y, test_size=0.3,
                                                    random_state=1234)

# Sanity-check the sizes of all four splits.
# (The original last line referenced the undefined name `Y_trai` and raised NameError.)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)