# 머신러닝 파이프라인

`-` 01. 데이터 불러오기 

`-` 02. 파이프라인 생성 및 학습

`-` 03. 파이프라인 함수로 만들기

`-` 04. 학습내용 저장 및 불러오기

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the three CSV files from the working directory in one pass.
train, test, sub = map(
    pd.read_csv,
    ("./train.csv", "./test.csv", "./sample_submission.csv"),
)

# Show (rows, columns) of each frame as the cell output.
tuple(frame.shape for frame in (train, test, sub))

((8693, 14), (4277, 13), (4277, 2))

In [3]:
# DataFrame.info() prints its summary and returns None, so call it once per
# frame instead of building a tuple whose only result is (None, None, None).
for frame in (train, test, sub):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  

(None, None, None)

In [4]:
# Feature columns: five of the float64 columns listed by train.info().
sel = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
# Feature matrix and boolean target.
X, y = train[sel], train["Transported"]

# Hold out part of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### 파이프라인에 사용할 임퓨터, 스케일러, 분류모델 정의

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as DTC

In [6]:
# Objects that become the three pipeline steps below.
imputer = SimpleImputer(strategy = "mean")  # replace missing values with the column mean
scaler = MinMaxScaler()
model = DTC()  # DTC is the alias given to DecisionTreeClassifier in the import cell
# Alternative model left disabled (RF is not imported in the visible cells):
#model = RF()

## 02. 파이프라인 생성 및 학습

In [7]:
from sklearn.pipeline import Pipeline

# Chain imputation -> scaling -> classification into a single estimator.
pipe_line = Pipeline(
    steps=[
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model),
    ]
)
# Fit every step, in order, on the training split.
pipe_line.fit(X_train, y_train)

In [8]:
# Predict labels for the held-out split; the array is shown as the cell output.
pipe_line.predict(X_test)

array([False,  True, False, ...,  True,  True,  True])

### RandomForest (model2)

In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [21]:
# Second configuration: same mean imputer, but with standardization and a
# random forest. These assignments rebind imputer / scaler / model.
imputer = SimpleImputer(strategy = "mean")
# Choices from the first configuration, kept for reference:
#scaler = MinMaxScaler()
#model = DTC()
scaler = StandardScaler()
model = RandomForestClassifier()

In [22]:
from sklearn.pipeline import Pipeline

# Rebuild the pipeline from the step objects defined in the previous cell.
pipe_line = Pipeline(
    steps=[
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model),
    ]
)
# Fit every step, in order, on the training split.
pipe_line.fit(X_train, y_train)

In [16]:
# Keep the pipeline's predictions for the held-out split.
y_pred = pipe_line.predict(X_test)

In [19]:
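# model.score(X_test, y_test)
# NOTE: `model` was fitted on imputed + scaled features inside the pipeline,
# so score through the pipeline instead: pipe_line.score(X_test, y_test)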

## 03. 파이프라인 함수로 만들기

In [23]:
def pipe_line_fnc(X, imputer, scaler, model):
    """Apply the preprocessing steps and the model to X by hand.

    Does what a pipeline's predict does, step by step: X is passed through
    ``imputer.transform``, then ``scaler.transform``, and the result is given
    to ``model.predict``. Only ``transform``/``predict`` are called, so the
    three objects are expected to be fitted already.

    Args:
        X: Feature data accepted by ``imputer.transform``.
        imputer: Object exposing ``transform``; applied first.
        scaler: Object exposing ``transform``; applied second.
        model: Object exposing ``predict``.

    Returns:
        Whatever ``model.predict`` returns for the transformed features.
    """
    features = X
    # The order matters: impute first, then scale.
    for step in (imputer, scaler):
        features = step.transform(features)
    return model.predict(features)

# Reuse the step objects that were passed to the pipeline and fitted by
# pipe_line.fit above; pipe_line_fnc only calls transform / predict on them.
pred_Y = pipe_line_fnc(X_test, imputer, scaler, model)

# Show the first five predictions.
pred_Y[:5]

array([False,  True, False, False,  True])

In [24]:
from sklearn.metrics import accuracy_score
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but the order matters for most metrics.
accuracy_score(y_test, pred_Y)

0.7838086476540939

### 로지스틱 모델 (model3)

In [27]:
# Rebuild the same feature matrix, target and train/test split as before
# (same columns, same random_state).
sel = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
X, y = train[sel], train["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [28]:
# LogisticRegression is only imported in a later cell of this document, so a
# top-to-bottom run would raise NameError here; import it where it is used.
from sklearn.linear_model import LogisticRegression

# Step objects for the logistic-regression pipeline.
imputer3 = SimpleImputer(strategy="mean")  # replace missing values with the column mean
scaler3 = MinMaxScaler()
model3 = LogisticRegression()

In [29]:
# Imputation -> scaling -> logistic regression as one estimator.
pipe_line_Log = Pipeline(
    steps=[
        ("imputer", imputer3),
        ("scaler", scaler3),
        ("model", model3),
    ]
)

# fit() returns the fitted pipeline, so training and prediction are chained.
pred = pipe_line_Log.fit(X_train, y_train).predict(X_test)
# Show the first five predictions.
pred[:5]

array([ True,  True, False, False,  True])

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Check accuracy; arguments follow the documented (y_true, y_pred) order.
print(accuracy_score(y_test, pred))

0.7382704691812327


## 04. 학습내용 저장 및 불러오기

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
import sklearn.externals
import joblib

### KNN 모델을 선택하여 저장해보자. (model4)

In [32]:
# Dedicated step objects for the KNN pipeline.
imputer4 = SimpleImputer(strategy="mean")  # replace missing values with the column mean
scaler4 = MinMaxScaler()
model4 = KNeighborsClassifier()

# Use imputer4 / scaler4 here. Wiring in imputer3 / scaler3 would make this
# pipeline share (and refit) the step objects that belong to pipe_line_Log,
# and would leave imputer4 / scaler4 unused.
pipe_line_knn = Pipeline(
    [
        ("imputer", imputer4),
        ("scaler", scaler4),
        ("model", model4),
    ]
)

pipe_line_knn.fit(X_train, y_train)
pred = pipe_line_knn.predict(X_test)

# Check accuracy; arguments follow the documented (y_true, y_pred) order.
print(accuracy_score(y_test, pred))

0.7723091076356946


### joblib을 이용하여 모델을 파일로 저장하기

In [33]:
import os

# Serialize the fitted KNN pipeline object (all of its steps) to a file in
# the working directory.
joblib.dump(pipe_line_knn, "./model_pipe_knn.joblib" )
# List the working directory to confirm that the file was written.
os.listdir("./")

['.ipynb_checkpoints',
 '2022-10-31-pipeline.ipynb',
 'model_pipe_knn.joblib',
 'sample_submission.csv',
 'test.csv',
 'train.csv']