# Membuat Pipeline 

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [3]:
# Load the Titanic passengers indexed by PassengerId and discard the columns
# that are not used as features, in one non-mutating expression.
unused_columns = ['Name', 'Ticket', 'Age', 'Cabin']
df = pd.read_csv('data/titanic.csv', index_col='PassengerId').drop(columns=unused_columns)

In [4]:
# Separate the target column from the feature columns.
target = 'Survived'
y = df[target]
x = df.drop(columns=[target])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shape of every part as a quick check of the split.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((712, 6), (179, 6), (712,), (179,))

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [11]:
# Numeric features: fill missing values with the column mean, then rescale
# every column with MinMaxScaler.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" encodes a category that was not
# present during fit as all zeros instead of raising, so a cross-validation
# fold or the test split containing a rare category cannot abort scoring.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])



In [12]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
693,3,male,0,0,56.4958,S
482,2,male,0,0,0.0,S
528,1,male,0,0,221.7792,S
856,3,female,0,1,9.35,S
802,2,female,1,1,26.25,S


In [18]:
# Route each group of columns through its own preprocessing pipeline.
numeric_columns = ['SibSp', 'Parch', 'Fare']
categorical_columns = ['Pclass', 'Sex', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numeric_columns),
        ("categorical", categorical_pipeline, categorical_columns),
    ]
)

In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# Full model: the column preprocessing followed by a k-nearest-neighbours
# classifier left at its default settings.
preprocessing_step = ('prep', preprocessor)
classifier_step = ('algo', KNeighborsClassifier())
pipeline = Pipeline(steps=[preprocessing_step, classifier_step])

In [29]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(x_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder())]),
                                                  ['Pclass', 'Sex',
                         

In [30]:
# Mean accuracy of the fitted pipeline on the held-out test split.
pipeline.score(x_test, y_test)

0.776536312849162

# Membuat Model dengan GridSearchCV

In [31]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Hyperparameters to search. The "algo__" prefix routes each entry to the
# pipeline step named 'algo'.
parameter = dict(
    algo__n_neighbors=range(1, 51, 2),  # odd k from 1 to 49
    algo__weights=['uniform', 'distance'],
    algo__p=[1, 2],
)

# Exhaustive search over the grid with 3-fold cross-validation.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

model.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('im

In [39]:
# Tabulate every candidate from the search, best-ranked first.
cv_results = pd.DataFrame(model.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algo__n_neighbors,param_algo__p,param_algo__weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
40,0.075215,0.028367,0.095931,0.014849,21,1,uniform,"{'algo__n_neighbors': 21, 'algo__p': 1, 'algo_...",0.815126,0.818565,0.810127,0.814606,0.003465,1
36,0.101112,0.028743,0.111473,0.023468,19,1,uniform,"{'algo__n_neighbors': 19, 'algo__p': 1, 'algo_...",0.815126,0.818565,0.805907,0.813200,0.005344,2
24,0.087956,0.022612,0.091001,0.016545,13,1,uniform,"{'algo__n_neighbors': 13, 'algo__p': 1, 'algo_...",0.819328,0.810127,0.810127,0.813194,0.004337,3
28,0.087066,0.013919,0.125875,0.031031,15,1,uniform,"{'algo__n_neighbors': 15, 'algo__p': 1, 'algo_...",0.819328,0.810127,0.810127,0.813194,0.004337,3
38,0.065475,0.007606,0.093661,0.012524,19,2,uniform,"{'algo__n_neighbors': 19, 'algo__p': 2, 'algo_...",0.815126,0.814346,0.805907,0.811793,0.004174,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0.076801,0.029050,0.046349,0.012769,3,1,distance,"{'algo__n_neighbors': 3, 'algo__p': 1, 'algo__...",0.760504,0.734177,0.751055,0.748579,0.010890,95
3,0.074010,0.009046,0.043081,0.013710,1,2,distance,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.752101,0.683544,0.772152,0.735932,0.037938,97
2,0.073930,0.011364,0.090498,0.017349,1,2,uniform,"{'algo__n_neighbors': 1, 'algo__p': 2, 'algo__...",0.752101,0.683544,0.772152,0.735932,0.037938,97
1,0.064949,0.004673,0.039343,0.007967,1,1,distance,"{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__...",0.747899,0.683544,0.763713,0.731719,0.034671,99


In [40]:
# Hyperparameter combination that achieved the best mean cross-validation score.
model.best_params_

{'algo__n_neighbors': 21, 'algo__p': 1, 'algo__weights': 'uniform'}

In [41]:
# Mean cross-validated score of that best combination.
model.best_score_

0.8146060111808436

In [42]:
# Score of the best pipeline found by the search on the held-out test split.
model.score(x_test, y_test)

0.7821229050279329