In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
data1 = pd.read_csv('Pokemon.csv')
data1.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
data1.dropna(inplace=True)

In [4]:
data1.columns

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

In [5]:
X1 = data1[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]
y1 = data1['Generation']

In [6]:
# Split

X_train1, X_test1, y_train1, y_tes1 = train_test_split(X1, y1, test_size = 0.2, random_state = 0)

In [7]:
data = pd.read_csv('penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [8]:
data.dropna(inplace=True)

In [9]:
X = data.drop(['species'], axis = 1)
y = data['species']

In [10]:
# Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
numcols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
catcols = ['island','sex']

In [12]:
pre1 = ColumnTransformer([
    ('numcol', MinMaxScaler(), ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'])
])

In [13]:
pre = ColumnTransformer([
    ('numcol', MinMaxScaler(), numcols),
    ('catcol', OneHotEncoder(), catcols)
])

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


pipe = Pipeline([
                 ('proses1', pre1),
                 ('proses2', LogisticRegression())
                 ])

In [15]:
from sklearn.model_selection import cross_val_score
import numpy as np

np.mean(cross_val_score(pipe, X_train1, y_train1, cv = 10)),

(0.2235294117647059,)

In [16]:
np.std(cross_val_score(pipe, X_train1, y_train1, cv = 10)),

(0.06368502760385558,)

In [17]:
# np.mean(cross_val_score(pipe, X_train, y_train, cv = 2))
np.mean(cross_val_score(pipe, X_train1, y_train1, cv = 10))

0.2235294117647059

In [18]:
pipe2 = Pipeline([
                 ('proses1', pre1),
                 ('proses2', SVC())
                 ])

In [19]:
np.mean(cross_val_score(pipe2, X_train1, y_train1, cv = 10))

0.24741532976827094

In [20]:
pipe.fit(X_train1, y_train1)

In [21]:
y_pred_train =  pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

KeyError: "None of [Index(['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.metrics import classification_report


print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00       109
   Chinstrap       1.00      1.00      1.00        59
      Gentoo       1.00      1.00      1.00        99

    accuracy                           1.00       267
   macro avg       1.00      1.00      1.00       267
weighted avg       1.00      1.00      1.00       267



In [None]:
from sklearn.linear_model import LogisticRegression


paramgrid = [{'proses2' : [SVC()], 
              'proses2__kernel' : ['rbf', 'poly'], 
              'proses2__C' : [1, 10, 100]},
             {'proses2' : [DecisionTreeClassifier()], 
              'proses2__max_depth' : [1, 10]},
             {'proses2' : [LogisticRegression()],
              'proses2__C' : [1, 10], 
              'proses2__max_iter' : [10, 20]}]

In [None]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid = paramgrid, cv = 5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

{'proses2': SVC(C=1), 'proses2__C': 1, 'proses2__kernel': 'rbf'}

In [None]:
y_pred_train1 = grid.best_estimator_.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred_train1))

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00       109
   Chinstrap       1.00      1.00      1.00        59
      Gentoo       1.00      1.00      1.00        99

    accuracy                           1.00       267
   macro avg       1.00      1.00      1.00       267
weighted avg       1.00      1.00      1.00       267

