# Local training

In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'pandas'

### Load the data

In [2]:
zoo = pd.read_csv("../data/zoo.csv")
zoo

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,Mammal
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,Fish
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,Mammal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,spider,0,0,1,0,0,0,1,1,0,1,1,0,8,0,0,0,Invertebrate
210,snail,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,Invertebrate
211,silkworm,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,Invertebrate
212,jellyfish,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,Invertebrate


### Train/test split

In [3]:
X = zoo.iloc[:,1:-1]
y = zoo.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Create an ensamble classifier

In [4]:
ensamble = VotingClassifier(estimators=[
    ('mnb', MultinomialNB()),
    ('svc', SVC()),
    ('rf', RandomForestClassifier())
])

### Create a pipeline

In [5]:
pipe = Pipeline([
    ('encoder', None), 
    ('classifier', ensamble),
])

### Optimize parameters

In [6]:
cls = GridSearchCV(
    pipe, 
    {
        'encoder': [
            None,
            OneHotEncoder(handle_unknown='ignore')
        ],
        'classifier__mnb__alpha': [0.1, 1, 2],
        'classifier__svc__C': [0.1, 1, 10],
        'classifier__svc__class_weight': ['balanced'],
        'classifier__rf__n_estimators': [10, 100],
        'classifier__rf__criterion': ['gini', 'entropy'],
    }, 
    cv=5, 
    scoring='f1_macro'
)

In [7]:
cls.fit(X_train, y_train)
cls.best_params_

{'classifier__mnb__alpha': 1,
 'classifier__rf__criterion': 'entropy',
 'classifier__rf__n_estimators': 100,
 'classifier__svc__C': 1,
 'classifier__svc__class_weight': 'balanced',
 'encoder': OneHotEncoder(handle_unknown='ignore')}

### Print evaluation metrics

In [8]:
print('Validation score', cls.best_score_)
print('Test score', cls.score(X_test, y_test))

Validation score 0.9686167800453515
Test score 0.8598901098901098


### Save the model

In [9]:
pickle.dump(cls, open('model.pkl', 'wb'))