# Local training

In [39]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

## Load the dataset

In [40]:
# Read the raw mushroom records from the CSV that sits next to the notebook directory
DATASET_PATH = "../data/mushrooms.csv"
df = pd.read_csv(DATASET_PATH)

In [41]:
# Preview the loaded frame; the notebook renders a truncated view (leading/trailing rows, elided middle columns)
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [42]:
# Integer-encode every column of `df` (the "class" target included, since the
# loop runs over all columns) with its own LabelEncoder, and keep each fitted
# encoder keyed by column name so the same mapping can be reused later.
# NOTE: this overwrites the columns of `df` in place. Re-running this cell
# without reloading the CSV would re-fit the encoders on already-encoded
# integers and replace the original category mappings.
label_encoders = {}
for col in df.columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [43]:
# Persist the fitted per-column encoders. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open('label_encoders.pkl', 'wb') as encoders_file:
    pickle.dump(label_encoders, encoders_file)

In [44]:
# Separate the target column from the feature columns
y = df["class"]
X = df.drop(columns=["class"])

## Train-test split

In [45]:
# Hold out 20% of the rows for the final test; the fixed seed keeps the split
# identical across runs. The features come from `X` defined in the previous
# cell (the notebook never defines `X_encoded`, so the old name failed on a
# fresh kernel).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Ensemble classifier

In [46]:
# Voting ensemble over three different model families. The estimator names
# ('mnb', 'svc', 'rf') are the prefixes used to address their parameters in the
# grid search. The random forest is seeded so repeated runs build the same trees.
ensamble = VotingClassifier(estimators=[
    ('mnb', MultinomialNB()),
    ('svc', SVC()),
    ('rf', RandomForestClassifier(random_state=42))
])

## Optimize parameters

In [47]:
# Candidate hyper-parameters, addressed as <estimator name>__<parameter> so
# they are routed to the matching member of the voting ensemble.
param_grid = {
    'mnb__alpha': [0.1, 1, 2],
    'svc__C': [0.1, 1, 10],
    'svc__class_weight': ['balanced'],
    'rf__n_estimators': [10, 100],
    'rf__criterion': ['gini', 'entropy'],
}

# Exhaustive search over the grid with 5-fold cross-validation, ranked by macro-averaged F1
cls = GridSearchCV(
    estimator=ensamble,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)

In [48]:
# Run the cross-validated grid search on the training split only
cls.fit(X_train, y_train)
# Show the parameter combination the search selected as best
cls.best_params_

{'mnb__alpha': 0.1,
 'rf__criterion': 'gini',
 'rf__n_estimators': 10,
 'svc__C': 10,
 'svc__class_weight': 'balanced'}

## Evaluation metrics

In [49]:
# Best cross-validation score found by the search on the training split
validation_score = cls.best_score_
print('Validation score', validation_score)

# Score of the selected model on the held-out test split
test_score = cls.score(X_test, y_test)
print('Test score', test_score)

Validation score 1.0
Test score 1.0


## Save the model

In [50]:
# Persist the fitted grid-search object. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(cls, model_file)