In [9]:
import pandas as pd
import sqlite3
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from tempfile import mkdtemp
import pickle
from model_utils import prepare_df
from flat_classifier import FlatClassifier

In [2]:
# Load the complete `messages` table from the project's SQLite file into a
# DataFrame.
# try/finally guarantees the connection is closed even when the query raises
# (missing table, unreadable file, ...), so a failed run of this cell does not
# leave an open handle behind in the kernel.
conn = sqlite3.connect('../data/data_for_ml.db')
try:
    df = pd.read_sql('SELECT * FROM messages', con=conn)
finally:
    conn.close()

In [3]:
# Produce the training frame, the test frame and the label column index from
# the raw messages via the project helper `prepare_df`.
# NOTE(review): how the split is made (ratio, seed, stratification) and any
# text preprocessing happen inside model_utils.prepare_df and are not visible
# here — confirm there.
df_train, df_test, cols = prepare_df(df)

In [4]:
# Display the column names that are used as classification targets below.
cols

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'water', 'food', 'shelter', 'clothing', 'money', 'missing_people',
       'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

In [16]:
# Scratch directories handed to the pipelines' `memory` argument so fitted
# transformers can be cached on disk and reused between fits.
# NOTE(review): mkdtemp() never removes what it creates — delete these
# directories (e.g. with shutil.rmtree) once training is finished.
vec_cachedir = mkdtemp()
pipe_cachedir = mkdtemp()

# Bag-of-words features for the `message` column.
vec_transformer = Pipeline([
    # Tokens are obtained by plain whitespace splitting with case preserved;
    # token_pattern=None because a custom tokenizer is supplied, and tokens
    # seen in fewer than 10 documents are discarded (min_df=10).
    # NOTE(review): presumably the text is already normalised upstream —
    # confirm in model_utils.prepare_df.
    ('vec', CountVectorizer(tokenizer=str.split,
                            token_pattern=None,
                            lowercase=False,
                            min_df=10)),
    # No-op slot; presumably kept so a TF-IDF step can be swapped in through
    # a parameter grid — TODO confirm intent.
    ('tfidf', 'passthrough')
], memory = vec_cachedir)

# Only `message` feeds the model; `original` is dropped explicitly.
preprocessor = ColumnTransformer(
    transformers = [
        ('message', vec_transformer, 'message'),
        ('original', 'drop', 'original')
    ]
)

# Full model: vectorised text -> class-weighted random forest.
# random_state is fixed (same seed as the CV splitter used for the search) so
# that the grid-search scores and the persisted model are reproducible across
# kernel restarts; without it every run grows different trees.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(class_weight='balanced',
                                   n_jobs=2,
                                   random_state=42))
], memory = pipe_cachedir)

In [17]:
# Search space for the model selection step: only the forest's minimum leaf
# size is tuned. The `clf__` prefix addresses the pipeline step named 'clf'.
param_grid = dict(clf__min_samples_leaf=[1, 2, 3, 5])

In [18]:
# Wrap the pipeline and the label columns in the project's FlatClassifier.
# NOTE(review): judging by the "doing <label>" lines in the fit log it appears
# to train one model per label — confirm in flat_classifier.py.
fclf = FlatClassifier(pipeline, cols)

In [19]:
# Fit on the training frame, using the label columns as the target matrix.
# Model selection is delegated to GridSearchCV: three metrics are recorded per
# fold, and the candidate with the best 'f1' is the one that gets refit.
# The splitter is seeded so the folds are the same on every run.
fclf.fit(
    df_train,
    df_train[cols].values,
    selector=GridSearchCV,
    selector_params=dict(
        param_grid=param_grid,
        scoring=['f1_macro', 'accuracy', 'f1'],
        refit='f1',
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        verbose=3,
    ),
)

doing related
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.820) f1: (test=0.887) f1_macro: (test=0.719) total time=  10.6s
[CV 2/5] END clf__min_samples_leaf=1; accuracy: (test=0.824) f1: (test=0.890) f1_macro: (test=0.721) total time=   9.1s
[CV 3/5] END clf__min_samples_leaf=1; accuracy: (test=0.815) f1: (test=0.885) f1_macro: (test=0.707) total time=   9.0s
[CV 4/5] END clf__min_samples_leaf=1; accuracy: (test=0.822) f1: (test=0.889) f1_macro: (test=0.721) total time=  10.2s
[CV 5/5] END clf__min_samples_leaf=1; accuracy: (test=0.823) f1: (test=0.889) f1_macro: (test=0.724) total time=  10.4s
[CV 1/5] END clf__min_samples_leaf=2; accuracy: (test=0.815) f1: (test=0.876) f1_macro: (test=0.758) total time=   4.0s
[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.820) f1: (test=0.881) f1_macro: (test=0.757) total time=   4.2s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.814) f1: (test=0.876) f1_macro

[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.780) f1: (test=0.737) f1_macro: (test=0.774) total time=   5.9s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.780) f1: (test=0.737) f1_macro: (test=0.774) total time=   6.8s
[CV 4/5] END clf__min_samples_leaf=2; accuracy: (test=0.780) f1: (test=0.736) f1_macro: (test=0.774) total time=   7.9s
[CV 5/5] END clf__min_samples_leaf=2; accuracy: (test=0.779) f1: (test=0.740) f1_macro: (test=0.774) total time=   6.5s
[CV 1/5] END clf__min_samples_leaf=3; accuracy: (test=0.776) f1: (test=0.737) f1_macro: (test=0.771) total time=   4.6s
[CV 2/5] END clf__min_samples_leaf=3; accuracy: (test=0.773) f1: (test=0.730) f1_macro: (test=0.767) total time=   4.6s
[CV 3/5] END clf__min_samples_leaf=3; accuracy: (test=0.775) f1: (test=0.736) f1_macro: (test=0.770) total time=   4.3s
[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.774) f1: (test=0.732) f1_macro: (test=0.768) total time=   4.4s
[CV 5/5] END clf__min_samples_leaf=3; ac

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


doing medical_help
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.925) f1: (test=0.194) f1_macro: (test=0.577) total time=   9.0s
[CV 2/5] END clf__min_samples_leaf=1; accuracy: (test=0.924) f1: (test=0.159) f1_macro: (test=0.559) total time=   8.2s
[CV 3/5] END clf__min_samples_leaf=1; accuracy: (test=0.922) f1: (test=0.133) f1_macro: (test=0.546) total time=   8.0s
[CV 4/5] END clf__min_samples_leaf=1; accuracy: (test=0.926) f1: (test=0.148) f1_macro: (test=0.555) total time=   7.9s
[CV 5/5] END clf__min_samples_leaf=1; accuracy: (test=0.926) f1: (test=0.162) f1_macro: (test=0.561) total time=   8.7s
[CV 1/5] END clf__min_samples_leaf=2; accuracy: (test=0.919) f1: (test=0.500) f1_macro: (test=0.728) total time=   4.1s
[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.915) f1: (test=0.433) f1_macro: (test=0.693) total time=   4.4s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.921) f1: (test=0.491) f1_

[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.981) f1: (test=0.024) f1_macro: (test=0.507) total time=   2.4s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.981) f1: (test=0.071) f1_macro: (test=0.531) total time=   2.4s
[CV 4/5] END clf__min_samples_leaf=2; accuracy: (test=0.981) f1: (test=0.114) f1_macro: (test=0.552) total time=   2.4s
[CV 5/5] END clf__min_samples_leaf=2; accuracy: (test=0.980) f1: (test=0.125) f1_macro: (test=0.557) total time=   2.4s
[CV 1/5] END clf__min_samples_leaf=3; accuracy: (test=0.978) f1: (test=0.167) f1_macro: (test=0.578) total time=   2.2s
[CV 2/5] END clf__min_samples_leaf=3; accuracy: (test=0.978) f1: (test=0.078) f1_macro: (test=0.534) total time=   2.1s
[CV 3/5] END clf__min_samples_leaf=3; accuracy: (test=0.977) f1: (test=0.210) f1_macro: (test=0.599) total time=   2.2s
[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.978) f1: (test=0.162) f1_macro: (test=0.575) total time=   2.0s
[CV 5/5] END clf__min_samples_leaf=3; ac

[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.934) f1: (test=0.747) f1_macro: (test=0.855) total time=   2.7s
[CV 5/5] END clf__min_samples_leaf=3; accuracy: (test=0.939) f1: (test=0.754) f1_macro: (test=0.860) total time=   2.5s
[CV 1/5] END clf__min_samples_leaf=5; accuracy: (test=0.919) f1: (test=0.707) f1_macro: (test=0.830) total time=   2.3s
[CV 2/5] END clf__min_samples_leaf=5; accuracy: (test=0.921) f1: (test=0.718) f1_macro: (test=0.836) total time=   2.3s
[CV 3/5] END clf__min_samples_leaf=5; accuracy: (test=0.934) f1: (test=0.751) f1_macro: (test=0.857) total time=   2.1s
[CV 4/5] END clf__min_samples_leaf=5; accuracy: (test=0.919) f1: (test=0.709) f1_macro: (test=0.831) total time=   2.3s
[CV 5/5] END clf__min_samples_leaf=5; accuracy: (test=0.920) f1: (test=0.701) f1_macro: (test=0.827) total time=   2.2s
doing shelter
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.938) f1: (test=0.492) f1_macro

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


doing clothing
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.986) f1: (test=0.213) f1_macro: (test=0.603) total time=   3.7s
[CV 2/5] END clf__min_samples_leaf=1; accuracy: (test=0.985) f1: (test=0.187) f1_macro: (test=0.590) total time=   3.5s
[CV 3/5] END clf__min_samples_leaf=1; accuracy: (test=0.984) f1: (test=0.058) f1_macro: (test=0.525) total time=   3.7s
[CV 4/5] END clf__min_samples_leaf=1; accuracy: (test=0.985) f1: (test=0.182) f1_macro: (test=0.587) total time=   4.2s
[CV 5/5] END clf__min_samples_leaf=1; accuracy: (test=0.986) f1: (test=0.296) f1_macro: (test=0.645) total time=   3.6s
[CV 1/5] END clf__min_samples_leaf=2; accuracy: (test=0.988) f1: (test=0.609) f1_macro: (test=0.802) total time=   1.9s
[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.988) f1: (test=0.602) f1_macro: (test=0.798) total time=   1.8s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.988) f1: (test=0.559) f1_macr

[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.970) f1: (test=0.398) f1_macro: (test=0.691) total time=   2.5s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.969) f1: (test=0.434) f1_macro: (test=0.709) total time=   2.5s
[CV 4/5] END clf__min_samples_leaf=2; accuracy: (test=0.966) f1: (test=0.311) f1_macro: (test=0.647) total time=   2.5s
[CV 5/5] END clf__min_samples_leaf=2; accuracy: (test=0.966) f1: (test=0.375) f1_macro: (test=0.679) total time=   2.5s
[CV 1/5] END clf__min_samples_leaf=3; accuracy: (test=0.962) f1: (test=0.406) f1_macro: (test=0.693) total time=   2.2s
[CV 2/5] END clf__min_samples_leaf=3; accuracy: (test=0.965) f1: (test=0.464) f1_macro: (test=0.723) total time=   2.2s
[CV 3/5] END clf__min_samples_leaf=3; accuracy: (test=0.963) f1: (test=0.456) f1_macro: (test=0.718) total time=   2.2s
[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.961) f1: (test=0.415) f1_macro: (test=0.697) total time=   2.1s
[CV 5/5] END clf__min_samples_leaf=3; ac

[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.910) f1: (test=0.325) f1_macro: (test=0.638) total time=   2.6s
[CV 5/5] END clf__min_samples_leaf=3; accuracy: (test=0.904) f1: (test=0.291) f1_macro: (test=0.620) total time=   2.7s
[CV 1/5] END clf__min_samples_leaf=5; accuracy: (test=0.872) f1: (test=0.310) f1_macro: (test=0.620) total time=   2.1s
[CV 2/5] END clf__min_samples_leaf=5; accuracy: (test=0.878) f1: (test=0.324) f1_macro: (test=0.629) total time=   2.2s
[CV 3/5] END clf__min_samples_leaf=5; accuracy: (test=0.869) f1: (test=0.306) f1_macro: (test=0.617) total time=   2.1s
[CV 4/5] END clf__min_samples_leaf=5; accuracy: (test=0.883) f1: (test=0.339) f1_macro: (test=0.637) total time=   2.1s
[CV 5/5] END clf__min_samples_leaf=5; accuracy: (test=0.876) f1: (test=0.303) f1_macro: (test=0.617) total time=   2.2s
doing transport
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.955) f1: (test=0.069) f1_mac

doing tools
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.499) total time=   2.9s
[CV 2/5] END clf__min_samples_leaf=1; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.498) total time=   2.8s
[CV 3/5] END clf__min_samples_leaf=1; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.498) total time=   2.8s
[CV 4/5] END clf__min_samples_leaf=1; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.498) total time=   2.8s
[CV 5/5] END clf__min_samples_leaf=1; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.498) total time=   2.8s
[CV 1/5] END clf__min_samples_leaf=2; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.499) total time=   1.6s
[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.994) f1: (test=0.000) f1_macro: (test=0.498) total time=   1.5s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.994) f1: (test=0.000) f1_macro: 

[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.988) f1: (test=0.039) f1_macro: (test=0.517) total time=   1.9s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.989) f1: (test=0.078) f1_macro: (test=0.536) total time=   1.8s
[CV 4/5] END clf__min_samples_leaf=2; accuracy: (test=0.989) f1: (test=0.040) f1_macro: (test=0.517) total time=   1.8s
[CV 5/5] END clf__min_samples_leaf=2; accuracy: (test=0.988) f1: (test=0.038) f1_macro: (test=0.516) total time=   1.8s
[CV 1/5] END clf__min_samples_leaf=3; accuracy: (test=0.987) f1: (test=0.203) f1_macro: (test=0.598) total time=   1.7s
[CV 2/5] END clf__min_samples_leaf=3; accuracy: (test=0.987) f1: (test=0.131) f1_macro: (test=0.562) total time=   1.7s
[CV 3/5] END clf__min_samples_leaf=3; accuracy: (test=0.988) f1: (test=0.161) f1_macro: (test=0.578) total time=   1.7s
[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.987) f1: (test=0.182) f1_macro: (test=0.588) total time=   1.7s
[CV 5/5] END clf__min_samples_leaf=3; ac

[CV 4/5] END clf__min_samples_leaf=3; accuracy: (test=0.937) f1: (test=0.636) f1_macro: (test=0.801) total time=   2.6s
[CV 5/5] END clf__min_samples_leaf=3; accuracy: (test=0.939) f1: (test=0.650) f1_macro: (test=0.808) total time=   2.6s
[CV 1/5] END clf__min_samples_leaf=5; accuracy: (test=0.908) f1: (test=0.559) f1_macro: (test=0.754) total time=   2.2s
[CV 2/5] END clf__min_samples_leaf=5; accuracy: (test=0.918) f1: (test=0.590) f1_macro: (test=0.772) total time=   2.2s
[CV 3/5] END clf__min_samples_leaf=5; accuracy: (test=0.920) f1: (test=0.600) f1_macro: (test=0.778) total time=   2.2s
[CV 4/5] END clf__min_samples_leaf=5; accuracy: (test=0.917) f1: (test=0.579) f1_macro: (test=0.766) total time=   2.2s
[CV 5/5] END clf__min_samples_leaf=5; accuracy: (test=0.917) f1: (test=0.590) f1_macro: (test=0.772) total time=   2.2s
doing storm
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.933) f1: (test=0.575) f1_macro: 

doing cold
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END clf__min_samples_leaf=1; accuracy: (test=0.980) f1: (test=0.023) f1_macro: (test=0.507) total time=   3.8s
[CV 2/5] END clf__min_samples_leaf=1; accuracy: (test=0.980) f1: (test=0.024) f1_macro: (test=0.507) total time=   3.6s
[CV 3/5] END clf__min_samples_leaf=1; accuracy: (test=0.980) f1: (test=0.066) f1_macro: (test=0.528) total time=   3.6s
[CV 4/5] END clf__min_samples_leaf=1; accuracy: (test=0.980) f1: (test=0.000) f1_macro: (test=0.495) total time=   3.7s
[CV 5/5] END clf__min_samples_leaf=1; accuracy: (test=0.981) f1: (test=0.130) f1_macro: (test=0.560) total time=   3.7s
[CV 1/5] END clf__min_samples_leaf=2; accuracy: (test=0.985) f1: (test=0.553) f1_macro: (test=0.773) total time=   2.0s
[CV 2/5] END clf__min_samples_leaf=2; accuracy: (test=0.981) f1: (test=0.487) f1_macro: (test=0.739) total time=   2.0s
[CV 3/5] END clf__min_samples_leaf=2; accuracy: (test=0.983) f1: (test=0.551) f1_macro: (

In [20]:
# Persist the fitted classifier next to the notebook so it can be reloaded
# without repeating the search.
# NOTE: only unpickle this file from a trusted location — loading a pickle
# executes arbitrary code.
with open('model_individual.pickle', mode='wb') as model_file:
    pickle.dump(fclf, model_file)

In [21]:
# Macro-averaged F1 over all labels on the held-out test frame.
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
# Per-label F1 is symmetric in false positives/negatives, so the value itself
# does not change, but the correct order keeps undefined-metric warnings
# meaningful and stays right if the metric is swapped for precision/recall.
f1_score(df_test[cols].values, fclf.predict(df_test), average='macro')

0.48305306281277205