# ML Pipeline Preparation
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database
- Define feature and target variables X and Y

In [1]:
# import libraries and set configurations
from IPython.display import display
import tqdm
import datetime
import pandas as pd
import numpy as np
import re

from sqlalchemy import create_engine

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from joblib import dump, load

from xgboost import XGBClassifier

import optuna

nltk.download(['punkt','stopwords'])
pd.set_option('display.max_columns',40)

[nltk_data] Downloading package punkt to /Users/mccunha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mccunha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///../data/DisasterResponse.db')

df = pd.read_sql('disaster_data', engine)

display(df.head())

def highlight_imbalanced(col):
    distance_perfect_distribution = np.abs(col - (1/len(col)))
    return ['background-color: red' if len(col)<2 else 'background-color: yellow' if dist > 0.1 else '' for dist in distance_perfect_distribution]


print('Viewing distributions of values per label (class imbalances are highlighted):\n')
df_X = df.iloc[:,4:]
for col in df_X:
    ct = pd.crosstab(index=df_X[col], columns='%freq', normalize='columns')
    display(ct.style.apply(highlight_imbalanced))
    if len(ct) < 2:
        print('Column with less than 2 values!')

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Viewing distributions of values per label (class imbalances are highlighted):



col_0,%freq
related,Unnamed: 1_level_1
0,0.233522
1,0.759307
2,0.007171


col_0,%freq
request,Unnamed: 1_level_1
0,0.829341
1,0.170659


col_0,%freq
offer,Unnamed: 1_level_1
0,0.995499
1,0.004501


col_0,%freq
aid_related,Unnamed: 1_level_1
0,0.585749
1,0.414251


col_0,%freq
medical_help,Unnamed: 1_level_1
0,0.920507
1,0.079493


col_0,%freq
medical_products,Unnamed: 1_level_1
0,0.949916
1,0.050084


col_0,%freq
search_and_rescue,Unnamed: 1_level_1
0,0.972383
1,0.027617


col_0,%freq
security,Unnamed: 1_level_1
0,0.982034
1,0.017966


col_0,%freq
military,Unnamed: 1_level_1
0,0.967196
1,0.032804


col_0,%freq
child_alone,Unnamed: 1_level_1
0,1.0


Column with less than 2 values!


col_0,%freq
water,Unnamed: 1_level_1
0,0.936222
1,0.063778


col_0,%freq
food,Unnamed: 1_level_1
0,0.888503
1,0.111497


col_0,%freq
shelter,Unnamed: 1_level_1
0,0.911733
1,0.088267


col_0,%freq
clothing,Unnamed: 1_level_1
0,0.984551
1,0.015449


col_0,%freq
money,Unnamed: 1_level_1
0,0.976961
1,0.023039


col_0,%freq
missing_people,Unnamed: 1_level_1
0,0.988633
1,0.011367


col_0,%freq
refugees,Unnamed: 1_level_1
0,0.966623
1,0.033377


col_0,%freq
death,Unnamed: 1_level_1
0,0.954455
1,0.045545


col_0,%freq
other_aid,Unnamed: 1_level_1
0,0.868554
1,0.131446


col_0,%freq
infrastructure_related,Unnamed: 1_level_1
0,0.934963
1,0.065037


col_0,%freq
transport,Unnamed: 1_level_1
0,0.954188
1,0.045812


col_0,%freq
buildings,Unnamed: 1_level_1
0,0.949153
1,0.050847


col_0,%freq
electricity,Unnamed: 1_level_1
0,0.979707
1,0.020293


col_0,%freq
tools,Unnamed: 1_level_1
0,0.993935
1,0.006065


col_0,%freq
hospitals,Unnamed: 1_level_1
0,0.989205
1,0.010795


col_0,%freq
shops,Unnamed: 1_level_1
0,0.995423
1,0.004577


col_0,%freq
aid_centers,Unnamed: 1_level_1
0,0.988213
1,0.011787


col_0,%freq
other_infrastructure,Unnamed: 1_level_1
0,0.956096
1,0.043904


col_0,%freq
weather_related,Unnamed: 1_level_1
0,0.721659
1,0.278341


col_0,%freq
floods,Unnamed: 1_level_1
0,0.917798
1,0.082202


col_0,%freq
storm,Unnamed: 1_level_1
0,0.906813
1,0.093187


col_0,%freq
fire,Unnamed: 1_level_1
0,0.989243
1,0.010757


col_0,%freq
earthquake,Unnamed: 1_level_1
0,0.906355
1,0.093645


col_0,%freq
cold,Unnamed: 1_level_1
0,0.979783
1,0.020217


col_0,%freq
other_weather,Unnamed: 1_level_1
0,0.947513
1,0.052487


col_0,%freq
direct_report,Unnamed: 1_level_1
0,0.806416
1,0.193584


As shown above, almost all classes are imbalanced.

We highlighted the labels that were considered imbalanced (with skewed distributions). Labels are considered imbalanced here when their values have a distance from the perfect balance that is higher than 10%. For example, a label that has 3 possible values (0,1,2) has a perfect balance of 33.33%. If a value occurs more than 43.33% or less than 23.33% of the time, than the label is considered imbalanced.

`child_alone` is always 0. Therefore we could choose not to predict this column (always predict 0 for example without an ML model). For this reason we choose to drop this column.

In [3]:
# Drop `child_alone` column
df.drop('child_alone', axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
print('Top-10 Labels that appear the most (values are 1 or 2):')
most_freq_categories = df.iloc[:,4:].sum(axis=0).sort_values(ascending=False)[:10].index.values; most_freq_categories

Top-10 Labels that appear the most (values are 1 or 2):


array(['related', 'aid_related', 'weather_related', 'direct_report',
       'request', 'other_aid', 'food', 'earthquake', 'storm', 'shelter'],
      dtype=object)

In [6]:
df_metadata = pd.read_csv('../data/metadata.csv', sep=';')

In [7]:
df_metadata

Unnamed: 0,Column,Description
0,id,Unique ID for each individual row
1,message,English text of actual messages related to dis...
2,original,Text of column 3 in native language as origina...
3,genre,"Type of message, including direct messages, so..."
4,related,"Is the message disaster related? 1= yes, 2=no,..."
5,request,"Does the message contain a request? 1= yes, 2=no"
6,offer,"Does the message contain an offer? 1= yes, 2=no"
7,aid_related,"Is the message aid related? 1= yes, 2=no"
8,medical_help,"Does the message concern medical help? 1= yes,..."
9,medical_products,Does the message concern medical products? 1= ...


In [8]:
X = df.iloc[:,1].values
Y = df.iloc[:,4:].values

In [9]:
X[:5]

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name',
       'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
       'says: west side of Haiti, rest of the country today and tonight'],
      dtype=object)

In [10]:
Y[:5,:]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [11]:
column_names = df.columns; column_names

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

### 2. Write a tokenization function to process your text data

In [12]:
message = X[0]; message

'Weather update - a cold front from Cuba that could pass over Haiti'

In [13]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    
    # remove leading and trailing spaces before lemmatizing
    tokens = [lemmatizer.lemmatize(word.strip()) for word in tokens if word not in stop_words]  

    return tokens

In [14]:
tokenize(message)

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

### 3. Build a machine learning pipeline
- Use MultiOutputClassifier to for predict multiple target variables.

In [15]:
pipeline_gbc = Pipeline([
                    ('features', TfidfVectorizer(tokenizer=tokenize)),
                    ('clf', MultiOutputClassifier(GradientBoostingClassifier()))
               ],
               verbose=True)

In [16]:
pipeline_gbc.get_params()

{'memory': None,
 'steps': [('features',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x1a27a9dcb0>, use_idf=True,
                   vocabulary=None)),
  ('clf',
   MultiOutputClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                              init=None,
                                                              learning_rate=0.1,
                                                              loss='deviance',
                                                            

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [17]:
# random_state to make it easier to reproduce
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1000)

In [18]:
#%%prun #%%mrun
pipeline_gbc.fit(X_train,Y_train)

In [19]:
dump(pipeline_gbc, f'../flask_app/models/pipeline_gbc-{datetime.date.today()}.joblib')

In [21]:
pipeline_gbc = load('../flask_app/models/pipeline_gbc.joblib')

In [22]:
pipeline_gbc

Pipeline(memory=None,
         steps=[('features',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patter...
                                                                            loss='deviance',
                                                                            max_depth=3,
                                                                            max_features=None,
                         

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [23]:
Y_preds_gbc = pipeline_gbc.predict(X_test)

In [24]:
def create_reports(clfs_names, Y_test, Y_preds, column_names, verbose=True, df_reports=None):
    """
    Function for printing classification reports for each label (column) and generating a Pandas DataFrame with metrics for the whole model averaging labels. 
    
    clfs_names : array of strings, shape = [n_classifiers]
    Names of each classifier
    
    Y_test : 2d array-like
    Ground truth (correct) target values. Each column is a label.

    Y_preds : n x 2d array-like
    Estimated targets as returned by a classifier. List of predictions for n classifiers.
    
    column_names: list of column names
    
    df_reports: Pandas DataFrame of the metrics for the whole multi-label model.
    If no DataFrame report if provided, create a new one. If it is provided, append metrics for new models.
    
    verbose: bool, default: True
    When set to True, prints classification_report for each column.
    """
    if df_reports is None:
        df_reports = pd.DataFrame(columns=['mean accuracy','mean macro avg f1-score', 'mean weighted avg f1-score'])
    
    for clf_name, Y_preds in zip(clfs_names,Y_preds):#,Y_preds_hgbc,Y_preds_rfc]):
        clf_metrics = pd.DataFrame()
        if verbose:
            print(f'Metrics for each feature for model - {clf_name}\n')
        for col in range(Y_preds.shape[1]):
            report = classification_report(Y_test[:,col], Y_preds[:,col], output_dict=True)
            # model_reports[clf_name] = [report['accuracy'], report['macro avg']['f1-score'], report['weighted avg']['f1-score']]
            label_metrics = pd.DataFrame(data=[[report['accuracy'], report['macro avg']['f1-score'], report['weighted avg']['f1-score']]])
            clf_metrics = pd.concat([clf_metrics, label_metrics], axis=0)
            if verbose:
                print('Column:', column_names[col])
                print(classification_report(Y_test[:,col], Y_preds[:,col]),'\n   -----------------------------------------------\n')
        clf_metrics = clf_metrics.mean(axis=0).to_frame().transpose().rename(index={0:clf_name}, columns={0:'mean accuracy',1:'mean macro avg f1-score',2:'mean weighted avg f1-score'})
        df_reports = pd.concat([df_reports, clf_metrics],axis=0)
        
    return df_reports


In [25]:
df_reports = create_reports(['gbc'], Y_test, [Y_preds_gbc], column_names)

Metrics for each feature for model - gbc

Column: id
              precision    recall  f1-score   support

           0       0.72      0.19      0.30      2046
           1       0.79      0.97      0.87      6556
           2       0.31      0.22      0.26        50

    accuracy                           0.78      8652
   macro avg       0.61      0.46      0.48      8652
weighted avg       0.77      0.78      0.73      8652
 
   -----------------------------------------------

Column: message
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      7201
           1       0.82      0.48      0.61      1451

    accuracy                           0.90      8652
   macro avg       0.86      0.73      0.77      8652
weighted avg       0.89      0.90      0.88      8652
 
   -----------------------------------------------

Column: original
              precision    recall  f1-score   support

           0       0.99      1.00      1.00  

 
   -----------------------------------------------

Column: other_aid
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      8476
           1       0.39      0.24      0.30       176

    accuracy                           0.98      8652
   macro avg       0.69      0.62      0.64      8652
weighted avg       0.97      0.98      0.97      8652
 
   -----------------------------------------------

Column: infrastructure_related
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8597
           1       0.07      0.05      0.06        55

    accuracy                           0.99      8652
   macro avg       0.53      0.53      0.53      8652
weighted avg       0.99      0.99      0.99      8652
 
   -----------------------------------------------

Column: transport
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8563
          

In [26]:
print('Mean metrics for all outputs of a model:')
df_reports

Mean metrics for all outputs of a model:


Unnamed: 0,mean accuracy,mean macro avg f1-score,mean weighted avg f1-score
gbc,0.94613,0.6665,0.938002


### 6. Improve your model
Use grid search to find better parameters. 

In [27]:
pipeline_gbc.get_params()

{'memory': None,
 'steps': [('features',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x1a27a9dcb0>, use_idf=True,
                   vocabulary=None)),
  ('clf',
   MultiOutputClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                              init=None,
                                                              learning_rate=0.1,
                                                              loss='deviance',
                                                            

In [28]:
parameters = {
    'clf__estimator__learning_rate': [0.03, 0.3],
    'clf__estimator__n_estimators': [100, 200],
    'clf__estimator__max_depth': [2,4]
}

cv_gbc = GridSearchCV(pipeline_gbc, param_grid=parameters, n_jobs=-1 ,verbose=2)
cv_gbc.fit(X_train, Y_train)

In [29]:
dump(cv_gbc, f'../scripts/models/cv_gbc-{datetime.date.today()}.joblib')

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

In [30]:
cv_gbc = load('../flask_app/models/cv_gbc.joblib')

In [31]:
Y_preds_gbc_tuned = cv_gbc.predict(X_test)

In [32]:
df_reports = create_reports(['gbc_tuned'], Y_test, [Y_preds_gbc_tuned], column_names, df_reports=df_reports, verbose=False)

In [33]:
df_reports

Unnamed: 0,mean accuracy,mean macro avg f1-score,mean weighted avg f1-score
gbc,0.94613,0.6665,0.938002
gbc_tuned,0.948246,0.673501,0.938953


In [34]:
# Other way to calculate mean accuracy for all labels
y = cv_gbc.best_estimator_.predict(X_test)
(y==Y_test).mean()

0.9482464830592431

### 8. Improving  model further

In [35]:
# TODOs
# BERT - Modify head -> view "ML Pipeline Preparation.ipynb"

In [36]:
def mean_accuracy_score(estimator, X, y):
    y_pred = estimator.predict(X)
    score = (y_pred==y).mean()
    return score

In [37]:
def mean_macro_average_f1(estimator, X, y):
    y_pred = estimator.predict(X)
    
    f_tmp = 0
    for col in range(y.shape[1]):
        f_tmp += f1_score(y[:,col], y_pred[:,col], average='macro')
    
    score = f_tmp/y.shape[1]
    return score

In [38]:
# Class do perform hyperparameter optimization with Optuna
# Comparing XGBoost, Scikit-Learn's Gradient Boosting and Support Vector Machines

class Objective(object):
    def __init__(self, data):
        self.data = data

    def __call__(self, trial):
        x, y = self.data

        classifier_name = trial.suggest_categorical("classifier", ["GradientBoostingClassifier", "XGB", "SVC"])
        
        if classifier_name == "GradientBoostingClassifier":
            gbc_lr = trial.suggest_loguniform("gbc_lr", 1e-2, 6e-1)
            gbc_estimators = int(trial.suggest_loguniform("gbc_estimators", 30, 300))
            gbc_depth = int(trial.suggest_uniform("gbc_depth", 3, 8))
            
            gbc = GradientBoostingClassifier(learning_rate=gbc_lr,
                                             n_estimators=gbc_estimators,
                                             max_depth=gbc_depth)
            transform = TfidfVectorizer(tokenizer=tokenize)
            
            classifier_obj = make_pipeline(transform, MultiOutputClassifier(gbc))
        
        elif classifier_name=="SVC":
            svc_c = trial.suggest_loguniform("svc_c", 1e-10, 1e10)
            svc_gamma = trial.suggest_categorical("svc_gamma", ["auto", "scale"])
            
            svc = SVC(C=svc_c, gamma=svc_gamma)
            transform = TfidfVectorizer(tokenizer=tokenize)
            
            classifier_obj = make_pipeline(transform, MultiOutputClassifier(svc))
            
        else:            
            param = {
                "silent": 1,
                "objective": "binary:logistic",
                "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
                "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
                "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
            }

            if param["booster"] == "gbtree" or param["booster"] == "dart":
                param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
                param["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
                param["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
                param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
            if param["booster"] == "dart":
                param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
                param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
                param["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
                param["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)
            
            
            xgb = XGBClassifier(**param)    
            transform = TfidfVectorizer(tokenizer=tokenize)
            
            classifier_obj = make_pipeline(transform, MultiOutputClassifier(xgb))
        
        print(f"[{trial.number}] Cross-validating trial ...\n")
        print(f"[{trial.number}] Using {classifier_name}")
        # Use 1 CPU core per trial (n_jobs=1)
        score = cross_val_score(classifier_obj, x, y, scoring=mean_macro_average_f1, n_jobs=1, cv=2)
        print(f"[{trial.number}] Finished cross-validation!")
        score = score.mean()
        return score
    
# TODO: Test CatBoost and LightGBM

In [18]:
# %pdb off
# Load the dataset in advance for reusing it each trial execution.
data = (X_train, Y_train)
objective = Objective(data)

study = optuna.create_study(direction="maximize")
# Parallelize in all CPUs cores
study.optimize(objective, n_trials=64, n_jobs=-1)#,  show_progress_bar=True)
print(study.best_trial)
dump(study, f'optimization_studies/study-{datetime.date.today()}.pkl')

Cross-validating trial [0] ...Cross-validating trial [1] ...Cross-validating trial [2] ...

Cross-validating trial [3] ...



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Finished cross-validation!


[I 2020-03-29 00:25:26,527] Finished trial#3 resulted in value: 0.47489560900009964. Current best value is 0.47489560900009964 with parameters: {'classifier': 'SVC', 'svc_c': 1.0562657575145283e-07, 'svc_gamma': 'auto'}.
  'precision', 'predicted', average, warn_for)


Finished cross-validation!


[I 2020-03-29 00:25:30,349] Finished trial#2 resulted in value: 0.47489560900009964. Current best value is 0.47489560900009964 with parameters: {'classifier': 'SVC', 'svc_c': 1.0562657575145283e-07, 'svc_gamma': 'auto'}.


Finished cross-validation!


[I 2020-03-29 00:28:13,901] Finished trial#0 resulted in value: 0.6657379262335108. Current best value is 0.6657379262335108 with parameters: {'classifier': 'GradientBoostingClassifier', 'gbc_lr': 0.03950316541652496, 'gbc_estimators': 80.32742106036575, 'gbc_depth': 7.920818745614438}.
  'precision', 'predicted', average, warn_for)


Finished cross-validation!


[I 2020-03-29 00:30:52,592] Finished trial#1 resulted in value: 0.6324655247033049. Current best value is 0.6657379262335108 with parameters: {'classifier': 'GradientBoostingClassifier', 'gbc_lr': 0.03950316541652496, 'gbc_estimators': 80.32742106036575, 'gbc_depth': 7.920818745614438}.


FrozenTrial(number=0, value=0.6657379262335108, datetime_start=datetime.datetime(2020, 3, 29, 0, 22, 3, 412810), datetime_complete=datetime.datetime(2020, 3, 29, 0, 28, 13, 900804), params={'classifier': 'GradientBoostingClassifier', 'gbc_lr': 0.03950316541652496, 'gbc_estimators': 80.32742106036575, 'gbc_depth': 7.920818745614438}, distributions={'classifier': CategoricalDistribution(choices=('GradientBoostingClassifier', 'XGB', 'SVC')), 'gbc_lr': LogUniformDistribution(high=0.6, low=0.01), 'gbc_estimators': LogUniformDistribution(high=300, low=30), 'gbc_depth': UniformDistribution(high=8, low=3)}, user_attrs={}, system_attrs={'_number': 0}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE)


['study.pkl']

In [39]:
study = load('optimization_studies/study-3.pkl')
print('Best trial until now:')
print(' Value: ', study.best_trial.value)
print(' Params: ')
for key, value in study.best_trial.params.items():
    print(f'    {key}: {value}')

Best trial until now:
 Value:  0.6128839894727732
 Params: 
    classifier: XGB
    booster: dart
    lambda: 2.1730751676538944e-08
    alpha: 3.3754198596250943e-06
    max_depth: 9
    eta: 0.9570893233427447
    gamma: 0.0003481149200635622
    grow_policy: lossguide
    sample_type: uniform
    normalize_type: tree
    rate_drop: 1.4949511164445425e-07
    skip_drop: 2.818772298302604e-06


In [40]:
pd.set_option('display.max_rows',64)
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,params_alpha,params_booster,params_classifier,params_eta,params_gamma,params_gbc_depth,params_gbc_estimators,params_gbc_lr,params_grow_policy,params_lambda,params_max_depth,params_normalize_type,params_rate_drop,params_sample_type,params_skip_drop,params_svc_c,params_svc_gamma,system_attrs__number,state
0,0,0.565407,2020-03-29 01:27:15.987016,2020-03-29 02:15:13.122438,,,GradientBoostingClassifier,,,3.148234,153.984408,0.010393,,,,,,,,,,0,COMPLETE
1,1,0.474817,2020-03-29 01:27:15.988294,2020-03-29 01:58:12.363813,,,SVC,,,,,,,,,,,,,2.2564e-08,auto,1,COMPLETE
2,2,0.529609,2020-03-29 01:27:15.989154,2020-03-29 01:35:25.131919,0.0008568883,dart,XGB,0.01963646,0.120401,,,,lossguide,7.220142e-06,1.0,forest,0.00112553,uniform,0.005459291,,,2,COMPLETE
3,3,0.562586,2020-03-29 01:27:15.992491,2020-03-29 01:55:24.901511,,,GradientBoostingClassifier,,,5.246889,45.986809,0.021441,,,,,,,,,,3,COMPLETE
4,4,0.531164,2020-03-29 01:27:15.993758,2020-03-29 03:07:44.260328,,,SVC,,,,,,,,,,,,,0.5416569,scale,4,COMPLETE
5,5,0.610161,2020-03-29 01:27:15.994798,2020-03-29 02:04:16.291907,5.116875e-06,dart,XGB,0.2285427,9.797291e-05,,,,lossguide,6.910054e-08,9.0,forest,0.5032392,weighted,1.442294e-07,,,5,COMPLETE
6,6,0.602571,2020-03-29 01:27:15.998380,2020-03-29 04:43:53.191903,,,GradientBoostingClassifier,,,6.870526,263.170727,0.10068,,,,,,,,,,6,COMPLETE
7,7,0.604938,2020-03-29 01:27:15.999804,2020-03-29 01:54:20.859205,0.0001186826,dart,XGB,0.006027397,0.05974223,,,,depthwise,1.120853e-08,6.0,forest,2.48444e-05,weighted,1.897093e-08,,,7,COMPLETE
8,8,0.58482,2020-03-29 01:35:25.143207,2020-03-29 02:05:32.111070,,,GradientBoostingClassifier,,,5.049818,55.036054,0.025576,,,,,,,,,,8,COMPLETE
9,9,0.572563,2020-03-29 01:54:20.863304,2020-03-29 08:58:10.505464,,,SVC,,,,,,,,,,,,,1021100000.0,auto,9,COMPLETE


In [41]:
study.best_params

{'classifier': 'XGB',
 'booster': 'dart',
 'lambda': 2.1730751676538944e-08,
 'alpha': 3.3754198596250943e-06,
 'max_depth': 9,
 'eta': 0.9570893233427447,
 'gamma': 0.0003481149200635622,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'tree',
 'rate_drop': 1.4949511164445425e-07,
 'skip_drop': 2.818772298302604e-06}

In [42]:
tuned_params = study.best_params
tuned_params.pop('classifier', None)
tuned_params

{'booster': 'dart',
 'lambda': 2.1730751676538944e-08,
 'alpha': 3.3754198596250943e-06,
 'max_depth': 9,
 'eta': 0.9570893233427447,
 'gamma': 0.0003481149200635622,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'tree',
 'rate_drop': 1.4949511164445425e-07,
 'skip_drop': 2.818772298302604e-06}

In [43]:
xgb = XGBClassifier(**tuned_params)    
transform = TfidfVectorizer(tokenizer=tokenize)

tpe_tuned_xgb = make_pipeline(transform, MultiOutputClassifier(xgb))

In [41]:
tpe_tuned_xgb.fit(X_train, Y_train)
dump(tpe_tuned_xgb, f'../scripts/models/tpe_tuned_xgb-{datetime.date.today()}.joblib')

['../scripts/models/tpe_tuned_xgb.joblib']

In [44]:
Y_preds_tpe = tpe_tuned_xgb.predict(X_test)
df_reports = create_reports(['tpe_tuned_xgb'], Y_test, [Y_preds_tpe], column_names, df_reports=df_reports, verbose=False)

In [48]:
df_reports.sort_values('mean macro avg f1-score', ascending=False)

Unnamed: 0,mean accuracy,mean macro avg f1-score,mean weighted avg f1-score
gbc_tuned,0.948246,0.673501,0.938953
tpe_tuned_xgb,0.939905,0.6709,0.936478
gbc,0.94613,0.6665,0.938002


As seen above, bayesian optimization (with the Tree-structured Parzen Estimator implementation of Optuna) __might not__ improve our model. It helps exploring the search space of possible hyperparameters, since testing all possibilities would be impractical. 

It tries to make more intelligent choices of hyperparameters (than random or pre-defined grid) by using probabilistic models and with them select the most promising hyperparameters to enhance our model performance.

Obs.: Another observation is that due to imbalance between classes (shown in the beginning of this notebook), the F1 score was used in the objective function to be optimized