# ML Pipeline Preparation
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database
- Define feature and target variables X and Y

In [1]:
# %%bash
# conda install scikit-learn=0.21.2 -y
# # git clone -b master https://github.com/charles9n/bert-sklearn
# cd bert-sklearn; pip install .
# conda install joblib -y

In [9]:
# import libraries and set configurations
from IPython.display import display
import pandas as pd
import numpy as np
import re

from sqlalchemy import create_engine

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Imports for testing the LightGBM inspired Scikit Learn Histogram-based Gradient Boosting Tree.
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from joblib import dump, load

from bert_sklearn import BertClassifier
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

import torch
print('pytorch version:', torch.__version__)
# `torch.has_cuda` only tells whether this PyTorch build was compiled with CUDA support; on such a
# build running on a machine without a usable GPU the device query below would raise.
# `torch.cuda.is_available()` checks at runtime that a CUDA device can actually be used.
if torch.cuda.is_available():
    print('GPU:',torch.cuda.get_device_name(0))

# Download the NLTK resources listed here (tokenizer, WordNet, POS tagger, stopwords).
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger','stopwords'])
# Show up to 40 columns when displaying DataFrames.
pd.set_option('display.max_columns',40)

# modified ...lib/python3.6/site-packages/sklearn/multioutput.py to accept X as 1D array in class MultiOutputEstimator in
# def fit(self, X, y, sample_weight=None):
#     (...)
#     X, y = check_X_y(X, y,
#                      multi_output=True,
#                      accept_sparse=True,
#                      ensure_2d=False)
# 
# TODO: Open issue on scikit GitHub Repo to understand why X is required to be 2D array

pytorch version: 1.3.0


[nltk_data] Downloading package punkt to /Users/mccunha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mccunha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mccunha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mccunha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# load data from database
# SQLAlchemy engine for the SQLite file at ../data/DisasterResponse.db
# (relative path, resolved against the current working directory).
engine = create_engine('sqlite:///../data/DisasterResponse.db')

In [6]:
# Read the whole `disaster_data` table into a DataFrame.
df = pd.read_sql('disaster_data', engine)

In [7]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
def highlight_imbalanced(col, threshold=0.1):
    """Return one CSS style string per entry of a normalized frequency column.

    Intended for `DataFrame.style.apply`, which calls it once per column.

    Args:
        col: 1-D sequence (e.g. a pandas Series) of relative frequencies, one
            entry per distinct value of a label.
        threshold: maximum allowed absolute distance between a frequency and the
            perfectly balanced frequency `1 / len(col)` before the entry is
            highlighted. Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        A list of CSS strings with the same length as `col`:
        - 'background-color: red' for every entry when the column has fewer
          than 2 entries (the label has a single value),
        - 'background-color: yellow' for entries farther than `threshold` from
          the balanced frequency,
        - '' otherwise.
        An empty `col` yields an empty list instead of a ZeroDivisionError.
    """
    n_values = len(col)
    if n_values == 0:
        # Nothing to style; also avoids dividing by zero below.
        return []
    if n_values < 2:
        return ['background-color: red'] * n_values

    distance_perfect_distribution = np.abs(col - (1 / n_values))
    return ['background-color: yellow' if dist > threshold else ''
            for dist in distance_perfect_distribution]


In [7]:
# Announce the per-label frequency tables rendered by the loop below.
print('Viewing distributions of values per label (class imbalances are highlighted):\n')

# Columns from position 4 onward are the label columns
# (positions 0-3 are id, message, original and genre).
df_X = df.iloc[:,4:]
for label in df_X.columns:
    # Single-column table holding the relative frequency of each value of this label.
    freq_table = pd.crosstab(index=df_X[label], columns='%freq', normalize='columns')
    display(freq_table.style.apply(highlight_imbalanced))
    # A label with a single distinct value carries no information for a classifier.
    if freq_table.shape[0] < 2:
        print('Column with less than 2 values!')

03/26/2020 21:27:13 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


Viewing distributions of values per label (class imbalances are highlighted):



col_0,%freq
related,Unnamed: 1_level_1
0,0.233522
1,0.759307
2,0.00717119


col_0,%freq
request,Unnamed: 1_level_1
0,0.829341
1,0.170659


col_0,%freq
offer,Unnamed: 1_level_1
0,0.995499
1,0.00450107


col_0,%freq
aid_related,Unnamed: 1_level_1
0,0.585749
1,0.414251


col_0,%freq
medical_help,Unnamed: 1_level_1
0,0.920507
1,0.0794934


col_0,%freq
medical_products,Unnamed: 1_level_1
0,0.949916
1,0.0500839


col_0,%freq
search_and_rescue,Unnamed: 1_level_1
0,0.972383
1,0.0276167


col_0,%freq
security,Unnamed: 1_level_1
0,0.982034
1,0.0179661


col_0,%freq
military,Unnamed: 1_level_1
0,0.967196
1,0.0328044


col_0,%freq
child_alone,Unnamed: 1_level_1
0,1


Column with less than 2 values!


col_0,%freq
water,Unnamed: 1_level_1
0,0.936222
1,0.0637778


col_0,%freq
food,Unnamed: 1_level_1
0,0.888503
1,0.111497


col_0,%freq
shelter,Unnamed: 1_level_1
0,0.911733
1,0.0882667


col_0,%freq
clothing,Unnamed: 1_level_1
0,0.984551
1,0.0154486


col_0,%freq
money,Unnamed: 1_level_1
0,0.976961
1,0.0230394


col_0,%freq
missing_people,Unnamed: 1_level_1
0,0.988633
1,0.0113671


col_0,%freq
refugees,Unnamed: 1_level_1
0,0.966623
1,0.0333766


col_0,%freq
death,Unnamed: 1_level_1
0,0.954455
1,0.0455447


col_0,%freq
other_aid,Unnamed: 1_level_1
0,0.868554
1,0.131446


col_0,%freq
infrastructure_related,Unnamed: 1_level_1
0,0.934963
1,0.0650366


col_0,%freq
transport,Unnamed: 1_level_1
0,0.954188
1,0.0458117


col_0,%freq
buildings,Unnamed: 1_level_1
0,0.949153
1,0.0508468


col_0,%freq
electricity,Unnamed: 1_level_1
0,0.979707
1,0.020293


col_0,%freq
tools,Unnamed: 1_level_1
0,0.993935
1,0.006065


col_0,%freq
hospitals,Unnamed: 1_level_1
0,0.989205
1,0.0107949


col_0,%freq
shops,Unnamed: 1_level_1
0,0.995423
1,0.00457736


col_0,%freq
aid_centers,Unnamed: 1_level_1
0,0.988213
1,0.0117867


col_0,%freq
other_infrastructure,Unnamed: 1_level_1
0,0.956096
1,0.0439045


col_0,%freq
weather_related,Unnamed: 1_level_1
0,0.721659
1,0.278341


col_0,%freq
floods,Unnamed: 1_level_1
0,0.917798
1,0.0822017


col_0,%freq
storm,Unnamed: 1_level_1
0,0.906813
1,0.0931874


col_0,%freq
fire,Unnamed: 1_level_1
0,0.989243
1,0.0107568


col_0,%freq
earthquake,Unnamed: 1_level_1
0,0.906355
1,0.0936451


col_0,%freq
cold,Unnamed: 1_level_1
0,0.979783
1,0.0202167


col_0,%freq
other_weather,Unnamed: 1_level_1
0,0.947513
1,0.052487


col_0,%freq
direct_report,Unnamed: 1_level_1
0,0.806416
1,0.193584


As shown above, almost all classes are imbalanced.

Here, a label column is considered imbalanced (skewed distribution) when the relative frequency of any of its values is more than 10 percentage points away from a perfectly balanced distribution. For example, a label with 3 possible values has a perfect balance of 33.33% per value; if a value occurs more than 43.33% or less than 23.33% of the time, the label is considered imbalanced.

`child_alone` is always 0. Therefore we can ignore this column.

In [8]:
# Drop `child_alone` column (it is always 0, so there is nothing to learn from it).
# Rebinding `df` with `errors='ignore'` instead of dropping in place keeps this cell safe to
# re-run: a second in-place drop would raise KeyError because the column is already gone.
df = df.drop(columns='child_alone', errors='ignore')

In [9]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Feature: the `message` column (position 1, right after `id`), as a 1-D NumPy array.
X = df.iloc[:,1].values
# Targets: every column from position 4 onward (the label columns, starting at `related`).
Y = df.iloc[:,4:].values

In [13]:
X[:5]

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name',
       'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
       'says: west side of Haiti, rest of the country today and tonight'],
      dtype=object)

In [14]:
Y[:5,:]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [2]:
def load_data(db_path, table_name='disaster_data'):
    """Load the messages and their labels from a SQLite database.

    Args:
        db_path: path to the SQLite file (inserted into a `sqlite:///` URL).
        table_name: name of the table to read. Defaults to 'disaster_data',
            the table name that used to be hard-coded.

    Returns:
        Tuple `(X, Y, col_names)` where
        - X is a 1-D array with the values of the column at position 1 (`message`),
        - Y is a 2-D array with the values of all columns from position 4 onward
          (the label columns),
        - col_names is the Index of ALL remaining DataFrame columns, i.e. it also
          contains the four leading non-label columns.
    """
    engine = create_engine(f'sqlite:///{db_path}')
    df = pd.read_sql(table_name, engine)
    # `child_alone` only ever takes the value 0 in this dataset, so it is removed.
    # `errors='ignore'` lets the function also work on tables where it was already dropped.
    df = df.drop(columns='child_alone', errors='ignore')
    X = df.iloc[:,1].values
    Y = df.iloc[:,4:].values
    col_names = df.columns
    return X,Y,col_names

X, Y, col_names = load_data('../data/DisasterResponse.db')

In [3]:
X[:5], Y[:5,:]

(array(['Weather update - a cold front from Cuba that could pass over Haiti',
        'Is the Hurricane over or is it not over',
        'Looking for someone but no name',
        'UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.',
        'says: west side of Haiti, rest of the country today and tonight'],
       dtype=object),
 array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [4]:
col_names

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

### 3. Build a machine learning pipeline
- Use MultiOutputClassifier to predict multiple target variables.

In [5]:
class TextTransform(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that casts the incoming samples to `str`.

    Inheriting from `BaseEstimator` (in addition to `TransformerMixin`) provides
    `get_params` / `set_params`, which scikit-learn needs in order to clone the
    pipeline, e.g. inside `GridSearchCV`. Without it the step cannot be cloned.
    """
    def fit(self, X, y=None):
        """Nothing to learn; return self so the step can be chained."""
        return self
    def transform(self, X, y=None):
        """Return `X` with every element converted to `str`.

        Objects that already offer `.astype` (NumPy arrays, pandas objects) are
        converted as before; anything else (e.g. a plain list) is first wrapped
        in a NumPy array so it no longer fails with AttributeError.
        """
        texts = X if hasattr(X, 'astype') else np.asarray(X)
        return texts.astype(str)

In [12]:
# The sklearn BERT wrapper already ships BERT's own tokenizers, so the only
# preprocessing step is casting the messages to str.
# MultiOutputClassifier fits one separate BertClassifier per label column; the short
# sequence length and small batch size are there to reduce GPU memory usage.
pipeline_bert = Pipeline([
                    ('transform_str', TextTransform()),
                    ('clf', MultiOutputClassifier(BertClassifier(max_seq_length=24, 
                                                                 train_batch_size=8,
                                                                 epochs=2,
                                                                 num_mlp_hiddens=200
                                                                 # fp16=True # requires NVIDIA apex
                                                                )))
               ])

Building sklearn text classifier...


In [13]:
pipeline_bert

Pipeline(memory=None,
         steps=[('transform_str',
                 <__main__.TextTransform object at 0x7f45f53453c8>),
                ('clf',
                 MultiOutputClassifier(estimator=BertClassifier(bert_config_json=None,
                                                                bert_model='bert-base-uncased',
                                                                bert_vocab=None,
                                                                do_lower_case=None,
                                                                epochs=2,
                                                                eval_batch_size=8,
                                                                fp16=False,
                                                                from_tf=False,
                                                                gradient_accumulation_steps=2,
                                                                ignore_label=None,
             

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [14]:
# Hold out 33% of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1000)

In [10]:
#%%prun #%%mrun
from tqdm import tqdm

In [None]:
%%time
# %pdb
pipeline_bert = pipeline_bert.fit(X_train,Y_train)



Building sklearn text classifier...
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 15808, validation data size: 1756


Training  : 100%|██████████| 1976/1976 [03:16<00:00, 10.08it/s, loss=0.407]
Validating: 100%|██████████| 220/220 [00:05<00:00, 38.94it/s]

Epoch 1, Train loss: 0.4071, Val loss: 0.3474, Val accy: 86.16%



Training  :  55%|█████▌    | 1092/1976 [01:41<01:27, 10.05it/s, loss=0.244]

Using sklearn's `MultiOutputClassifier` is the easiest approach to extend BERT for multi-label classification.
However, for our 35 labels (the 36 original categories minus the dropped `child_alone`) it does not make sense to fit one BERT model per column because of the model's size (even for the smaller BERT version). Even after reducing the batch size to 8 and the sequence length to 24, fitting on an NVIDIA Tesla V100 with 16GB resulted in CUDA out-of-memory errors, and we could only fit ~15 models.

TODO:

Modify the MLP head added on top of BERT to be multi-label (not only multi-class). 
One strategy to do that would be to use a sigmoid on each output, so that all outputs vary from 0 to 1, and if an output is greater than 0.5 then we consider it 1.

One problem with that is related to labels with more than two classes, such as `related`, which has possible values of 0, 1, 2. What could be done is to group output neurons according to labels, e.g. 3 neurons for `related` (to make it easier to interpret the 3 possible outputs), 1 neuron for `request` (0 or 1), 1 neuron for `offer` (0 or 1), etc. Apply softmax for labels with more than 2 classes and sigmoid for those with only 2 classes.

An easier way would be to consider all classes binary (since `related` label is highly imbalanced).

### 5. Test your model

Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [186]:
def create_reports(clfs_names, Y_test, Y_preds, df_reports=None, column_names=None):
    """Print per-label classification reports and return mean metrics per model.

    Args:
        clfs_names: iterable with one display name per model.
        Y_test: 2-D array of true labels, one column per output label.
        Y_preds: iterable with one 2-D prediction array per model, aligned with
            `clfs_names`; each array has the same column layout as `Y_test`.
        df_reports: optional DataFrame returned by a previous call; the new rows
            are appended to it. When None, a new report is started.
        column_names: optional sequence with the name of each label column
            (indexed like the columns of `Y_test`). When None, the integer column
            index is printed instead. Previously the function read an undefined
            global `column_names` and failed with NameError.

    Returns:
        DataFrame indexed by model name with the columns 'mean accuracy',
        'mean macro avg f1-score' and 'mean weighted avg f1-score', each being
        the mean of that metric over all label columns.
    """
    metric_names = ['mean accuracy', 'mean macro avg f1-score', 'mean weighted avg f1-score']

    row_names = []
    rows = []
    # `clf_preds` deliberately does not reuse the name of the `Y_preds` parameter.
    for clf_name, clf_preds in zip(clfs_names, Y_preds):
        print(f'Metrics for each feature for model - {clf_name}\n')
        per_label = []
        for col in range(clf_preds.shape[1]):
            y_true, y_pred = Y_test[:, col], clf_preds[:, col]
            report = classification_report(y_true, y_pred, output_dict=True)
            per_label.append([report['accuracy'],
                              report['macro avg']['f1-score'],
                              report['weighted avg']['f1-score']])
            print('Column:', column_names[col] if column_names is not None else col)
            print(classification_report(y_true, y_pred),'\n   -----------------------------------------------\n')
        # Collect plain records and build the frame once instead of concatenating in a loop.
        row_names.append(clf_name)
        rows.append(pd.DataFrame(per_label, columns=metric_names).mean(axis=0).tolist())

    new_rows = pd.DataFrame(rows, index=row_names, columns=metric_names)
    if df_reports is None or df_reports.empty:
        return new_rows
    if new_rows.empty:
        return df_reports
    return pd.concat([df_reports, new_rows], axis=0)


In [5]:
# Predict on the held-out split with the fitted BERT pipeline, then summarise the per-label metrics.
# (This cell used to pass `Y_preds_gbc`, a name that is never defined in this notebook.)
Y_preds_bert = pipeline_bert.predict(X_test)
df_reports = create_reports(['bert'], Y_test, [Y_preds_bert])

In [6]:
print('Mean metrics for all outputs of a model:')
df_reports

### 6. Improve your model
Use grid search to find better parameters. 

In [2]:
# pipeline_bert.get_params()

In [3]:
# parameters = {
#     'clf__estimator__learning_rate': [1e-5, 2e-5, 4e-5],
#     'clf__estimator__epochs': [2, 3, 4],
#     'clf__estimator__num_mlp_hiddens': [300,600],
#     'clf__estimator__max_seq_length': [32, 64],
# }

# cv_bert = GridSearchCV(pipeline_bert, param_grid=parameters, n_jobs=-1 ,verbose=2)
# cv_bert.fit(X_train, Y_train)

In [4]:
# dump(cv_bert, '../scripts/models/cv_bert.joblib')

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

In [7]:
# Y_preds_bert_tuned = cv_bert.predict(X_test)

In [8]:
# df_reports = create_reports(['cv_bert'], Y_test, [Y_preds_bert_tuned], df_reports=df_reports)

In [9]:
# df_reports

### 9. Export your model as a pickle file