# Machine Learning Pipeline For Disaster Response Message Classification

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import sqlalchemy
import re

# Imports from nltk
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Imports from sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrick.peltier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrick.peltier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patrick.peltier/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrick.peltier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [8]:
# Read the complete `messages` table from the local SQLite database
engine = sqlalchemy.create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("select * from messages", engine)

# Features are the raw message texts; targets are all remaining columns once
# the identifier / text / metadata columns are excluded
non_target_columns = ["id","message","original","genre"]
X = df["message"]
Y = df[df.columns.difference(non_target_columns)]

In [9]:
# Export the loaded table to a CSV file in the working directory
df.to_csv("disaster_response.csv")

In [4]:
# Split data into train (67%) and test (33%) sets. The fixed seed keeps the
# split -- and therefore every metric computed from it -- reproducible after
# a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

## Pre-Processing

In [3]:
def tokenize(text):
    """ Text tokenizer
    
        Processes text in four steps:
        - Converts text to lower case & replaces every character that is
          not a-z or 0-9 with a space
        - Splits string into tokens
        - Lemmatizes tokens
        - Removes English stopwords
        
        Args: 
            text (str): Input text
        Returns:
            list of str: Clean tokens
    """
    # Remove all non-alpha-numeric characters and tokenize text
    tokens = word_tokenize(re.sub('[^a-z0-9]', ' ', text.lower().strip()))
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(tok).strip() for tok in tokens]
    
    # Remove stopwords. The stopword list is loaded once per call and kept in
    # a set; re-loading it for every single token is needlessly slow.
    stop_words = set(stopwords.words("english"))
    return [tok for tok in clean_tokens if tok not in stop_words]

## First Training Round

In [61]:
# Define pipeline: token counts -> TF-IDF weighting -> multi-output SGD classifier
base_classifier = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(base_classifier, n_jobs=-1)),
])

In [62]:
# Train model: fit all pipeline steps on the training split
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...om_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False),
           n_jobs=-1))])

In [63]:
# Predict the target columns for the held-out test messages
Y_pred = pipeline.predict(X_test)

In [64]:
# Per-category precision / recall / f1 report on the test split
categories = list(Y_test.columns)
report = classification_report(Y_test, Y_pred, target_names=categories)
print(report)

                        precision    recall  f1-score   support

           aid_centers       0.33      0.01      0.02        94
           aid_related       0.73      0.70      0.71      3600
             buildings       0.72      0.33      0.46       427
              clothing       0.78      0.45      0.57       134
                  cold       0.70      0.33      0.45       180
                 death       0.77      0.46      0.58       387
         direct_report       0.67      0.53      0.59      1657
            earthquake       0.89      0.77      0.82       830
           electricity       0.63      0.25      0.36       177
                  fire       0.85      0.25      0.39        87
                floods       0.88      0.54      0.67       676
                  food       0.83      0.68      0.75       996
             hospitals       0.30      0.04      0.07        76
infrastructure_related       0.38      0.07      0.12       539
          medical_help       0.61      

  'precision', 'predicted', average, warn_for)


In [65]:
# Overall scores on the test split: accuracy plus weighted-average
# precision, recall and f1 score
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = [
    metric(Y_test, Y_pred, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
]

# Print metrics
summary_template = "\nAccuracy: {:.2f} \nPrecision: {:.2f} \nRecall: {:.2f} \nF1 Score: {:.2f} \n"
print(summary_template.format(accuracy, precision, recall, f1))


Accuracy: 0.30 
Precision: 0.74 
Recall: 0.60 
F1 Score: 0.65 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Second Training Round

In [None]:
# Define pipeline with SGDClassifier
random_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(SGDClassifier(loss="modified_huber",
                                                max_iter=1000,
                                                tol=1e-3),
                                  n_jobs=-1))
])

# Define parameters grid: candidate values for the SGD regularization
# constant `alpha` of the wrapped classifier
parameters =  {
    'clf__estimator__alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
}

# Create cross validation object for an exhaustive grid search.
# `cv` and `scoring` are spelled out so the search does not depend on library
# defaults: the default number of folds differs between scikit-learn
# releases, and without `scoring` the estimator's own `score` method would be
# optimised instead of the weighted f1 score reported in this notebook.
alpha_grid = GridSearchCV(random_pipeline, param_grid=parameters, cv=3, scoring='f1_weighted')

In [72]:
# Run grid search: cross-validates the pipeline for every candidate alpha
alpha_grid.fit(X_train, Y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...om_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False),
           n_jobs=-1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__estimator__alpha': [0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=0)

In [80]:
# Predict on test data with the best estimator found by the grid search
Y_pred = alpha_grid.best_estimator_.predict(X_test)

In [81]:
# Per-category precision / recall / f1 report on the test split
categories = list(Y_test.columns)
report = classification_report(Y_test, Y_pred, target_names=categories)
print(report)

                        precision    recall  f1-score   support

           aid_centers       0.33      0.01      0.02        94
           aid_related       0.73      0.70      0.71      3600
             buildings       0.72      0.33      0.46       427
              clothing       0.78      0.45      0.57       134
                  cold       0.70      0.33      0.45       180
                 death       0.77      0.46      0.58       387
         direct_report       0.67      0.53      0.59      1657
            earthquake       0.89      0.77      0.82       830
           electricity       0.63      0.25      0.36       177
                  fire       0.85      0.25      0.39        87
                floods       0.88      0.54      0.67       676
                  food       0.83      0.68      0.75       996
             hospitals       0.30      0.04      0.07        76
infrastructure_related       0.38      0.07      0.12       539
          medical_help       0.61      

  'precision', 'predicted', average, warn_for)


In [82]:
# Overall scores on the test split: accuracy plus weighted-average
# precision, recall and f1 score
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = [
    metric(Y_test, Y_pred, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
]

# Print metrics
summary_template = "\nAccuracy: {:.2f} \nPrecision: {:.2f} \nRecall: {:.2f} \nF1 Score: {:.2f} \n"
print(summary_template.format(accuracy, precision, recall, f1))


Accuracy: 0.30 
Precision: 0.74 
Recall: 0.60 
F1 Score: 0.65 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [79]:
# Best parameters: the alpha value selected by the grid search
alpha_grid.best_params_

{'clf__estimator__alpha': 0.0001}

## Third Training Round

In [24]:
class CharacterCount(BaseEstimator, TransformerMixin):
    """ Custom sklearn transformer class to count number of characters
    
        The transformer is stateless: ``fit`` learns nothing and
        ``transform`` maps every text to its length.
    """
    def character_count(self, text):
        """ Counts the number of characters in string
        
            Args: 
                text (str): Input text
            Returns:
                int: Number of characters
        """
        
        return len(text)

    def fit(self, x, y=None):
        """ Does nothing; present for sklearn API compatibility

            Returns:
                CharacterCount: The transformer itself
        """
        return self

    def transform(self, X):
        """ Counts the characters of every text in X
        
            Args: 
                X (iterable of str): Input texts
            Returns:
                numpy.ndarray: Integer array of shape (n_samples, 1)
        """
        counts = pd.Series(X).apply(self.character_count)
        # Return a plain 2-D column instead of DataFrame.to_sparse(), which
        # was removed in pandas 1.0. FeatureUnion stacks dense and sparse
        # blocks alike, so no sparse wrapper is needed.
        return np.asarray(counts, dtype=int).reshape(-1, 1)

In [25]:
class WordCount(BaseEstimator, TransformerMixin):
    """ Custom sklearn transformer class to count number of words
    
        The transformer is stateless: ``fit`` learns nothing and
        ``transform`` maps every text to its word count.
    """
    def word_count(self, text):
        """ Counts the number of words in string
        
            The text is lower-cased and every character that is not a-z is
            replaced by a space before tokenizing, so digits and punctuation
            are not counted as words.
        
            Args: 
                text (str): Input text
            Returns:
                int: Number of words
        """
        tokens = nltk.word_tokenize(re.sub('[^a-z]', ' ', text.lower()))
        return len(tokens)

    def fit(self, x, y=None):
        """ Does nothing; present for sklearn API compatibility

            Returns:
                WordCount: The transformer itself
        """
        return self

    def transform(self, X):
        """ Counts the words of every text in X
        
            Args: 
                X (iterable of str): Input texts
            Returns:
                numpy.ndarray: Integer array of shape (n_samples, 1)
        """
        counts = pd.Series(X).apply(self.word_count)
        # Return a plain 2-D column instead of DataFrame.to_sparse(), which
        # was removed in pandas 1.0. FeatureUnion stacks dense and sparse
        # blocks alike, so no sparse wrapper is needed.
        return np.asarray(counts, dtype=int).reshape(-1, 1)

In [26]:
class StopwordCount(BaseEstimator, TransformerMixin):
    """ Custom sklearn transformer class to count number of stopwords
    
        The transformer is stateless: ``fit`` learns nothing and
        ``transform`` maps every text to its number of English stopwords.
    """
    def stopword_count(self, text, stop_words=None):
        """ Counts the number of stopwords in string
        
            Args: 
                text (str): Input text
                stop_words (set of str, optional): Stopwords to look for.
                    Defaults to the NLTK English stopword list; passing a
                    pre-built set avoids re-loading the list for every text.
            Returns:
                int: Number of stopwords
        """
        if stop_words is None:
            stop_words = set(stopwords.words("english"))
        tokens = nltk.word_tokenize(re.sub('[^a-z]', ' ', text.lower()))
        return sum(1 for tok in tokens if tok in stop_words)

    def fit(self, x, y=None):
        """ Does nothing; present for sklearn API compatibility

            Returns:
                StopwordCount: The transformer itself
        """
        return self

    def transform(self, X):
        """ Counts the stopwords of every text in X
        
            Args: 
                X (iterable of str): Input texts
            Returns:
                numpy.ndarray: Integer array of shape (n_samples, 1)
        """
        # Load the stopword list once per transform call rather than once per
        # token; Series.apply forwards the keyword argument to stopword_count.
        stop_words = set(stopwords.words("english"))
        counts = pd.Series(X).apply(self.stopword_count, stop_words=stop_words)
        # Return a plain 2-D column instead of DataFrame.to_sparse(), which
        # was removed in pandas 1.0.
        return np.asarray(counts, dtype=int).reshape(-1, 1)

In [27]:
class StartingVerb(BaseEstimator, TransformerMixin):
    """ Custom sklearn transformer class to check if a sentence starts with a verb
    
        The transformer is stateless: ``fit`` learns nothing and
        ``transform`` maps every text to a 0/1 flag.
    """
    def starting_verb(self, text):
        """ Checks if any sentence in text starts with a verb or retweet marker
        
            Each sentence is passed through ``tokenize`` before tagging, so
            the "first word" is the first token that ``tokenize`` returns
            for that sentence.
        
            Args: 
                text (str): Input text
            Returns:
                bool: True if the first token of at least one sentence is
                tagged 'VB' / 'VBP' or is the retweet marker, else False
        """
        for sentence in sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases its input, so the retweet marker has to
            # be matched as 'rt'; a comparison with 'RT' can never succeed.
            if first_tag in ['VB', 'VBP'] or first_word == 'rt':
                return True
        return False

    def fit(self, x, y=None):
        """ Does nothing; present for sklearn API compatibility

            Returns:
                StartingVerb: The transformer itself
        """
        return self

    def transform(self, X):
        """ Flags every text in X that has a sentence starting with a verb
        
            Args: 
                X (iterable of str): Input texts
            Returns:
                numpy.ndarray: Integer (0/1) array of shape (n_samples, 1)
        """
        flags = pd.Series(X).apply(self.starting_verb)
        # Return a plain 2-D column instead of DataFrame.to_sparse(), which
        # was removed in pandas 1.0.
        return np.asarray(flags, dtype=int).reshape(-1, 1)

In [85]:
# Define new pipeline with feature unions: the TF-IDF text features are
# combined with the custom character-count, word-count and starting-verb
# features before classification
text_features = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
])
combined_features = FeatureUnion([
    ('nlp', text_features),
    ('char_count', CharacterCount()),
    ('word_count', WordCount()),
    ('starting_verb', StartingVerb()),
])
multi_output_sgd = MultiOutputClassifier(
    SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3, alpha=1e-4),
    n_jobs=-1,
)
random_feat_pipeline = Pipeline([
    ('features', combined_features),
    ('clf', multi_output_sgd),
])

In [87]:
# Define parameter distributions for the feature-union pipeline: vectorizer
# settings, TF-IDF weighting and the relative weights of the feature blocks
parameters =  {
    'features__nlp__vect__ngram_range': [(1, 1), (1, 2)],
    'features__nlp__vect__max_df': [0.75, 1.0],
    'features__nlp__vect__max_features': [None, 1000, 5000],
    'features__nlp__tfidf__use_idf': [True, False],
    'features__transformer_weights': [
            {'nlp': 1.0,'char_count': 0.0,'word_count': 0.0,'starting_verb': 0.0},
            {'nlp': 1.0,'char_count': 0.5,'word_count': 0.5,'starting_verb': 0.5},
            {'nlp': 1.0,'char_count': 0.75,'word_count': 0.75,'starting_verb': 0.75},
            {'nlp': 1.0,'char_count': 1.0,'word_count': 1.0,'starting_verb': 1.0},
    ]
}

# Create cross validation object for randomized search.
# The search must wrap `random_feat_pipeline`: the `features__...` parameter
# names above only exist on that pipeline, not on the first-round `pipeline`.
random_feat_grid = RandomizedSearchCV(random_feat_pipeline, param_distributions=parameters, cv=3, n_iter=15)

In [88]:
# Run the randomized search over the parameter distributions defined above
random_feat_grid.fit(X_train, Y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('nlp', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max...om_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False),
           n_jobs=-1))]),
          fit_params=None, iid=True, n_iter=15, n_jobs=1,
          param_distributions={'features__nlp__vect__ngram_range': [(1, 1), (1, 2)], 'features__nlp__vect__max_df': [0.75, 1.0], 'features__nlp__vect__max_features': [None, 1000, 5000], 'features__nlp__tfidf__use_idf': [True, False], 'features__transformer_weights': [{'nlp': 1.0, 'char_count': 0.0, 'word_coun..., 'starting_verb': 0.75}, {'nlp': 1.0, 'char_count': 1.0, 'word_count': 1.0, 'starting_verb': 1.0}]},
          pre_dispatch='2*n_

In [89]:
# Predict on test data with the best estimator found by the randomized search
Y_pred = random_feat_grid.best_estimator_.predict(X_test)

In [90]:
# Per-category precision / recall / f1 report on the test split
categories = list(Y_test.columns)
report = classification_report(Y_test, Y_pred, target_names=categories)
print(report)

                        precision    recall  f1-score   support

           aid_centers       0.50      0.01      0.02        94
           aid_related       0.76      0.70      0.73      3600
             buildings       0.67      0.39      0.49       427
              clothing       0.77      0.49      0.60       134
                  cold       0.60      0.31      0.41       180
                 death       0.74      0.50      0.60       387
         direct_report       0.70      0.47      0.56      1657
            earthquake       0.89      0.76      0.82       830
           electricity       0.51      0.21      0.30       177
                  fire       0.74      0.23      0.35        87
                floods       0.88      0.53      0.66       676
                  food       0.82      0.67      0.74       996
             hospitals       0.27      0.05      0.09        76
infrastructure_related       0.42      0.05      0.09       539
          medical_help       0.62      

  'precision', 'predicted', average, warn_for)


In [91]:
# Overall scores on the test split: accuracy plus weighted-average
# precision, recall and f1 score
accuracy = accuracy_score(Y_test, Y_pred)
precision, recall, f1 = [
    metric(Y_test, Y_pred, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
]

# Print metrics
summary_template = "\nAccuracy: {:.2f} \nPrecision: {:.2f} \nRecall: {:.2f} \nF1 Score: {:.2f} \n"
print(summary_template.format(accuracy, precision, recall, f1))


Accuracy: 0.30 
Precision: 0.75 
Recall: 0.59 
F1 Score: 0.64 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [95]:
# Best parameters: the combination selected by the randomized search
random_feat_grid.best_params_

{'features__transformer_weights': {'nlp': 1.0,
  'char_count': 0.0,
  'word_count': 0.0,
  'starting_verb': 0.0},
 'features__nlp__vect__ngram_range': (1, 1),
 'features__nlp__vect__max_features': 1000,
 'features__nlp__vect__max_df': 1.0,
 'features__nlp__tfidf__use_idf': False}

## Save Model

In [96]:
# Persist the best pipeline found by the randomized search to disk
best_model = random_feat_grid.best_estimator_
joblib.dump(best_model, "classifier.pkl", compress=1)

['classifier.pkl']

In [97]:
# Re-load from file to check if everything is correct
# NOTE(review): joblib.load unpickles the file; only load files you created yourself
joblib_model = joblib.load("classifier.pkl")

# Calculate the accuracy of the re-loaded model on the test split and
# display it as the cell output
score = joblib_model.score(X_test, Y_test)
score

0.3012805587892899