# ML Pipeline Preparation

### 1. Import libraries and load data from database

In [1]:
# Import libraries
import joblib
import re
import pandas as pd
from sqlalchemy import create_engine

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


[nltk_data] Downloading package stopwords to /home/tri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load data from database
engine = create_engine('sqlite:///../data/messages.db')
df = pd.read_sql('SELECT * FROM messages', engine)
X = df['message'].copy()
Y = df.iloc[:, 4:].copy()
Y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

### 2. Define a tokenization function to process text data

In [4]:
stop = stopwords.words('english')
    
def tokenize(text):
    '''
    Steps:
        Lowercase characters
        Remove punctuation
        Tokenize
        Strip white spaces
        Remove stopwords
        Stem words
    '''
    
    # Steps 1 - 3
    tokens = word_tokenize(re.sub(r'[^A-Za-z0-9]', ' ', text.lower()))
    
    # Step 4 - 5
    stopwords_removed = [word.strip() for word in tokens if word.strip() not in stop]
    
    # Step 6
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in stopwords_removed]


for s in X[:10]:
    print(tokenize(s))

['weather', 'updat', 'cold', 'front', 'cuba', 'could', 'pass', 'haiti']
['hurrican']
['look', 'someon', 'name']
['un', 'report', 'leogan', '80', '90', 'destroy', 'hospit', 'st', 'croix', 'function', 'need', 'suppli', 'desper']
['say', 'west', 'side', 'haiti', 'rest', 'countri', 'today', 'tonight']
['inform', 'nation', 'palac']
['storm', 'sacr', 'heart', 'jesus']
['pleas', 'need', 'tent', 'water', 'silo', 'thank']
['would', 'like', 'receiv', 'messag', 'thank']
['croix', 'des', 'bouquet', 'health', 'issu', 'worker', 'santo', '15', 'area', 'croix', 'des', 'bouquet']


### 3. Build a machine learning pipeline

In [5]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
    ('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=0)))
])

### 4. Train pipeline

In [6]:
# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((19662,), (6554,), (19662, 35), (6554, 35))

In [7]:
pipeline.fit(X_train, Y_train)                          

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x7fac38266940>)),
                ('clf',
                 MultiOutputClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                    random_state=0)))])

### 5. Evaluate pipeline

In [8]:
# Define a function to evaluate models
def evaluate_model(model, X_test, Y_test, label_names, print_reports=False):
    
    pred = pd.DataFrame(model.predict(X_test), columns=label_names)
    
    metrics = []
    for col in label_names:

        # Store metrics in a list
        report = classification_report(Y_test[col], pred[col])
        scores = report.split('accuracy')[1].split()
        metrics.append([float(scores[i]) for i in [0, 4, 5, 6, 10, 11, 12]])

        # Print classification report
        if print_reports:
            print('-' * 53)
            print(f'Label: {col}')
            print(report)
            
    # Convert the metrics list into a dataframe
    metric_names = ['accuracy', 'macro_avg_precision', 'macro_avg_recall', 'macro_avg_f1', 'weighted_avg_precision', 'weighted_avg_recall', 'weighted_avg_f1']
    return pd.DataFrame(metrics, columns=metric_names, index=label_names)

evaluate_model(pipeline, X_test, Y_test, Y_test.columns)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.83,0.85,0.48,0.5,0.82,0.83,0.81
request,0.9,0.86,0.75,0.79,0.89,0.9,0.89
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.78,0.78,0.76,0.77,0.78,0.78,0.78
medical_help,0.93,0.8,0.58,0.61,0.91,0.93,0.91
medical_products,0.96,0.85,0.59,0.63,0.95,0.96,0.94
search_and_rescue,0.97,0.91,0.53,0.55,0.97,0.97,0.96
security,0.98,0.49,0.5,0.5,0.97,0.98,0.98
military,0.97,0.85,0.56,0.6,0.97,0.97,0.96
water,0.96,0.88,0.76,0.81,0.96,0.96,0.96


### 6. Grid search to optimize parameters

In [9]:
# Parameter grid to search
parameters = {
    'tfidf__max_df': [0.01, 0.1, 0.2],
    'tfidf__max_features': [None, 1000, 10000],
    'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3)]
}

In [10]:
gs = GridSearchCV(pipeline, parameters, cv=4, n_jobs=12, verbose=2)
gs.fit(X_train, Y_train)

gs.best_params_

Fitting 4 folds for each of 27 candidates, totalling 108 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=12)]: Done 108 out of 108 | elapsed:  6.3min finished


{'tfidf__max_df': 0.2,
 'tfidf__max_features': 1000,
 'tfidf__ngram_range': (1, 1)}

##### Best params for logreg pipeline: {'tfidf__max_df': 0.2, 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}

### 7. Evaluate pipeline with best parameters

In [11]:
logreg = gs.best_estimator_
report_lr = evaluate_model(logreg, X_test, Y_test, Y_test.columns)
report_lr

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.83,0.73,0.5,0.52,0.82,0.83,0.82
request,0.9,0.86,0.75,0.79,0.89,0.9,0.89
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.77,0.77,0.75,0.76,0.77,0.77,0.77
medical_help,0.93,0.81,0.59,0.63,0.91,0.93,0.91
medical_products,0.95,0.82,0.59,0.63,0.94,0.95,0.94
search_and_rescue,0.97,0.83,0.55,0.58,0.97,0.97,0.96
security,0.98,0.99,0.5,0.5,0.98,0.98,0.98
military,0.97,0.82,0.57,0.61,0.96,0.97,0.96
water,0.96,0.86,0.75,0.79,0.95,0.96,0.95


In [12]:
report_lr.mean()

accuracy                  0.948000
macro_avg_precision       0.786571
macro_avg_recall          0.613143
macro_avg_f1              0.642000
weighted_avg_precision    0.939429
weighted_avg_recall       0.948000
weighted_avg_f1           0.938000
dtype: float64

### 8. Try a Naive Bayes classifier and random forest

In [13]:
# Naive Bayes
pipe_nb = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize)),
    ('clf', MultiOutputClassifier(MultinomialNB(alpha=0.01)))
])

In [14]:
# Grid search
params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.1, 0.2],
    'tfidf__max_features': [None, 100, 1000, 10000]
}

gs_nb = GridSearchCV(pipe_nb, params, cv=4, n_jobs=16, verbose=2)
gs_nb.fit(X_train, Y_train)

gs_nb.best_params_

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   13.9s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:   54.4s finished


{'tfidf__max_df': 0.1,
 'tfidf__max_features': 10000,
 'tfidf__ngram_range': (1, 2)}

In [15]:
# Evaluate the Naive Bayes classifier
multnb = gs_nb.best_estimator_
report_nb = evaluate_model(multnb, X_test, Y_test, Y_test.columns)
report_nb

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.82,0.74,0.58,0.63,0.81,0.82,0.81
request,0.89,0.82,0.79,0.8,0.89,0.89,0.89
offer,0.99,0.5,0.5,0.5,0.99,0.99,0.99
aid_related,0.76,0.75,0.75,0.75,0.76,0.76,0.76
medical_help,0.93,0.77,0.61,0.65,0.91,0.93,0.91
medical_products,0.96,0.78,0.65,0.69,0.95,0.96,0.95
search_and_rescue,0.97,0.78,0.54,0.56,0.96,0.97,0.96
security,0.98,0.49,0.5,0.5,0.97,0.98,0.98
military,0.97,0.77,0.67,0.71,0.97,0.97,0.97
water,0.95,0.82,0.72,0.76,0.94,0.95,0.95


In [16]:
# Calculate the metric means
report_nb.mean()

accuracy                  0.941714
macro_avg_precision       0.728571
macro_avg_recall          0.615429
macro_avg_f1              0.639143
weighted_avg_precision    0.933143
weighted_avg_recall       0.941714
weighted_avg_f1           0.934571
dtype: float64

##### Best params for nb pipeline: {'tfidf__max_df': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}

In [17]:
# Random forest
forest = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2), max_df=0.2)),
    ('clf', MultiOutputClassifier(
        RandomForestClassifier(n_estimators=300, min_samples_split=5, max_features=0.4, max_samples=0.7, 
                               random_state=0, verbose=2), n_jobs=18)
    )
])
forest.fit(X_train, Y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.2, ngram_range=(1, 2),
                                 tokenizer=<function tokenize at 0x7fac38266940>)),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(max_features=0.4,
                                                                        max_samples=0.7,
                                                                        min_samples_split=5,
                                                                        n_estimators=300,
                                                                        random_state=0,
                                                                        verbose=2),
                                       n_jobs=18))])

In [18]:
# Evaluate the random forest classifier
report_rf = evaluate_model(forest, X_test, Y_test, Y_test.columns)
report_rf

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.79,0.76,0.53,0.56,0.79,0.79,0.79
request,0.89,0.82,0.76,0.79,0.88,0.89,0.88
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.77,0.77,0.76,0.76,0.77,0.77,0.77
medical_help,0.93,0.78,0.64,0.68,0.92,0.93,0.92
medical_products,0.96,0.86,0.67,0.73,0.96,0.96,0.96
search_and_rescue,0.97,0.81,0.61,0.66,0.97,0.97,0.97
security,0.98,0.78,0.52,0.53,0.98,0.98,0.98
military,0.97,0.78,0.64,0.68,0.97,0.97,0.97
water,0.97,0.87,0.87,0.87,0.97,0.97,0.97


In [19]:
# Calculate the metric means
report_rf.mean()

accuracy                  0.948000
macro_avg_precision       0.778000
macro_avg_recall          0.664857
macro_avg_f1              0.690286
weighted_avg_precision    0.942000
weighted_avg_recall       0.948000
weighted_avg_f1           0.942857
dtype: float64

### 9. Save models as pickle files

In [20]:
joblib.dump(logreg, '../models/lr_model.pkl')

['../models/lr_model.pkl']

In [21]:
joblib.dump(multnb, '../models/nb_model.pkl')

['../models/nb_model.pkl']

In [22]:
joblib.dump(forest, '../models/rf_model.pkl')

['../models/rf_model.pkl']

### 10. Code for ML script

In [23]:
import sys
import joblib
import re
import pandas as pd
from sqlalchemy import create_engine

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# List of stopwords
stop = stopwords.words('english')


def load_data(database_filepath):

    engine = create_engine('sqlite:///' + database_filepath)
    df = pd.read_sql('SELECT * FROM messages', engine)
    X = df['message'].copy()
    Y = df.iloc[:, 4:].copy()
    categories = Y.columns.tolist()
    return X, Y, categories

    
def tokenize(text):

    # Steps 1 - 3
    tokens = word_tokenize(re.sub(r'[^A-Za-z0-9]', ' ', text.lower()))
    
    # Step 4 - 5
    stopwords_removed = [word.strip() for word in tokens if word.strip() not in stop]
    
    # Step 6
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in stopwords_removed]


def build_model():

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 1), max_df=0.2, max_features=1000)),
        ('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=0)))
    ])
    return pipeline


def evaluate_model(model, X_test, Y_test, category_names):

    pred = pd.DataFrame(model.predict(X_test), columns=category_names)

    metrics = []

    for col in category_names:
        report = classification_report(Y_test[col], pred[col])
        scores = report.split('accuracy')[1].split()
        metrics.append([float(scores[i]) for i in [0, 4, 5, 6, 10, 11, 12]])

    metric_names = ['accuracy', 'macro_avg_precision', 'macro_avg_recall', 'macro_avg_f1', 'weighted_avg_precision',
                    'weighted_avg_recall', 'weighted_avg_f1']
    metrics_df = pd.DataFrame(metrics, columns=metric_names, index=category_names)

    print(metrics_df)
    print(metrics_df.sum)
    return metrics_df
        

def save_model(model, model_filepath):

    joblib.dump(model, model_filepath)
    return


def main():

    if len(sys.argv) == 3:

        database_filepath, model_filepath = sys.argv[1:]

        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
        
        print('Building model...')
        model = build_model()
        
        print('Training model...')
        model.fit(X_train, Y_train)
        
        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:

        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/messages.db classifier.pkl')


# if __name__ == '__main__':
#     main()

[nltk_data] Downloading package stopwords to /home/tri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
