In [1]:
# import libraries
import os
import re
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

In [2]:
def load_data(database_filepath):
    sql_url = 'sqlite:///'+ database_filepath
    table = os.path.basename(database_filepath)
    engine = create_engine(sql_url)
    df = pd.read_sql_table(table, engine)
    
    X = df['message'].values
    y = df[df.columns[4:]].values
    
    #category_names = y.columns
    category_names = list(df.columns)[4:]
    
    return X, y, category_names

In [3]:
database_filepath = 'data/DisasterResponse.db'

X, y, category_names = load_data(database_filepath)

In [4]:
print(X.shape, y.shape)

(26216,) (26216, 35)


In [10]:
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [11]:
def build_model():
    '''
        build_model:
            - Build Natural Language Processing ML pipeline model
            - Processes and transforms text messages and applies a classifier

            In:
                - None

            Out: pipeline model
    '''
    pipeline = Pipeline([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', MultiOutputClassifier(RandomForestClassifier()))
            ])),
        ])
    
    return pipeline

In [12]:
# split data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# train classifier
model = build_model()
model.fit(X_train, y_train)

Pipeline(steps=[('text_pipeline',
                 Pipeline(steps=[('vect',
                                  CountVectorizer(tokenizer=<function tokenize at 0x7fef67b2f430>)),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf',
                                  MultiOutputClassifier(estimator=RandomForestClassifier()))]))])

In [18]:
def evaluate_model(model, X_test, y_test, category_names):
    '''
    
    '''
    # model accuracy
    y_pred = model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    overall_accuracy = (y_pred == y_test).mean().mean()
    print('Model accuracy by category:\n {}'.format(accuracy))
    print('\nOverall model accuracy: {}'.format(overall_accuracy))

    # classification report
    report = classification_report(y_test, y_pred, target_names=category_names, output_dict = True)
    df_class_rpt = pd.DataFrame(report).transpose()
    print(df_class_rpt)
    
    return

In [19]:
print('Evaluating model...')
evaluate_model(model, X_test, y_test, category_names)

Evaluating model...
Model accuracy by category:
 0.9446932548763213

Overall model accuracy: 0.9446932548763213
                        precision    recall  f1-score  support
related                  0.814196  0.973540  0.886767   4006.0
request                  0.879271  0.451991  0.597061    854.0
offer                    0.000000  0.000000  0.000000     22.0
aid_related              0.782028  0.595745  0.676293   2162.0
medical_help             0.600000  0.046053  0.085540    456.0
medical_products         0.850000  0.062044  0.115646    274.0
search_and_rescue        0.428571  0.024000  0.045455    125.0
security                 0.500000  0.011364  0.022222     88.0
military                 0.500000  0.029586  0.055866    169.0
water                    0.895833  0.269592  0.414458    319.0
food                     0.840741  0.411978  0.552984    551.0
shelter                  0.871795  0.227679  0.361062    448.0
clothing                 0.666667  0.024390  0.047059     82.0
money 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
