In [20]:
import re
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import pickle

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:

def load_data(database_filepath):
    """ Load dataset from the ETL processed databaseself.Separate for predictors
        variables (X) and target variables (Y). Also, get targets' names list.

    Args:
        database_filepath (str): the saved database's path from ETL processself.

    Return:
        X: predictors subset of ETL processed dataset
        Y: targets subset of ETL processed dataset
        category_names: the list of target variable names.
    """
    # load data from database
    engine = create_engine('sqlite:///{}'.format(database_filepath))
    df = pd.read_sql_table('etl_processed_data', engine)

    # split training and testing dataset
    X = df.ix[:,1:2].values[:,0]
    Y = df.ix[:,4:].values

    # get target variable names
    category_names = df.ix[:,4:].columns.tolist()

    return X, Y, category_names

In [3]:
X, Y, category_names = load_data('data/DisasterResponse.db')

In [4]:
X

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name', ...,
       "Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families.",
       'Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
       'A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.'],
      dtype=object)

In [5]:
Y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [6]:
category_names

['related',
 'request',
 'offer',
 'aid_related',
 'medical_help',
 'medical_products',
 'search_and_rescue',
 'security',
 'military',
 'child_alone',
 'water',
 'food',
 'shelter',
 'clothing',
 'money',
 'missing_people',
 'refugees',
 'death',
 'other_aid',
 'infrastructure_related',
 'transport',
 'buildings',
 'electricity',
 'tools',
 'hospitals',
 'shops',
 'aid_centers',
 'other_infrastructure',
 'weather_related',
 'floods',
 'storm',
 'fire',
 'earthquake',
 'cold',
 'other_weather',
 'direct_report']

In [7]:

def tokenize(text):
    """Nomalize, tokenize and lemmatize process for text

    Args:
        text (str)

    Return:
        text_lems: list of element of the text after normalization, tokenization
        stopwords removal and lemmetization
    """
    # Normailization -- lower case + remove puntuation
    text = re.sub(r"[^a-zA-Z0-9]"," ",text.lower())

    # tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    words = [word for word in tokens if word not in stopwords.words("english")]

    # stemmization
    text_lems = [WordNetLemmatizer().lemmatize(lem).strip() for lem in words]

    return text_lems


In [8]:
def build_model():
    """ The model builing process to integrate all the necessary steps of model
        training, which include data loading, transformation, model training,
        parameter grid search, model evaluation and save the trained model.
    """

    # feature preprocessing pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

    # grid search
    parameters = {
        #'clf__estimator__n_estimators': [20, 50],
        #'clf__estimator__max_depth': [3, 6]
        'clf__estimator__min_samples_split': [3]
        #'clf__estimator__loss': ['log', 'hinge']
        #'clf__estimator__penalty': ['l2']
        #'clf__estimator__alpha': [0.001, 0.0001]
    }

    cv = GridSearchCV(pipeline, param_grid= parameters)

    return cv


In [9]:
model = build_model()

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [11]:
model.fit(X_train,Y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=None))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__estimator__min_samples_split': [3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:

def evaluate_model(model, X_test, Y_test, category_names):
    """ Evaluation trained model via comparing true Y_test values and predicted
        test values

    Args:
        model: the trained model
        X_test: subset dataframe of selected test predictors
        Y_test: subset dataframe of selected test tragets
        category_names: the list of traget variables name list

    Return:
        None. Classification report which includes precison, recall, f1
        score for each target variable, and averaged f1 score of all target
        variables  will be printed out while calling the function.
    """
    # prediction on test dataset
    Y_pred = model.predict(X_test)

    # initialize aggregated avg score
    weighted_fscore = 0

    for i in np.arange(0,36,1):
        print("Target:{}".format(category_names[i]))
        print("\n")
        print(classification_report(Y_test[i],Y_pred[i]))
        print('\n')
        print('\n')
        weighted_fscore += f1_score(Y_test[i],Y_pred[i], \
        average='weighted')

    print("Overall average f1 score of all categories are: {}".\
    format(weighted_fscore/(i+1)))


In [13]:
evaluate_model(model, X_test, Y_test, category_names)

Target:related


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36

   micro avg       1.00      1.00      1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36





Target:request


              precision    recall  f1-score   support

           0       0.90      1.00      0.95        28
           1       1.00      0.62      0.77         8

   micro avg       0.92      0.92      0.92        36
   macro avg       0.95      0.81      0.86        36
weighted avg       0.92      0.92      0.91        36





Target:offer


              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.67      0.80         3

   micro avg       0.97      0.97      0.97        36
   macro avg       0.99      0.83      0.89        36
weighted avg       0.97      0.97      0.97        36





Target:aid_r

In [21]:
model2 = joblib.load("model/RF_CV.pkl")

In [26]:
model.predict(['slightly injured, need help'])

array([[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [17]:
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('etl_processed_data', engine)


query = 'slightly injured, need help'

classification_labels = model.predict([query])[0]
classification_results = dict(zip(df.columns[4:], classification_labels))

In [18]:
classification_results

{'related': 1,
 'request': 1,
 'offer': 0,
 'aid_related': 1,
 'medical_help': 1,
 'medical_products': 0,
 'search_and_rescue': 0,
 'security': 0,
 'military': 0,
 'child_alone': 0,
 'water': 0,
 'food': 0,
 'shelter': 0,
 'clothing': 0,
 'money': 0,
 'missing_people': 0,
 'refugees': 0,
 'death': 0,
 'other_aid': 0,
 'infrastructure_related': 0,
 'transport': 0,
 'buildings': 0,
 'electricity': 0,
 'tools': 0,
 'hospitals': 0,
 'shops': 0,
 'aid_centers': 0,
 'other_infrastructure': 0,
 'weather_related': 0,
 'floods': 0,
 'storm': 0,
 'fire': 0,
 'earthquake': 0,
 'cold': 0,
 'other_weather': 0,
 'direct_report': 1}

In [28]:
import nltk
import sklearn

import sqlalchemy
print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The sqlalchemy version is {}.'.format(sqlalchemy.__version__))

The nltk version is 3.4.
The scikit-learn version is 0.20.1.
The sqlalchemy version is 1.2.15.
