In [24]:
import sys
# import libraries
import re

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

from sqlalchemy import create_engine

import pickle


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alirezamirsadraee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alirezamirsadraee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
def load_data(database_filepath):
    engine = create_engine('sqlite:///' + database_filepath)
    df = pd.read_sql_table('ETL_Pipeline', engine)
    X = df['message']
    Y = df[df.columns[4:]]
    category_names = Y.columns
    return X, Y, category_names

def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def build_model():
    pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
    
    parameters = {
    'clf__estimator__n_estimators': [2],
    'clf__estimator__min_samples_split': [2],
    }

    # create grid search object
    model = GridSearchCV(pipeline, param_grid=parameters,verbose = 2, cv = 3)
    
    return model

def evaluate_model(model, X_test, Y_test, category_names):
    Y_pred = model.predict(X_test)
    
    for i in range(36):
        print(classification_report(Y_test.iloc[:,i], Y_pred[:,i])) #, target_names=category_names))


def save_model(model, model_filepath):
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)



In [26]:

# def main():
# if len(sys.argv) == 3:
# database_filepath, model_filepath = sys.argv[1:]
database_filepath = '../data/ETL_Pipeline.db'
model_filepath = 'ML_Classifier.pkl'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))


Loading data...
    DATABASE: ../data/ETL_Pipeline.db


In [27]:
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print('Reading ' + database_filepath)
print('Reading ' + model_filepath)
print(category_names  )


Reading ../data/ETL_Pipeline.db
Reading ML_Classifier.pkl
Index(['('related',)', '('request',)', '('offer',)', '('aid_related',)',
       '('medical_help',)', '('medical_products',)', '('search_and_rescue',)',
       '('security',)', '('military',)', '('child_alone',)', '('water',)',
       '('food',)', '('shelter',)', '('clothing',)', '('money',)',
       '('missing_people',)', '('refugees',)', '('death',)', '('other_aid',)',
       '('infrastructure_related',)', '('transport',)', '('buildings',)',
       '('electricity',)', '('tools',)', '('hospitals',)', '('shops',)',
       '('aid_centers',)', '('other_infrastructure',)', '('weather_related',)',
       '('floods',)', '('storm',)', '('fire',)', '('earthquake',)',
       '('cold',)', '('other_weather',)', '('direct_report',)'],
      dtype='object')


In [28]:
print('Building model...')
model = build_model()
print(model)


Building model...
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x29013a050>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__min_samples_split': [2],
                         'clf__estimator__n_estimators': [2]},
             verbose=2)


In [29]:
print('Training model...')
model.fit(X_train, Y_train)


Training model...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=2; total time=   8.6s
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=2; total time=   8.5s
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=2; total time=   8.5s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x29013a050>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf__estimator__min_samples_split': [2],
                         'clf__estimator__n_estimators': [2]},
             verbose=2)

In [30]:
model.best_score_

0.19116039035678203

In [31]:
model.best_params_
best_model = model.best_estimator_
print(best_model)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x29013a050>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=2)))])


In [32]:
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)


Evaluating model...
              precision    recall  f1-score   support

           0       0.40      0.60      0.48      1175
           1       0.86      0.74      0.80      4038
           2       0.80      0.13      0.22        31

    accuracy                           0.71      5244
   macro avg       0.69      0.49      0.50      5244
weighted avg       0.76      0.71      0.72      5244

              precision    recall  f1-score   support

           0       0.86      0.98      0.91      4330
           1       0.70      0.23      0.35       914

    accuracy                           0.85      5244
   macro avg       0.78      0.61      0.63      5244
weighted avg       0.83      0.85      0.82      5244

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5219
           1       0.00      0.00      0.00        25

    accuracy                           0.99      5244
   macro avg       0.50      0.50      0.50      5244


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

    # else:
    #     print('Please provide the filepath of the disaster messages database '\
    #           'as the first argument and the filepath of the pickle file to '\
    #           'save the model to as the second argument. \n\nExample: python '\
    #           'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


# if __name__ == '__main__':

#     main()


Saving model...
    MODEL: ML_Classifier.pkl
Trained model saved!


In [34]:
Y_pred = model.predict(X_test)
evaluate_model(model, X_test, Y_test, category_names)


              precision    recall  f1-score   support

           0       0.40      0.60      0.48      1175
           1       0.86      0.74      0.80      4038
           2       0.80      0.13      0.22        31

    accuracy                           0.71      5244
   macro avg       0.69      0.49      0.50      5244
weighted avg       0.76      0.71      0.72      5244

              precision    recall  f1-score   support

           0       0.86      0.98      0.91      4330
           1       0.70      0.23      0.35       914

    accuracy                           0.85      5244
   macro avg       0.78      0.61      0.63      5244
weighted avg       0.83      0.85      0.82      5244

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5219
           1       0.00      0.00      0.00        25

    accuracy                           0.99      5244
   macro avg       0.50      0.50      0.50      5244
weighted avg       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
category_names

Index(['('related',)', '('request',)', '('offer',)', '('aid_related',)',
       '('medical_help',)', '('medical_products',)', '('search_and_rescue',)',
       '('security',)', '('military',)', '('child_alone',)', '('water',)',
       '('food',)', '('shelter',)', '('clothing',)', '('money',)',
       '('missing_people',)', '('refugees',)', '('death',)', '('other_aid',)',
       '('infrastructure_related',)', '('transport',)', '('buildings',)',
       '('electricity',)', '('tools',)', '('hospitals',)', '('shops',)',
       '('aid_centers',)', '('other_infrastructure',)', '('weather_related',)',
       '('floods',)', '('storm',)', '('fire',)', '('earthquake',)',
       '('cold',)', '('other_weather',)', '('direct_report',)'],
      dtype='object')