# 1. Install libraries

In [None]:
!pip install nltk
!pip install plotly

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# 2. Import libraries

In [1]:
import pandas as pd
import numpy as np

import re
from sqlalchemy import create_engine

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.base import BaseEstimator,TransformerMixin

import pickle

# 3. Import datasets

In [2]:
# Import datasets

filepath = "../data/DisasterResponse.db"
engine = create_engine('sqlite:///' + filepath)
df = pd.read_sql_table('DisasterResponse', engine)


In [3]:
pd.set_option('display.max_columns', 100)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 4. Clean dataframe

In [4]:
# Describe

df.describe()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
count,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0,26216.0
mean,15224.82133,0.77365,0.170659,0.004501,0.414251,0.079493,0.050084,0.027617,0.017966,0.032804,0.0,0.063778,0.111497,0.088267,0.015449,0.023039,0.011367,0.033377,0.045545,0.131446,0.065037,0.045812,0.050847,0.020293,0.006065,0.010795,0.004577,0.011787,0.043904,0.278341,0.082202,0.093187,0.010757,0.093645,0.020217,0.052487,0.193584
std,8826.88914,0.435276,0.376218,0.06694,0.492602,0.270513,0.218122,0.163875,0.132831,0.178128,0.0,0.244361,0.314752,0.283688,0.123331,0.150031,0.106011,0.179621,0.2085,0.337894,0.246595,0.209081,0.219689,0.141003,0.077643,0.103338,0.067502,0.107927,0.204887,0.448191,0.274677,0.2907,0.103158,0.29134,0.140743,0.223011,0.395114
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7446.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15662.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22924.25,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30265.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Drop 'child_alone' column

df.drop('child_alone', axis=1, inplace=True)

In [6]:
# Max value of the 'related' column is 2, could be an error. 

print('Number or rows with value of 2: ', df[df.related == 2].shape[0])

# Convert to 1

df['related'] = df['related'].apply(lambda x: 1 if x==2 else x)

Number or rows with value of 2:  188


# 5. Write a tokenize function to process text data

In [7]:
def tokenize(text):
    """
    Tokenize the text
    
    - Input: text message
    - Output: tokens from the text message
    """
    
    # Create a regex of url strings:
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # Find all occurences of url strings:
    detected_urls = re.findall(url_regex, text)
    
    # Replace all url strings with 'urlplaceholder'
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')
        
    # Extract the token
    tokens = word_tokenize(text)
    
    # Lemmatize
    lemmatize = WordNetLemmatizer()
    
    # Return clean tokens:
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatize.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
        
    return clean_tokens

In [8]:
# Build StartingVerbExtractor transformer to extract the starting verb
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    
    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

# 6. Build a machine learning pipeline

In [9]:
# Build a pipeline consisting of CountVectorizer, TfidfTransformer, StartingVerbExtractor as tranformers 
# and RandomForestClassifier as estimator

pipeline = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('starting_verb', StartingVerbExtractor())
    ])),

    ('clf', RandomForestClassifier())
])

# 7. Train pipeline

In [10]:
# Extract X and y

X = df['message']
y = df.loc[:,'related':]

In [11]:
# Split into train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# Fit model

pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenize at 0x0000021BA41F1820>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('starting_verb',
                                                 StartingVerbExtractor())])),
                ('clf', RandomForestClassifier())])

# 8. Test the model

In [13]:
# Predict y_test
y_pred = pipeline.predict(X_test)

# Print classification report
print(classification_report(y_test.values, y_pred, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.83      0.96      0.89      5057
               request       0.90      0.42      0.57      1094
                 offer       0.00      0.00      0.00        28
           aid_related       0.84      0.48      0.61      2752
          medical_help       0.50      0.01      0.01       537
      medical_products       0.56      0.02      0.03       317
     search_and_rescue       0.00      0.00      0.00       195
              security       1.00      0.01      0.02       117
              military       0.00      0.00      0.00       239
                 water       0.93      0.15      0.26       426
                  food       0.92      0.26      0.40       751
               shelter       0.93      0.07      0.13       595
              clothing       0.67      0.02      0.04        98
                 money       1.00      0.01      0.01       144
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 9. Improve your model

In [14]:
# Try different parameters with the estimator

parameters = {
    'clf__n_estimators': [50,100],
    'clf__min_samples_split': [2,3]
}

cv = GridSearchCV(pipeline, param_grid = parameters)
cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('vect',
                                                                                         CountVectorizer(tokenizer=<function tokenize at 0x0000021BA41F1820>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())])),
                                                                       ('starting_verb',
                                                                        StartingVerbExtractor())])),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__min_samples_split': [2, 3],
                         'clf__n_estimators': [50, 100]})

# 10. Export your model as a pickle file

In [None]:
pickle.dump(cv, 'model.pkl')

NameError: name 'pickle' is not defined