## Disaster-Response-Pipeline

### ETL Pipeline

In [68]:
# Load required libraries
import pickle
import re
import sqlite3 as db
import sys

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
# use nltk.download('stopwords') and nltk.download('wordnet') to download the relevant files
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Read the two raw CSV inputs and join them on their shared `id` column
# RUN: e.g. python data/process_data.py data/disaster_messages.csv data/disaster_categories.csv data/DisasterResponse.db

messages_filepath = "./data/disaster_messages.csv"
categories_filepath = "./data/disaster_categories.csv"

messages, categories = (
    pd.read_csv(csv_path)
    for csv_path in (messages_filepath, categories_filepath)
)

# Default (inner) join: only ids present in both files are kept
df = pd.merge(messages, categories, on='id')

In [3]:
# (rows, columns) of the merged messages/categories frame
df.shape

(26386, 5)

In [4]:
# Preview the first rows of the merged frame
df.head()

Unnamed: 0,id,message,original,genre,categories
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,related-1;request-0;offer-0;aid_related-0;medi...
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,related-1;request-0;offer-0;aid_related-1;medi...
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,related-1;request-0;offer-0;aid_related-0;medi...
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,related-1;request-1;offer-0;aid_related-1;medi...
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,related-1;request-0;offer-0;aid_related-0;medi...


In [5]:
# Expand the ';'-delimited category string into one column per category
raw_category_strings = df['categories']
categories = raw_category_strings.str.split(pat=';', expand=True)
categories.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


In [6]:
# Select the first row of the categories; every cell looks like "<name>-<value>"
firstrow = categories.iloc[0, :]

# Extract new column names for categories: keep everything before the LAST '-'.
# Splitting on the separator (instead of chopping a fixed two characters) stays
# correct even if a value ever has more than one digit.
category_colnames = firstrow.str.rsplit('-', n=1).str[0]

In [7]:
# Rename the columns: replace the positional labels with the
# category names extracted from the first row
categories.columns = category_colnames
categories.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
1,related-1,request-0,offer-0,aid_related-1,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-1,floods-0,storm-1,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
2,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
3,related-1,request-1,offer-0,aid_related-1,medical_help-0,medical_products-1,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0
4,related-1,request-0,offer-0,aid_related-0,medical_help-0,medical_products-0,search_and_rescue-0,security-0,military-0,child_alone-0,...,aid_centers-0,other_infrastructure-0,weather_related-0,floods-0,storm-0,fire-0,earthquake-0,cold-0,other_weather-0,direct_report-0


In [8]:
# Convert every "<name>-<value>" cell to its numeric value.
# The value is taken as everything after the LAST '-', so multi-digit values
# are parsed in full rather than being truncated to their final character.
for column in categories:
    categories[column] = pd.to_numeric(
        categories[column].str.rsplit('-', n=1).str[-1]
    )

categories.head(5)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Remove the raw, unparsed `categories` string column from the merged frame
df.drop(columns=['categories'], inplace=True)
df.head()

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [10]:
# Append the numeric category columns to the message columns, side by side
df = pd.concat(objs=[df, categories], axis='columns')
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Data-quality report: number of fully duplicated rows, then missing values per column
duplicate_row_count = df.duplicated().sum()
missing_per_column = df.isnull().sum()
print(duplicate_row_count)
print(missing_per_column)

170
id                            0
message                       0
original                  16140
genre                         0
related                       0
request                       0
offer                         0
aid_related                   0
medical_help                  0
medical_products              0
search_and_rescue             0
security                      0
military                      0
child_alone                   0
water                         0
food                          0
shelter                       0
clothing                      0
money                         0
missing_people                0
refugees                      0
death                         0
other_aid                     0
infrastructure_related        0
transport                     0
buildings                     0
electricity                   0
tools                         0
hospitals                     0
shops                         0
aid_centers                   0
othe

In [14]:
# Remove fully duplicated rows, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

### TRAIN CLASSIFIER

In [None]:
# RUN: e.g. python models/train_classifier.py data/DisasterResponse.db models/classifier.pkl

In [82]:
# (rows, columns) of the cleaned frame used as model input
df.shape

(26216, 40)

In [28]:
# Generate X, Y
# X: the raw message text; Y: every column from position 4 onwards (the label columns).
X = df['message']
# Take an explicit copy so later edits to Y never write through to (or warn about)
# a slice of df. If you load the data with sqlite please use Y = df.iloc[:, 5:].copy()
Y = df.iloc[:, 4:].copy()

In [23]:
# mapping extra values to `1`: `related` contains some 2s, which are folded into 1
# so the label becomes binary. `assign` builds a new frame instead of writing into
# a slice of df, which avoids pandas' SettingWithCopyWarning.
Y = Y.assign(related=Y['related'].replace(2, 1))
category_names = Y.columns

In [81]:
# Number of messages available for modelling
X.shape

(26216,)

In [21]:
# Display the label (category) column names taken from Y
category_names

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [25]:
# Split data into train and test sets (80 % / 20 %).
# A fixed random_state makes the split, and every metric reported below,
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [29]:
# Function -> tokenizes text data

# Lazily-filled cache so the stopword corpus is read, and the lemmatizer built,
# only once instead of once per token.
_TOKENIZE_RESOURCES = {}


def _get_tokenize_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TOKENIZE_RESOURCES:
        # A frozenset gives O(1) membership tests (the corpus call returns a list)
        _TOKENIZE_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TOKENIZE_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TOKENIZE_RESOURCES["stop_words"], _TOKENIZE_RESOURCES["lemmatizer"]


def tokenize(text):
    """Normalize, tokenize, stopword-filter and lemmatize a message.

    Args:
        text (str): raw message text.

    Returns:
        list of str: lemmatized tokens with English stopwords removed, in
        their original order.
    """
    stop_words, lemmatizer = _get_tokenize_resources()

    # Normalize text: lower-case and replace every non-alphanumeric character with a space
    normalized = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize text
    words = word_tokenize(normalized)

    # Remove stopwords first, then lemmatize the surviving tokens
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

In [42]:
# Model pipeline: bag-of-words counts -> TF-IDF weighting -> multi-label linear SVM
pipeline_steps = [
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC()))),
]
pipeline = Pipeline(steps=pipeline_steps)

In [44]:
# Hyper-parameter grid searched over the vectorizer step (2 x 2 = 4 candidates)
parameter = dict(
    vect__ngram_range=((1, 1), (1, 2)),
    vect__max_df=(0.75, 1.0),
)

In [48]:
# Create model: exhaustive grid search over `parameter` with 3-fold cross-validation
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    verbose=3,
)

In [56]:
# Training: run the grid search on the training split
model.fit(X=X_train, y=Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.282, total= 2.2min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s
  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.277, total= 2.4min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.6min remaining:    0.0s
  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.276, total= 2.4min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.289, total= 2.5min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.280, total= 2.6min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................


  str(classes[c]))


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.281, total= 2.6min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.282, total= 2.3min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.277, total= 2.2min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.276, total= 2.3min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.289, total= 2.3min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.280, total= 2.2min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 2) ......................


  str(classes[c]))


[CV]  vect__max_df=1.0, vect__ngram_range=(1, 2), score=0.281, total= 2.5min


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 28.6min finished
  str(classes[c]))


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [57]:
# Evaluate the fitted grid-search model on the held-out test split

# Predict the label matrix for every test message
prediction = model.predict(X_test)

In [63]:
# Accuracy of the model: share of individual (message, category) labels predicted
# correctly. Uses `prediction`, the array produced by model.predict(X_test);
# the previously referenced `y_pred` is never defined in this notebook.
np.mean(Y_test.values == prediction)


Accuracy: 0.9494766505636071


In [65]:
# print classification report (per-category precision / recall / f1 / support).
# Uses `prediction`, the array produced by model.predict(X_test);
# the previously referenced `y_pred` is never defined in this notebook.
print(classification_report(Y_test.values, prediction, target_names=category_names))


                        precision    recall  f1-score   support

               related       0.86      0.94      0.90      4036
               request       0.75      0.66      0.70       907
                 offer       0.00      0.00      0.00        27
           aid_related       0.71      0.78      0.74      2210
          medical_help       0.60      0.38      0.46       408
      medical_products       0.62      0.31      0.42       274
     search_and_rescue       0.68      0.18      0.28       141
              security       0.50      0.03      0.05       120
              military       0.55      0.30      0.39       184
           child_alone       0.00      0.00      0.00         0
                 water       0.72      0.75      0.73       327
                  food       0.80      0.78      0.79       616
               shelter       0.73      0.63      0.67       475
              clothing       0.70      0.42      0.52        88
                 money       0.57      

In [69]:
# save model to pickle file
# The context manager guarantees the file is flushed and closed even if pickling fails.
with open("models/classifier.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# TESTING

In [75]:
# Smoke test: classify one hand-written message and list the categories flagged as 1
msg = ['All houses are on fire']
test_output = model.predict(msg)
flagged = test_output.flatten() == 1
print(Y_train.columns.values[flagged])

['related' 'aid_related' 'shelter' 'buildings' 'weather_related' 'fire']
