## ETL Pipeline

In [1]:
import sys
import pandas as pd
from sqlalchemy import create_engine

def load_merge_data(messages_filepath='data/disaster_messages.csv',
                    categories_filepath='data/disaster_categories.csv'):
    """Read the messages and categories CSV files and merge them on 'id'.

    Args:
        messages_filepath: Path of the messages CSV. Defaults to the path the
            notebook has always used.
        categories_filepath: Path of the categories CSV. Defaults to the path
            the notebook has always used.

    Returns:
        A DataFrame holding the outer merge of both files on the 'id' column,
        without the 'original' column of the messages file.
    """
    # The arguments are honoured as given; previously they were overwritten
    # by hard-coded paths, so callers could not point at other files.

    # Read the messages and discard the 'original' column (if present).
    df_mess = pd.read_csv(messages_filepath, encoding='latin-1')
    df_mess = df_mess.drop(columns=['original'], errors='ignore')

    # Read the categories.
    df_cat = pd.read_csv(categories_filepath, encoding='latin-1')

    # Outer merge keeps ids that appear in only one of the two files.
    df = df_mess.merge(df_cat, how='outer', on=['id'])

    return df

In [2]:
def clean_data(df):
    """Expand the 'categories' column into one column per category and
    remove duplicated rows.

    Each 'categories' value is expected to look like
    'name1-value1;name2-value2;...'. Column names are taken from the first
    row; every category, including the last one, gets its own column.
    The extracted values are kept as strings.

    Args:
        df: DataFrame with at least the 'id', 'message' and 'categories'
            columns.

    Returns:
        A new DataFrame without the 'categories' column, with one column per
        category appended and exact duplicate rows removed.
    """
    # Rows without a category string cannot be parsed, so drop them.
    # (The result has to be assigned: a bare df.dropna() is a no-op.)
    df = df.dropna(subset=['categories'])

    # Nothing to expand on an empty frame.
    if df.empty:
        return df.drop(columns=['categories'])

    # Category names come from the first row (positional, so it still works
    # when index label 0 is gone).
    first_row = df['categories'].iloc[0]
    col_names = [segment.split('-')[0] for segment in first_row.split(';')]

    # Split every row into its 'name-value' segments in one pass, then keep
    # only the part after the first '-' of each segment.
    segments = df['categories'].str.split(';', expand=True).iloc[:, :len(col_names)]
    values = segments.apply(lambda col: col.str.split('-', n=1).str[-1])
    values.columns = col_names

    # Replace the raw 'categories' column by the expanded columns.
    df = pd.concat([df.drop(columns=['categories']), values], axis=1)

    # Report and remove duplicates.
    duplicated = df.duplicated()
    print('Number of rows: {}, and number of duplicates: {}'.format(df.shape[0], duplicated.sum()))
    print('The following rows are duplications:')
    print(df.loc[duplicated, ['id', 'message']])
    df = df.drop_duplicates()
    print('Number of rows of new dataframe excluding duplicates: {}'.format(df.shape[0]))

    return df

In [3]:
def main():
    """Run the ETL steps: load and merge the CSVs, clean them, store in SQLite."""
    # Source files for the pipeline
    messages_csv = 'data/disaster_messages.csv'
    categories_csv = 'data/disaster_categories.csv'

    # Load + merge, then clean and de-duplicate
    merged = load_merge_data(messages_csv, categories_csv)
    cleaned = clean_data(merged)

    # Persist the cleaned table, overwriting any previous version
    db_engine = create_engine('sqlite:///Disaster_response_pipelines.db')
    cleaned.to_sql(
        'Disaster_response_pipelines',
        db_engine,
        index=False,
        if_exists='replace',
    )
    
# Run the ETL pipeline defined in this cell.
main()

Number of columns: 26386, and number of duplicates: 172
The following rows are duplications:
          id                                            message
164      202  ?? port au prince ?? and food. they need gover...
165      202  ?? port au prince ?? and food. they need gover...
658      804  elle est vraiment malade et a besoin d'aide. u...
659      804  elle est vraiment malade et a besoin d'aide. u...
660      804  elle est vraiment malade et a besoin d'aide. u...
...      ...                                                ...
25291  29022  In a field in Jallouzai, just inside Pakistan,...
25292  29022  In a field in Jallouzai, just inside Pakistan,...
25378  29119  Most victims (90 per cent) show little or no s...
25379  29119  Most victims (90 per cent) show little or no s...
25380  29119  Most victims (90 per cent) show little or no s...

[172 rows x 2 columns]
Number of columns of new dataframe excluding duplicates: 26214


## ML Pipeline

In [2]:
# Import Libraries
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import nltk
# 'stopwords' is required by tokenize(), which calls stopwords.words("english").
nltk.download(['punkt', 'wordnet', 'stopwords'])
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rodrigo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rodrigo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [83]:
def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def load_data(database_url='sqlite:///Disaster_response_pipelines.db',
              table_name='Disaster_response_pipelines'):
    """Load the cleaned messages table and split it into features and targets.

    Args:
        database_url: SQLAlchemy URL of the database to read. Defaults to the
            database this notebook has always read.
        table_name: Name of the table to read. Defaults to the table this
            notebook has always read.

    Returns:
        A tuple (X, y) where X is the 'message' column and y is a DataFrame
        with every column from position 4 onwards, converted to numeric
        where possible.
    """
    engine = create_engine(database_url)
    try:
        df = pd.read_sql_table(table_name, con=engine)
    finally:
        # Release the connection pool once the table is in memory.
        engine.dispose()

    # Drop rows whose 'related' value is 2. Comparing numerically makes the
    # filter work whether the column was stored as text ('2') or as integers.
    related = pd.to_numeric(df['related'], errors='coerce')
    df = df[related != 2]

    X = df['message']
    # NOTE(review): the slice starts at position 4, so with the layout
    # id/message/genre/related/... the 'related' column is not a target —
    # confirm this is intended.
    # Same effect as to_numeric(errors='ignore'), which is deprecated.
    y = df[df.columns[4:]].apply(_to_numeric_if_possible)
    return X, y

In [9]:
# Ad-hoc inspection cell: opens data/DisasterResponse.db and reads a table into `df`.
# NOTE(review): the table name passed below is literally 'data/DisasterResponse.db', and
# this database is not the one written by the ETL section ('Disaster_response_pipelines.db')
# — confirm this is the intended source.
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql_table('data/DisasterResponse.db', con=engine)
# Keep only the rows whose 'related' value is not the string '2'.
df = df[df['related'] != '2']

In [10]:
# Display the column labels of the frame loaded in the previous cell.
df.columns

Index(['id', 'message', 'genre', 'related', 'request', 'offer', 'aid_related',
       'medical_help', 'medical_products', 'search_and_rescue', 'security',
       'military', 'child_alone', 'water', 'food', 'shelter', 'clothing',
       'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather'],
      dtype='object')

In [11]:
def tokenize(text):
    """Turn a message into a list of lemmatized, stop-word-free tokens.

    Args:
        text: The raw message string.

    Returns:
        A list of lower-case tokens with non-alphanumeric characters and
        English stop words removed, each passed through WordNetLemmatizer.
    """
    # Normalize case and replace every non-alphanumeric character by a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stop words. The stop-word list is loaded once per call and held
    # in a set; it used to be re-read from the corpus for every token.
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in tokens]

#X, y, cat_names = load_data()
#for message in X[:5]:
#    tokens = tokenize(message)
#    print(message)
#    print(tokens, '\n')

In [98]:
def build_model():
    """Assemble the text-classification pipeline wrapped in a grid search.

    The pipeline chains a CountVectorizer (using `tokenize`), a
    TfidfTransformer and a multi-output random forest. The grid covers two
    n-gram ranges and both `use_idf` settings with 10 trees, evaluated with
    2-fold cross-validation.

    Returns:
        An unfitted GridSearchCV object.
    """
    # One random forest per output column
    forest = MultiOutputClassifier(RandomForestClassifier())

    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', forest),
    ]

    # Hyper-parameters explored by the search (2 x 2 x 1 = 4 candidates)
    search_space = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [10],
    }

    return GridSearchCV(Pipeline(steps), param_grid=search_space, cv=2, verbose=3)

In [99]:
def save_model(model, model_filepath):
    """Pickle `model` to `model_filepath`.

    Args:
        model: The object to serialize.
        model_filepath: Destination path; an existing file is overwritten.
    """
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)

In [100]:
def main():
    """Train the grid-searched pipeline, save it and print a per-label report."""
    messages, labels = load_data()

    # Hold out a quarter of the data for evaluation (fixed seed for the split)
    split = train_test_split(messages, labels, test_size=0.25, random_state=42)
    train_messages, test_messages, train_labels, test_labels = split

    # Build and fit the grid search
    grid_search = build_model()
    grid_search.fit(train_messages, train_labels)

    # Predictions for the held-out messages
    predictions = grid_search.predict(test_messages)

    # Persist the fitted search object
    save_model(grid_search, model_filepath='test.sav')

    # Per-label precision / recall / f1
    report = classification_report(test_labels, predictions, target_names=labels.columns)
    print(report)

# Run the ML pipeline defined in this cell (training output follows).
main()

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 1), score=0.392, total= 2.9min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min remaining:    0.0s


[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 1), score=0.385, total= 2.9min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.9min remaining:    0.0s


[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.395, total= 3.3min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) 
[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.403, total= 3.2min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 1), score=0.397, total= 3.0min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 1) 
[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 1), score=0.398, total= 2.9min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV]  clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 2), score=0.403, total= 3.2min
[CV] clf__estimator__n_estimators=10, tfidf__use_idf=False, vect__ngram_range=(1, 2) 
[CV

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 24.7min finished


                        precision    recall  f1-score   support

               request       0.80      0.44      0.57      1120
                 offer       0.00      0.00      0.00        27
           aid_related       0.76      0.55      0.64      2682
          medical_help       0.58      0.05      0.09       510
      medical_products       0.88      0.16      0.27       328
     search_and_rescue       0.69      0.05      0.10       169
              security       0.33      0.01      0.01       131
              military       0.60      0.03      0.06       205
           child_alone       0.00      0.00      0.00         0
                 water       0.87      0.27      0.41       401
                  food       0.81      0.54      0.65       746
               shelter       0.81      0.23      0.36       570
              clothing       0.82      0.08      0.15       106
                 money       1.00      0.03      0.06       149
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
