In [1]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")

In [31]:
import re
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/pavan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pavan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pavan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import joblib

In [20]:
engine = create_engine('sqlite:///DisResp.db')
df = pd.read_sql_table('DisasterResponse', engine)
X = df['message']
Y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
cat_names = list(df.columns[4:])

In [9]:
def tokenize(text):
    url_re = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_re, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]
    STOPWORDS = list(set(stopwords.words('english')))
    clean_tokens = [token for token in clean_tokens if token not in STOPWORDS]    
    return clean_tokens

In [10]:
def build_pipeline():
    pipeline = Pipeline([
        ('vec', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators = 100, n_jobs = 6)))
    ])    
    return pipeline

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)
pipeline = build_pipeline()
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vec',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fd9f4d97160>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=6)))])

In [12]:
def build_report(pipeline, X_test, y_test):
    y_pred = pipeline.predict(X_test)
    performances = []
    for i in range(len(y_test.columns)):
        performances.append([f1_score(y_test.iloc[:, i].values, y_pred[:, i], average='micro'),
                             precision_score(y_test.iloc[:, i].values, y_pred[:, i], average='micro'),
                             recall_score(y_test.iloc[:, i].values, y_pred[:, i], average='micro')])
    performances = pd.DataFrame(performances, columns=['f1 score', 'precision', 'recall'],
                                index = y_test.columns)   
    return performances

In [42]:
X_test.shape

(6554,)

In [13]:
build_report(pipeline, X_test, y_test)

Unnamed: 0,f1 score,precision,recall
related,0.812786,0.812786,0.812786
request,0.893958,0.893958,0.893958
offer,0.996643,0.996643,0.996643
aid_related,0.77632,0.77632,0.77632
medical_help,0.916692,0.916692,0.916692
medical_products,0.950107,0.950107,0.950107
search_and_rescue,0.974672,0.974672,0.974672
security,0.984742,0.984742,0.984742
military,0.968264,0.968264,0.968264
child_alone,1.0,1.0,1.0


In [50]:
pickle.dump(pipeline, open('randomforest.pkl', 'wb'))

In [44]:
parameters = {'clf__estimator__max_features':['sqrt', 0.5],
              'clf__estimator__n_estimators':[80, 100]}

cv = GridSearchCV(estimator=pipeline, param_grid = parameters)

In [45]:
cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vec',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd9f4d97160>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=6)))]),
             param_grid={'clf__estimator__max_features': ['sqrt', 0.5],
                         'clf__estimator__n_estimators': [80, 100]})

In [46]:
build_report(cv, X_test, y_test)

Unnamed: 0,f1 score,precision,recall
related,0.804852,0.804852,0.804852
request,0.88816,0.88816,0.88816
offer,0.995423,0.995423,0.995423
aid_related,0.764571,0.764571,0.764571
medical_help,0.915624,0.915624,0.915624
medical_products,0.954837,0.954837,0.954837
search_and_rescue,0.974519,0.974519,0.974519
security,0.982148,0.982148,0.982148
military,0.965975,0.965975,0.965975
child_alone,1.0,1.0,1.0


In [48]:
pipeline_finetuned = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', MultiOutputClassifier(AdaBoostClassifier(n_estimators = 80)))
                            ])
pipeline_finetuned.fit(X_train, y_train)
y_pred_improved = pipeline_finetuned.predict(X_test)
build_report(pipeline_finetuned, X_test, y_test)

Unnamed: 0,f1 score,precision,recall
related,0.769912,0.769912,0.769912
request,0.890754,0.890754,0.890754
offer,0.995423,0.995423,0.995423
aid_related,0.761672,0.761672,0.761672
medical_help,0.924168,0.924168,0.924168
medical_products,0.954532,0.954532,0.954532
search_and_rescue,0.974672,0.974672,0.974672
security,0.982301,0.982301,0.982301
military,0.970095,0.970095,0.970095
child_alone,1.0,1.0,1.0


In [49]:
pickle.dump(pipeline_finetuned, open('adaboost.pkl', 'wb'))