In [6]:
import pickle
import sqlite3
import sys
from datetime import datetime as dt

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real
from sqlalchemy import create_engine

In [86]:
def load_data(database_filepath):
    engine = create_engine(f"sqlite:///{database_filepath}")
    df = pd.read_sql("messages", engine)
    X = df['message']
    y = df.drop(columns=["message", "id", "original", "genre"])
    infrequent_labels = y.sum()[y.sum()<10].index.tolist()
    y.drop(columns=infrequent_labels, inplace = True)
    y[y>1]=1
    return (X, y, y.columns)


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def build_model():
    model = Pipeline(
        [
            (
                "tf-idf vectorization",
                TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2)),
            ),
            (
                "classifier",
                MultiOutputClassifier(
                    estimator=RandomForestClassifier(
                        class_weight="balanced", n_estimators=200, max_depth=5,
                    )
                ),
            ),
        ],
        verbose=True,
    )
    return model


def evaluate_model(model, X, y):

    predictions = model.predict(X)
    predictions = pd.DataFrame(predictions, columns=y.columns)

    reports = []
    for label in list(y):
        ylab = y[label]
        predlab = predictions[label]
        rep = dict()
        rep["category"] = label
        rep["accuracy"] = accuracy_score(ylab, predlab)
        print(rep)

        # Confusion matrix:
        conf = pd.Series(
            confusion_matrix(ylab, predlab).ravel(), index=["tn", "fp", "fn", "tp"]
        )
        conf_dict = {ind: conf[ind] for ind in conf.index}
        rep.update(conf_dict)
        reports.append(pd.Series(rep))
        print(pd.Series(rep))

    df_report = np.round(pd.DataFrame(reports).set_index("category"), 2)
    return df_report


def save_model(model, model_filepath):
    """Perform pickle dump of the model"""

    pickle.dump(model, open(model_filepath, "wb"))



In [87]:
X, y, columns = load_data('../data/DisasterResponse.db')

In [88]:
model = build_model()

In [89]:
model.fit(X, y)

[Pipeline]  (step 1 of 2) Processing tf-idf vectorization, total=   8.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  43.9s


Pipeline(steps=[('tf-idf vectorization',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function tokenize at 0x7fc25bfb6af0>)),
                ('classifier',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                                        max_depth=5,
                                                                        n_estimators=200)))],
         verbose=True)

In [90]:
report = evaluate_model(model, X, y)

{'category': 'related', 'accuracy': 0.7541959108941104}
category     related
accuracy    0.754196
tn              4420
fp              1702
fn              4742
tp             15352
dtype: object
{'category': 'request', 'accuracy': 0.8765639304241685}
category     request
accuracy    0.876564
tn             19790
fp              1952
fn              1284
tp              3190
dtype: object
{'category': 'offer', 'accuracy': 0.996910283796155}
category      offer
accuracy    0.99691
tn            26066
fp               32
fn               49
tp               69
dtype: object
{'category': 'aid_related', 'accuracy': 0.7711702776930119}
category    aid_related
accuracy        0.77117
tn                13168
fp                 2188
fn                 3811
tp                 7049
dtype: object
{'category': 'medical_help', 'accuracy': 0.8453234665852914}
category    medical_help
accuracy        0.845323
tn                 20844
fp                  3288
fn                   767
tp               