In [6]:
import pickle
import sqlite3
import sys
from datetime import datetime as dt

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real
from sqlalchemy import create_engine

In [86]:
def load_data(database_filepath):
    engine = create_engine(f"sqlite:///{database_filepath}")
    df = pd.read_sql("messages", engine)
    X = df['message']
    y = df.drop(columns=["message", "id", "original", "genre"])
    infrequent_labels = y.sum()[y.sum()<10].index.tolist()
    y.drop(columns=infrequent_labels, inplace = True)
    y[y>1]=1
    return (X, y, y.columns)


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


def build_model():
    model = Pipeline(
        [
            (
                "tf-idf vectorization",
                TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2)),
            ),
            (
                "classifier",
                MultiOutputClassifier(
                    estimator=RandomForestClassifier(
                        class_weight="balanced", n_estimators=200, max_depth=5,
                    )
                ),
            ),
        ],
        verbose=True,
    )
    return model


def evaluate_model(model, X, y):

    predictions = model.predict(X)
    predictions = pd.DataFrame(predictions, columns=y.columns)

    reports = []
    for label in list(y):
        ylab = y[label]
        predlab = predictions[label]
        rep = dict()
        rep["category"] = label
        rep["accuracy"] = accuracy_score(ylab, predlab)
        print(rep)

        # Confusion matrix:
        conf = pd.Series(
            confusion_matrix(ylab, predlab).ravel(), index=["tn", "fp", "fn", "tp"]
        )
        conf_dict = {ind: conf[ind] for ind in conf.index}
        rep.update(conf_dict)
        reports.append(pd.Series(rep))
        print(pd.Series(rep))

    df_report = np.round(pd.DataFrame(reports).set_index("category"), 3)
    # df_report.insert(0, "training_timestamp", dt.now().strftime("%Y-%m-%d, %H:%M:%S"))
    # report_filename = 'evaluation_report.csv'
    # if report_filename in os.listdir(os.getcwd()):
    #     df_report.to_csv(report_filename, mode='a', header=False)
    # else:
    #     df_report.to_csv(report_filename)
    return df_report


def save_model(model, model_filepath):
    """Perform pickle dump of the model"""

    pickle.dump(model, open(model_filepath, "wb"))



In [87]:
X, y, columns = load_data('../data/DisasterResponse.db')

In [88]:
model = build_model()

In [89]:
model.fit(X, y)

[Pipeline]  (step 1 of 2) Processing tf-idf vectorization, total=   8.3s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=  43.9s


Pipeline(steps=[('tf-idf vectorization',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function tokenize at 0x7fc25bfb6af0>)),
                ('classifier',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced',
                                                                        max_depth=5,
                                                                        n_estimators=200)))],
         verbose=True)

In [90]:
report = evaluate_model(model, X, y)

In [None]:
report

Unnamed: 0_level_0,accuracy,0,1,2,3
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
related,0.753,4439,1683,4802,15292
request,0.878,19848,1894,1294,3180
offer,0.996,26070,28,65,53
aid_related,0.763,12616,2740,3468,7392
medical_help,0.844,20805,3327,757,1327
medical_products,0.924,23403,1500,488,825
search_and_rescue,0.953,24618,874,360,364
security,0.974,25312,433,256,215
military,0.885,22538,2818,205,655
water,0.94,23383,1161,411,1261
