## Goal -
    Test an initiative concerning how the pipeline is constructed and trained in production, i.e. in `train_classifier.py`.

### Current Initiative -
    Construct the pipeline in two pieces: (1) the NLP vectorizer and (2) the ML model. Cache the intermediate results when training.

### Load data from database file in `data` directory

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy as sqal
from sklearn.model_selection import train_test_split
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pickle as pkl
import joblib

# Load spaCy's English model through the 'en' shortcut name; tokenize() runs
# every message through it to obtain tokens and their lemmas.
# NOTE(review): the 'en' shortcut is presumably only accepted by older spaCy
# releases -- confirm against the pinned spaCy version.
en_nlp = spacy.load('en')
# spaCy's English stop-word collection, consulted by tokenize().
stopwords = spacy.lang.en.stop_words.STOP_WORDS
# NLTK Snowball stemmer for English, applied to the lemmas in tokenize().
stemmer = SnowballStemmer('english')


In [None]:
def load_data(database_filepath):
    """Read the message table from a SQLite file and split it into inputs and targets.

    Args:
        database_filepath: Path to the SQLite database file created by the
            previous script.

    Returns:
        A tuple ``(text, y, out_columns)``: ``text`` holds the values of the
        ``message`` column, ``y`` holds the values of every column from the
        fifth one onward, and ``out_columns`` is the list of those column
        names.
    """
    db_engine = sqal.create_engine('sqlite:///' + database_filepath)
    frame = pd.read_sql_table('MessageCategorization', db_engine)

    # Every column after the first four is treated as a target category.
    out_columns = list(frame.columns[4:])

    # Force 'related' into the range [0, 1] before the targets are extracted.
    frame['related'] = frame['related'].clip(lower=0, upper=1)

    text = frame['message'].values
    y = frame[out_columns].values
    return text, y, out_columns


In [None]:
# Pull the messages, their category labels, and the category names out of
# the SQLite database in the `data` directory.
text, y, out_columns = load_data('../data/DisasterResponse.db')

In [None]:
# Hold out a third of the messages for evaluation; the fixed random_state
# keeps the split the same from run to run.
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

### Construct the pipeline

In [None]:
def tokenize(text):
    """Turn a raw message into a list of stemmed lemmas.

    Args:
        text: The message string to tokenize.

    Returns:
        A list of strings: one Snowball-stemmed lemma for every spaCy token
        that is neither an English stop word nor punctuation.
    """
    # tokenize the text using spacy's model for English
    doc = en_nlp(text)
    # `stopwords` is a collection of strings, so the membership test has to
    # use the token's (lower-cased) text. Testing the Token object itself
    # never matches a string and would let every stop word through.
    lemmas = [token.lemma_ for token in doc
              if token.lower_ not in stopwords and not token.is_punct]
    # Had better luck with this nltk stemmer
    return [stemmer.stem(lemma) for lemma in lemmas]

In [None]:
# NLP half of the pipeline: a TF-IDF vectorizer that splits text with the
# custom tokenize() function and drops terms seen in fewer than 5 messages.
nlp_model = make_pipeline(
    TfidfVectorizer(tokenizer=tokenize, min_df=5))

In [None]:
# ML half of the pipeline: MultiOutputClassifier fits one AdaBoost classifier
# per output column, each boosting 10 decision trees of depth 2.
# NOTE(review): `base_estimator` was presumably renamed in later scikit-learn
# releases -- confirm against the pinned version before upgrading.
ml_model = make_pipeline(
    MultiOutputClassifier(
        estimator=AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=2),
            n_estimators=10, learning_rate=1)))

#### Fit the two pieces of the pipeline

In [None]:
%%time
# Fit the vectorizer on the training messages and keep the resulting feature
# matrix, so the classifier can be trained without vectorizing again.
X_train = nlp_model.fit_transform(text_train)

In [None]:
%%time
# Train the classifier half on the already-vectorized training messages.
ml_model.fit(X_train, y_train)

In [None]:
%%time
# Vectorize the held-out messages with the fitted vectorizer (transform only,
# no refitting).
X_test = nlp_model.transform(text_test)

In [None]:
%%time
# Predict the categories of the held-out messages from their feature matrix.
y_pred = ml_model.predict(X_test)

### Strategizing

The tasks for which we need to cache the model or its products are:
1. score the model
    - requires `y_test` and `y_pred`
2. generate the scatter plot with f1 scores vs. num per category
    - requires `y_test`, `y_pred` and `num_pos`
3. generate the bar chart with hovers listing relevant tokens for each category
    - requires `num_pos` and `canonTable`
4. compute predicted categories for novel messages
    - requires `model.predict()`

- `y_predicted` is cached on disk and read by `run.py`.
- `num_pos` is computed from df in `run.py`
- `df` is read from `DisasterResponse.db`
- `canonTable` can be cached to disk, and read by `run.py`

In [None]:
# Look the vectorizer step up by the name make_pipeline generated for it.
nlp_model['tfidfvectorizer']

In [None]:
# Vocabulary learned by the vectorizer during fitting.
vocab = nlp_model['tfidfvectorizer'].vocabulary_

In [None]:
# Number of entries in the fitted vocabulary; the bare name on the last line
# displays it as the cell output.
n_vocab = len(vocab)
n_vocab

### Construct The Combined Pipeline

Can we use, and in particular can we cache, the combined pipeline?

In [None]:
# Chain the NLP half and the ML half into a single pipeline that goes from
# raw text to category predictions.
model = make_pipeline(nlp_model, ml_model)

In [None]:
# NOTE(review): there are no call parentheses, so this displays the bound
# method rather than the parameter dict -- confirm that is intended.
model.get_params

In [None]:
# Show one held-out message, used as the sample input for the prediction cell.
text_test[1]

In [None]:
# The pipeline takes a collection of documents, so the single message is
# wrapped in a list.
model.predict([text_test[1]])

### Caching The Results

In [None]:
# Persist the combined pipeline and each of its two halves to separate files
# under ../models, in that order.
for pkl_path, fitted in (
        ('../models/classifier.pkl', model),
        ('../models/nlp_model.pkl', nlp_model),
        ('../models/ml_model.pkl', ml_model)):
    with open(pkl_path, 'wb') as f:
        joblib.dump(fitted, f)