In [12]:
import pandas as pd
import numpy as np
import sqlalchemy as sqal
from sklearn.model_selection import train_test_split

In [13]:
# load data from database
engine = sqal.create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('MessageCategorization', engine)

In [14]:
# correct outlying values in `related` column
df['related'] = np.clip(df['related'], 0, 1)

In [15]:
in_columns = 'message'
out_columns = list(df.columns)[4:]

In [16]:
text = df[in_columns].values
y = df[out_columns].values

In [17]:
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

In [None]:
text_test.shape

### Define The Tokenizer

In [5]:
import spacy
en_nlp = spacy.load('en')

In [6]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [7]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [8]:
# https://realpython.com/natural-language-processing-spacy-python/ was helpful here
def tokenize(text):
    doc = en_nlp(text)
    lemmas = [token.lemma_ for token in doc if token not in stopwords and not token.is_punct]
    stems = [stemmer.stem(lemma) for lemma in lemmas]
    return stems

### Construct The Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=5)

In [18]:
%%time
vect.fit(text_train)



TfidfVectorizer(min_df=5, tokenizer=<function tokenize at 0x126becdc0>)

In [21]:
%%time
X_pred = vect.transform(text_test)

CPU times: user 1min 31s, sys: 1.39 s, total: 1min 33s
Wall time: 1min 43s


In [22]:
X_pred.shape

(8652, 4859)

That is 8652 samples in the test set, and 4859 tokens in the vectorizer output vocabulary.

### Extract The Vocabulary

Is there some other way we could get the vocabulary?  Some way where we don't have to reconstruct and train the vectorizer?
- Can we store the vectorizer like we store the model?  Then we could just read it from disk.

In [23]:
# vocab = vect.vocabulary_
vocab = list(vect.vocabulary_.keys())
# No good.  vocabulary_ is a dict which is unordered.  We can't rely on it to return the tokens in the correct order.

In [24]:
n_voc = len(vocab)
n_voc

4859

### Reconstruct Model (without vectorizer)

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [27]:
pipe = make_pipeline(
    MultiOutputClassifier(
        estimator=AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=2),
            n_estimators=10, learning_rate=1)))

In [28]:
import pickle as pkl

In [29]:
fileObject = open('severedPipelineXtrain.pkl', 'rb')
X_train = pkl.load(fileObject)
fileObject.close()

fileObject = open('severedPipelineXtest.pkl', 'rb')
X_test = pkl.load(fileObject)
fileObject.close()

In [30]:
%%time
pipe.fit(X_train, y_train)

CPU times: user 31 s, sys: 330 ms, total: 31.3 s
Wall time: 35.2 s


Pipeline(steps=[('multioutputclassifier',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                                                    learning_rate=1,
                                                                    n_estimators=10)))])

### Extract Canon Tokens

Here's what we're up to...   For every token in the vocabulary we're going to take its vector representation (a single 1 and a lot of 0s) and feed it to the pipeline to get a prediction.  That prediction becomes a row with 36 elements which is appended to a big table which winds up with 4859 rows.  We get a big table 4859 x 36.

From this table we then extract the columns.  For each non-zero entry in a column we look up the associated token.  We call these tokens the "canon tokens".  

What we are doing is a crude kind of matrix inversion, which would give us our features perfectly if the pipeline were a strictly linear system, but of course it isn't.  But it's the best we got.

In [90]:
def genTokenTable(n_vocab):
    result = np.zeros(shape = (n_vocab, 36))
    for i in range(n_vocab):
        # construct token vector for single token
        tokenVec = np.zeros(shape=n_vocab)
        tokenVec[i] = 1
        # compute categories for that single token and append to table
        result[i] = pipe.predict([tokenVec])
    return result

~~Rewrite `genTokenTable()` to use csr (or lil) rather than np.array.~~  Ok. That was a terrible idea.  It's non-trivial to use sparse.lil arrays with the exceptionally useful `numpy.where()` function.  Let's go back to the original method.

In [63]:
from scipy import sparse

In [79]:
# def genTokenTable(n_vocab):
#     result = sparse.lil_matrix((n_vocab, 36), dtype=int)
#     for i in range(n_vocab):
#         # construct token vector for single token
#         tokenVec = np.zeros(shape=n_vocab)
#         tokenVec[i] = 1
#         # compute categories for that single token and append to table
#         result[i,:] = pipe.predict([tokenVec])
#     return result

In [91]:
%%time
tokenTable = genTokenTable(n_voc)

CPU times: user 5min 4s, sys: 2.76 s, total: 5min 7s
Wall time: 5min 10s


In [93]:
def genCanonTable(table):
    result = []
    for i in range(36):
        categoryVector = tokenTable[:,i]
        tokenIndices = np.where(categoryVector == 1)[0]
        # A wily trick for indexing a list
        result.append(list(np.array(vocab)[tokenIndices]))
    return(result)

In [94]:
# canonTable = [canonTokens(tokenTable, i) for i in range(36)]
canonTable = genCanonTable(tokenTable)

In [95]:
canonTable

[['the',
  'medan',
  'chapter',
  'of',
  'taiwan',
  'buddhist',
  'tzu',
  'chi',
  'foundat',
  'set',
  'up',
  'a',
  'recept',
  'center',
  'at',
  'an',
  'indonesian',
  'militari',
  'base',
  'in',
  'on',
  'dec',
  '29',
  'to',
  'help',
  'victim',
  'flee',
  'tsunami',
  'ravag',
  'ach',
  'provinc',
  'i',
  'be',
  'flood',
  'port',
  'au',
  'princ',
  'now',
  'live',
  'gonaiv',
  'need',
  'make',
  'sure',
  '-pron-',
  'manhattan',
  'have',
  'emerg',
  'food',
  'frankenstorm',
  'hurricanesandi',
  'get',
  "'s",
  'mini',
  'amp',
  'water',
  'for',
  'about',
  'these',
  'hurrican',
  'with',
  'go',
  'all',
  'digicel',
  'offic',
  'find',
  '160',
  'can',
  'not',
  'one',
  'deliveri',
  'more',
  'than',
  '41',
  'ton',
  'therapeut',
  'milk',
  'and',
  '1.5',
  'treat',
  'child',
  'acut',
  'sever',
  'malnutrit',
  '31',
  'feed',
  'centr',
  'oper',
  'by',
  'unicef',
  'partner',
  'both',
  'cloth',
  'non',
  'perish',
  'donat',
 

In [None]:
# canon = dict(pd.Series(data=canonTable, index=out_columns, name='Canon Tokens'))

In [39]:
import joblib

In [96]:
with open('canon.joblib', 'wb') as f:
    joblib.dump(canonTable, f)