In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sqal
from sklearn.model_selection import train_test_split

In [2]:
# load data from database
engine = sqal.create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('MessageCategorization', engine)

In [3]:
# correct outlying values in `related` column
df['related'] = np.clip(df['related'], 0, 1)

In [4]:
in_columns = 'message'
out_columns = list(df.columns)[4:]

In [5]:
text = df[in_columns].values
y = df[out_columns].values

In [6]:
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

In [30]:
text_test.shape

(8652,)

### Define The Tokenizer

In [7]:
import spacy
en_nlp = spacy.load('en')

In [8]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [9]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [10]:
# https://realpython.com/natural-language-processing-spacy-python/ was helpful here
def tokenize(text):
    doc = en_nlp(text)
    lemmas = [token.lemma_ for token in doc if token not in stopwords and not token.is_punct]
    stems = [stemmer.stem(lemma) for lemma in lemmas]
    return stems

### Construct The Vectorizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vect = TfidfVectorizer(tokenizer=tokenize, min_df=5)

In [13]:
vect.fit(text_train)



TfidfVectorizer(min_df=5, tokenizer=<function tokenize at 0x12bb1e670>)

In [17]:
X_pred = vect.transform(text_test)

In [20]:
X_pred.shape

(8652, 4859)

### Extract The Vocabulary

In [75]:
# vocab = vect.vocabulary_
vocab = list(vect.vocabulary_.keys())
# No good.  vocabulary_ is a dict which is unordered.  We can't rely on it to return the tokens in the correct order.

In [102]:
n_voc = len(vocab)
n_voc

4859

### Reconstruct Model (without vectorizer piece)

In [109]:
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [112]:
pipe = make_pipeline(
    MultiOutputClassifier(
        estimator=AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=2),
            n_estimators=10, learning_rate=1)))

In [114]:
import pickle as pkl

In [115]:
fileObject = open('severedPipelineXtrain.pkl', 'rb')
X_train = pkl.load(fileObject)
fileObject.close()

fileObject = open('severedPipelineXtest.pkl', 'rb')
X_test = pkl.load(fileObject)
fileObject.close()

In [117]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('multioutputclassifier',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                                                    learning_rate=1,
                                                                    n_estimators=10)))])

In [120]:
singleWord = np.zeros(shape=n_voc)
singleWord[3802] = 1

In [129]:
len(pipe.predict([singleWord])[0])

36

In [130]:
result = np.zeros(shape = (n_voc, 36))
for i in range(n_voc):
    word = np.zeros(shape=n_voc)
    word[i] = 1
    result[i] = pipe.predict([word])

In [163]:
wordIndexes = np.where(result[:,32] == 1)[0]
wordIndexes

array([ 333, 1513, 3497, 4476])

In [173]:
# vocab[wordIndexes]  this is what I wanted
np.array(vocab)[wordIndexes]  # this actually works, but how?

array(['burn', 'archipelago', 'unpreced', 'wen'], dtype='<U25')

In [254]:
def canonTokens(categoryNum):
    categoryVector = result[:,categoryNum]
    wordIndices = np.where(categoryVector == 1)[0]
    return(list(np.array(vocab)[wordIndices]))  # does what I wish vocab[wordIndexes] did
#    return(list(np.insert(rslt, obj=0, values=out_columns[categoryNum])))

In [262]:
canonTable = [canonTokens(i) for i in range(36)]

In [263]:
canon = pd.Series(data=canonTable, index=out_columns, name='Canon Tokens')

In [266]:
with open('canon.joblib', 'wb') as f:
    joblib.dump(canon, f)

### Castoffs

#### Vectorized results have shape = (8652, 4859) which is (size of the test set x size of the vocabulary)

In [78]:
X_pred

<8652x4859 sparse matrix of type '<class 'numpy.float64'>'
	with 164020 stored elements in Compressed Sparse Row format>

In [25]:
print(X_pred)

  (0, 4018)	0.4474519253581993
  (0, 3315)	0.4030081710950588
  (0, 1962)	0.4896258646027413
  (0, 639)	0.599561530146984
  (0, 8)	0.19535524769936843
  (1, 4639)	0.35674001048835935
  (1, 4569)	0.22391707639938604
  (1, 4404)	0.08183440786525194
  (1, 4341)	0.0679104422297342
  (1, 4338)	0.13849647071560753
  (1, 3689)	0.2862505695925295
  (1, 3560)	0.35032973570931314
  (1, 3407)	0.3195461548203802
  (1, 3357)	0.30951582864165794
  (1, 2251)	0.08465362515363767
  (1, 2089)	0.10672449321178847
  (1, 1359)	0.23456534131860912
  (1, 1136)	0.35032973570931314
  (1, 630)	0.14991489548526055
  (1, 418)	0.08381579020210825
  (1, 237)	0.10380747166451167
  (1, 178)	0.28878522923815964
  (1, 161)	0.26371763732409353
  (2, 4783)	0.1951507356918339
  (2, 4756)	0.1367086556979358
  :	:
  (8650, 2652)	0.2116532136200761
  (8650, 2201)	0.08694327569913407
  (8650, 2170)	0.16847050909411956
  (8650, 2119)	0.1770485338213811
  (8650, 2113)	0.10811836715507016
  (8650, 2059)	0.1430870746538258
  (865

In [34]:
print(X_pred[0,:])

  (0, 4018)	0.4474519253581993
  (0, 3315)	0.4030081710950588
  (0, 1962)	0.4896258646027413
  (0, 639)	0.599561530146984
  (0, 8)	0.19535524769936843


In [39]:
X_pred[0,:]

<1x4859 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [83]:
df2 = pd.DataFrame(X_pred.toarray().transpose(),
                   index=vect.get_feature_names())

In [84]:
df2.head(-20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8642,8643,8644,8645,8646,8647,8648,8649,8650,8651
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272146,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
york,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yorker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
wordVector = df2[0].values

In [85]:
wordVector

array([0., 0., 0., ..., 0., 0., 0.])

In [58]:
np.argmax(wordVector)

639

In [59]:
wordVector[639]

0.599561530146984

In [60]:
wordVector.sum()

2.135002738902352

In [95]:
top5indexes = np.argpartition(wordVector, kth=-5)[-5:]
top5indexes

array([   8, 4018, 3315, 1962,  639])