In [1]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras import optimizers

Using TensorFlow backend.


In [4]:
import tensorflow as tf
# Configure the TensorFlow session so GPU memory is grown on demand
# (allow_growth) instead of being reserved up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): the session is registered with `tf.keras.backend`, while the
# models in this notebook are built with the standalone `keras` package —
# confirm this is the session actually used during training.
tf.keras.backend.set_session(tf.Session(config=config))

In [5]:
# Load the pre-tokenized records. `text_edit` holds the token sequence of each
# record and `event` the class label used as the target further down.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('./data/tokenized.pkl')
# Make sure every token sequence is a plain Python list.
df['text_edit'] = df['text_edit'].apply(list)
df.head(5)

Unnamed: 0,text,sex,age,event,text_edit,token_len
0,57YOM WITH CONTUSION TO FACE AFTER STRIKING IT...,1,57,62,"[contus, face, strike, post, pounder, set, fen...",8
1,A 45YOM FELL ON ARM WHILE WORKING HAD SLIPPED ...,1,45,42,"[fell, arm, work, slip, water, fx, wrist]",7
2,58YOM WITH CERVICAL STRAIN BACK PAIN S P REST...,1,58,26,"[cervic, strain, back, pain, p, restrain, taxi...",16
3,33 YOM LAC TO HAND FROM A RAZOR KNIFE,1,33,60,"[lac, hand, razor, knife]",4
4,53YOM AT WORK IN A WAREHOUSE DOING UNSPECIFIED...,1,53,71,"[work, warehous, unspecifi, lift, strain, lo, ...",8


Create a reproducible (seeded) random split of the rows into 5 folds

In [6]:
# Fix the seed so the fold assignment is reproducible, then give every row
# a fold id drawn uniformly from 0..4.
np.random.seed(42)
df['split'] = np.random.choice(5, size=len(df))

Create reference vocabulary used for training

In [None]:
# vocab = set()
# for _,e in df['text_edit'].iteritems():
#     vocab = vocab.union(set(e))
#
# len(vocab)

In [None]:
# len(vocab)

### Try Keras that fits the embedding

Determine the maximum token sequence length for training. We don't want to simply use the observed maximum, since sequences of 20 tokens or more are rare. Based on the distribution below, we select a maximum length of 15.

In [7]:
df['token_len'].value_counts().sort_index()

1         3
2         4
3       465
4      2050
5      5694
6      9954
7     13875
8     16838
9     17009
10    16252
11    14736
12    13000
13    10920
14     9344
15     7513
16     5979
17     4335
18     2870
19     1641
20      835
21      369
22      179
23       66
24       14
25        8
26        2
28        1
Name: token_len, dtype: int64

In [8]:
max_len = 15

In [9]:
docs = list(df['text_edit'].str[:max_len])

In [10]:
tokenizer = Tokenizer()

In [11]:
tokenizer.fit_on_texts(docs)

In [12]:
encoded_doc = tokenizer.texts_to_sequences(docs)

In [14]:
docs[0]

['contus', 'face', 'strike', 'post', 'pounder', 'set', 'fenc', 'post']

In [18]:
tokenizer.texts_to_sequences(['iaaad', 'asdfasdf'])

[[], []]

In [13]:
encoded_doc

[[18, 45, 257, 453, 3850, 844, 382, 453],
 [7, 32, 1, 34, 165, 38, 28],
 [209, 10, 6, 3, 23, 154, 902, 117, 31, 765, 442, 355, 206, 90, 519],
 [13, 9, 386, 65],
 [1, 506, 182, 11, 10, 1749, 3242, 6],
 [188, 157, 755, 225, 439, 137, 137, 183, 6, 16, 24, 32, 21, 24, 32],
 [15, 4, 1387, 65, 1, 2, 86, 2910, 107],
 [5, 1, 189, 456, 93, 64, 88, 266, 20, 324, 2, 84, 203, 324, 2546],
 [8, 107, 6, 3, 171, 824, 359, 55, 11, 26, 241, 178, 1, 2, 53],
 [14, 3, 11, 708, 1],
 [251, 12, 58, 266, 82, 447, 2146, 1, 176, 40],
 [97, 9, 16, 1193, 2, 10793, 9],
 [12, 3, 1, 2388, 547, 64, 123, 20, 12, 8, 3, 2304, 36],
 [1, 242, 302, 108, 152, 1713, 39, 5, 148, 25, 152, 2, 152, 76],
 [13, 32, 44, 972, 65, 1, 2, 32, 13],
 [38, 310, 23, 94, 188, 1, 68, 45, 388, 245, 90],
 [1, 44, 51, 1522, 56, 232, 233, 4],
 [7, 1, 341, 2, 14, 243],
 [2, 41, 22, 10, 23, 194, 378, 385, 26, 1335, 1],
 [58, 1600, 26, 11, 1, 2727, 8, 14, 3],
 [365, 93, 5, 15, 303, 105, 2193, 15, 303, 355, 345, 372, 15, 122, 2],
 [459, 1, 130, 220, 

In [13]:
tokenizer.texts_to_sequences(['work','a','b','fx'])

[[1], [], [107], [38]]

In [14]:
encoded_doc[:3]

[[18, 45, 257, 453, 3850, 844, 382, 453],
 [7, 32, 1, 34, 165, 38, 28],
 [209, 10, 6, 3, 23, 154, 902, 117, 31, 765, 442, 355, 206, 90, 519]]

In [15]:
from collections import Counter

In [24]:
pad_sequences([[1,2,],[]], maxlen=max_len, padding='post')

array([[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [16]:
x = pad_sequences(encoded_doc, maxlen=max_len, padding='post')

In [17]:
x[0]

array([  18,   45,  257,  453, 3850,  844,  382,  453,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

In [18]:
vocab_size = len(tokenizer.word_index) + 1
# +1 because token ids start at 1 and id 0 is reserved for padding.
# Words the tokenizer has not seen are dropped (no OOV token is configured),
# they are not mapped to an extra id.

Create the target matrix.  
As this is a multi-class model, Keras requires the target in the form of a one-hot encoded matrix (one column per class).

In [19]:
encoder = LabelEncoder()

In [20]:
encoder.fit(df['event'].values)

LabelEncoder()

In [21]:
# The encoder has already been fitted on the event codes in the previous cell,
# so only transform here instead of fitting a second time.
yencoded = encoder.transform(df['event'].values)

In [22]:
np.bincount(yencoded)

array([    2,  8935,  2236,  3256,    12,    36,    50,   283,  1016,
          97,  2691,   844,     1,     1,   902,   330,    50,  1512,
       15624,  6549,   372,    14,    26,    26,   496,   474,  3897,
          10, 11672,     6,     4,  8982,    52, 24402,  9058,  4381,
          48,  2873,    60,    96,  5315, 25910,   764,  8314,     4,
         888,    12,  1373])

In [23]:
target_size = len(np.unique(yencoded))
target_size

48

In [24]:
event_nums = encoder.inverse_transform(range(target_size))
event_nums

array([10, 11, 12, 13, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 40,
       41, 42, 43, 44, 45, 49, 50, 51, 52, 53, 54, 55, 56, 59, 60, 61, 62,
       63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 78, 79, 99])

In [25]:
y = np_utils.to_categorical(yencoded,num_classes=target_size)

Quick check since I'm paranoid about correct encoding

In [26]:
yencoded[:3]

array([33, 18, 10])

In [27]:
y[:3].argmax(1)

array([33, 18, 10])

Check CV Slicing

In [28]:
def build_model(embedding_dim=100, filters=32, kernel_size=8,
                dense_units=100, learning_rate=0.0001):
    """Build and compile the 1-D CNN text classifier.

    Architecture: Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dense
    -> softmax output. The vocabulary size, input sequence length and number
    of classes are read from the notebook-level variables ``vocab_size``,
    ``max_len`` and ``target_size``.

    Parameters
    ----------
    embedding_dim : int
        Size of the learned token embedding vectors.
    filters : int
        Number of convolution filters.
    kernel_size : int
        Width of the convolution window in tokens; must not exceed ``max_len``.
    dense_units : int
        Number of units in the hidden dense layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A ``Sequential`` model compiled with categorical cross-entropy loss and
    an accuracy metric. The defaults reproduce the original fixed settings.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu'))
    # one softmax output per encoded event class
    model.add(Dense(target_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(lr=learning_rate),
                  metrics=['accuracy'])
    return model

### Out of sample predictions

In [29]:
sorted(df['event'].drop_duplicates())

[10,
 11,
 12,
 13,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 29,
 30,
 31,
 32,
 40,
 41,
 42,
 43,
 44,
 45,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 69,
 70,
 71,
 72,
 73,
 74,
 78,
 79,
 99]

In [30]:
# One output column per event class, named after the original event code.
event_cols = [str(e) for e in encoder.classes_]
# probability column names
prob_col_names = [e + '_prob' for e in event_cols]
# prediction column names
pred_col_names = [e + '_pred' for e in event_cols]

# initialize prediction columns in dataframe (integer 0/1 flags)
for c in pred_col_names:
    df[c] = 0

# initialize probability columns as floats: they later receive the model's
# float probabilities, so starting from an integer 0 would force a dtype upcast
for c in prob_col_names:
    df[c] = 0.0


In [31]:
# Out-of-sample predictions: for each of the 5 folds, train a fresh model on
# the other folds and store its predictions for the held-out rows.
np.random.seed(42)
for i in range(5):
    print(f"------Running CV {i}------------------\n")
    # Boolean mask of the held-out rows, computed once per fold.
    test_mask = (df['split'] == i).values
    xtrain = x[~test_mask]
    ytrain = y[~test_mask]
    xtest = x[test_mask]

    model = build_model()
    model.fit(xtrain, ytrain, epochs=10, verbose=2)

    preds = model.predict(xtest)

    # Strict one-hot flag of the most probable class per row. Using argmax
    # guarantees exactly one flag per row even when the maximum is tied.
    onehot = np.zeros(preds.shape, dtype=int)
    onehot[np.arange(preds.shape[0]), preds.argmax(1)] = 1

    df.loc[test_mask, prob_col_names] = preds
    df.loc[test_mask, pred_col_names] = onehot

------Running CV 0------------------







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10




 - 17s - loss: 1.8087 - acc: 0.4874
Epoch 2/10
 - 15s - loss: 0.9302 - acc: 0.7301
Epoch 3/10
 - 15s - loss: 0.7681 - acc: 0.7716
Epoch 4/10
 - 15s - loss: 0.6804 - acc: 0.7943
Epoch 5/10
 - 15s - loss: 0.6163 - acc: 0.8119
Epoch 6/10
 - 15s - loss: 0.5658 - acc: 0.8256
Epoch 7/10
 - 15s - loss: 0.5246 - acc: 0.8378
Epoch 8/10
 - 15s - loss: 0.4895 - acc: 0.8491
Epoch 9/10
 - 15s - loss: 0.4593 - acc: 0.8579
Epoch 10/10
 - 15s - loss: 0.4320 - acc: 0.8668
------Running CV 1------------------

Epoch 1/10
 - 15s - loss: 1.8146 - acc: 0.4805
Epoch 2/10
 - 15s - loss: 0.9382 - acc: 0.7264
Epoch 3/10
 - 15s - loss: 0.7603 - acc: 0.7734
Epoch 4/10
 - 15s - loss: 0.6682 - acc: 0.7972
Epoch 5/10
 - 15s - loss: 0.6057 - acc: 0.8145
Epoch 6/10
 - 15s - loss: 0.5573 - acc: 0.8288
Epoch 7/10
 - 15s - loss: 0.5189 - acc: 0.8394
Epoch 8/10
 - 15s 

In [32]:
df.head()

Unnamed: 0,text,sex,age,event,text_edit,token_len,split,10_pred,11_pred,12_pred,...,67_prob,69_prob,70_prob,71_prob,72_prob,73_prob,74_prob,78_prob,79_prob,99_prob
0,57YOM WITH CONTUSION TO FACE AFTER STRIKING IT...,1,57,62,"[contus, face, strike, post, pounder, set, fen...",8,3,0,0,0,...,4.537691e-05,0.001359468,2.113529e-06,0.0003806565,1.680058e-06,1.497195e-05,1.152805e-08,1.833185e-07,2.009588e-05,0.0002636049
1,A 45YOM FELL ON ARM WHILE WORKING HAD SLIPPED ...,1,45,42,"[fell, arm, work, slip, water, fx, wrist]",7,4,0,0,0,...,1.756378e-08,8.075343e-09,1.507955e-10,4.066872e-08,1.266433e-12,1.98643e-07,3.8260850000000005e-17,9.815665e-12,5.768759e-08,2.898164e-08
2,58YOM WITH CERVICAL STRAIN BACK PAIN S P REST...,1,58,26,"[cervic, strain, back, pain, p, restrain, taxi...",16,2,0,0,0,...,2.575115e-06,2.44779e-11,1.248271e-06,3.492302e-05,1.129743e-08,7.103457e-06,5.556671000000001e-17,2.019554e-08,3.95072e-07,5.198274e-08
3,33 YOM LAC TO HAND FROM A RAZOR KNIFE,1,33,60,"[lac, hand, razor, knife]",4,4,0,0,0,...,2.319273e-06,0.0001140138,5.596004e-08,3.390745e-06,5.609609e-08,6.265324e-09,1.975676e-13,7.722478e-12,1.79012e-07,0.0002006451
4,53YOM AT WORK IN A WAREHOUSE DOING UNSPECIFIED...,1,53,71,"[work, warehous, unspecifi, lift, strain, lo, ...",8,4,0,0,0,...,1.689248e-05,6.212469e-06,0.0001523291,0.9982912,2.977378e-06,1.539981e-05,2.463776e-13,0.0008794522,3.204683e-06,1.191234e-05


In [33]:
df[prob_col_names+pred_col_names].to_pickle('./data/OOS_keras_preds.pkl')

In [34]:
ypred = encoder.inverse_transform(df[pred_col_names].values.argmax(1))

In [35]:
dfres = pd.DataFrame(data=np.column_stack([df['event'].values,ypred]),columns=['act','pred'])
dfres.head()

Unnamed: 0,act,pred
0,62,62
1,42,42
2,26,26
3,60,62
4,71,71


In [36]:
# Overall out-of-sample accuracy: share of rows whose predicted event equals the actual one.
(dfres['act'] == dfres['pred']).mean()

0.7932526176310114

In [37]:
# Side-by-side count of actual vs. predicted rows per event code.
# `axis` is passed by keyword: positional use is deprecated in pandas.
pd.concat([dfres['act'].value_counts(), dfres['pred'].value_counts()], axis=1).sort_index().fillna('')


Unnamed: 0,act,pred
10,2,
11,8935,9627.0
12,2236,1471.0
13,3256,3219.0
20,12,
21,36,
22,50,
23,283,248.0
24,1016,1011.0
25,97,


### Fit on all data instead of using a validation set
Based on the cross-validation fits above, we train for 10 epochs.

In [38]:
# Reuse the helper from the cross-validation section so the final model has
# exactly the architecture that was evaluated out of sample. build_model()
# returns the model already compiled with the same loss and optimizer settings.
model = build_model()

In [39]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 15, 100)           2684500   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 8, 32)             25632     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 4, 32)             0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 100)               12900     
_________________________________________________________________
dense_12 (Dense)             (None, 48)                4848      
Total params: 2,727,880
Trainable params: 2,727,880
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=0.0001), metrics=['accuracy'])

In [41]:
model.fit(x, y, epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fadb0872490>

In [42]:
model.save('./models/keras_try1.h5')