In [1]:
%matplotlib inline

import itertools
import spacy
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras.backend as K
import tensorflow as tf

from collections import defaultdict, Counter

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Embedding, Dropout, Reshape, TimeDistributed
from keras.utils import to_categorical, plot_model

from keras_tqdm import TQDMNotebookCallback, TQDMCallback

from pprint import pprint

from scipy import stats
from sklearn.metrics import cohen_kappa_score, accuracy_score, classification_report
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

from tqdm import tqdm_notebook as tqdm

Using TensorFlow backend.


In [2]:
# Check if GPU enabled

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/cpu:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3553982065031890252, name: "/gpu:0"
 device_type: "GPU"
 memory_limit: 10968950375
 locality {
   bus_id: 1
 }
 incarnation: 3989715045338005209
 physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:04:00.0", name: "/gpu:1"
 device_type: "GPU"
 memory_limit: 10968950375
 locality {
   bus_id: 1
 }
 incarnation: 13143969203012567611
 physical_device_desc: "device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:05:00.0", name: "/gpu:2"
 device_type: "GPU"
 memory_limit: 485883904
 locality {
   bus_id: 1
 }
 incarnation: 13343466826744085488
 physical_device_desc: "device: 2, name: GeForce GTX 1080 Ti, pci bus id: 0000:08:00.0", name: "/gpu:3"
 device_type: "GPU"
 memory_limit: 6748274688
 locality {
   bus_id: 1
 }
 incarnation: 12595380018081630311
 physical_device_desc: "device: 3, name: GeForce GTX 1080 Ti, pci bus id: 0000:09:00.0", name: "/gpu:4"
 device_type: "GPU"
 m

# Data Processing

## Twitter

In [3]:
with open('/usr0/home/mamille2/twitter/data/huang2016/huang2016_train.aligned.pkl', 'rb') as f:
#     twitter_texts, twitter_tags, twitter_histories = pickle.load(f)
    twitter_texts, twitter_tags, _ = pickle.load(f)
#     , twitter_tags, _ = pickle.load(f)
    
with open('/usr0/home/mamille2/twitter/data/huang2016/huang2016_valid.aligned.pkl', 'rb') as f:
#     dev_texts, dev_tags, dev_histories = pickle.load(f)
    dev_texts, dev_tags, _ = pickle.load(f)
#     _, dev_tags, _ = pickle.load(f)
    
with open('/usr0/home/mamille2/twitter/data/huang2016/huang2016_test.aligned.pkl', 'rb') as f:
#     _, test_tags, _ = pickle.load(f)
    test_texts, test_tags, _ = pickle.load(f)

In [4]:
# Extract character set

chars = set()
for text in twitter_texts:
    chars.update(set(text))
    
chars = sorted(chars)
chars = ['<PAD>', '<UNK>'] + chars
print ('{} unique characters.'.format(len(chars)))

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

1802 unique characters.


In [5]:
MAX_LENGTH = 140

In [6]:
# Vectorize tweets

# train
train_X = np.zeros((len(twitter_texts), MAX_LENGTH), dtype=np.bool)
for i, text in tqdm(enumerate(twitter_texts), total=len(twitter_texts)):
    for t, char in enumerate(text):
        if t >= MAX_LENGTH:
            break
        train_X[i, t] = char_indices[char]
        
# dev
dev_X = np.zeros((len(dev_texts), MAX_LENGTH), dtype=np.bool)
for i, text in tqdm(enumerate(dev_texts), total=len(dev_texts)):
    for t, char in enumerate(text):
        if t >= MAX_LENGTH:
            break
        dev_X[i, t] = char_indices.get(char, '<UNK>')
        
# test
test_X = np.zeros((len(test_texts), MAX_LENGTH), dtype=np.bool)
for i, text in tqdm(enumerate(test_texts), total=len(test_texts)):
    for t, char in enumerate(text):
        if t >= MAX_LENGTH:
            break
        test_X[i, t] = char_indices.get(char, '<UNK>')










In [7]:
# Extract tag set

tag_set = set()
for t in twitter_tags:
    tag_set.update(set(t))
    
tag_set = sorted(tag_set)
print ('{} unique tags.'.format(len(tag_set)))

# Select top 3883 tags like Huang+ 2016
twitter_tags_c = Counter([t for ts in twitter_tags for t in ts]).most_common(3883)
top_tags = set([t for t,_ in twitter_tags_c])

tag_indices = dict((t, i) for i, t in enumerate(top_tags))
indices_tag = dict((i, t) for i, t in enumerate(top_tags))
print ('Selected {} tags.'.format(len(top_tags)))

37399 unique tags.
Selected 3883 tags.


In [8]:
# Vectorize tags

# train
train_y = np.zeros((len(twitter_tags), len(top_tags)), dtype=np.bool)
for i, tags in tqdm(enumerate(twitter_tags), total=len(twitter_tags)):
    for tag in tags:
        if tag in top_tags:
            train_y[i, tag_indices[tag]] = 1
        
# dev
dev_y = np.zeros((len(dev_tags), len(top_tags)), dtype=np.bool)
for i, tags in tqdm(enumerate(dev_tags), total=len(dev_tags)):
    for tag in tags:
        if tag in top_tags:
            dev_y[i, tag_indices[tag]] = 1
            
# test
test_y = np.zeros((len(test_tags), len(top_tags)), dtype=np.bool)
for i, tags in tqdm(enumerate(test_tags), total=len(test_tags)):
    for tag in tags:
        if tag in top_tags:
            test_y[i, tag_indices[tag]] = 1










In [9]:
train_X.shape, train_y.shape, dev_X.shape, dev_y.shape, test_X.shape, test_y.shape

((212855, 140),
 (212855, 3883),
 (25817, 140),
 (25817, 3883),
 (19614, 140),
 (19614, 3883))

# Build Model

## BiLSTM Baseline

In [None]:
model = Sequential()
model.add(Embedding(len(chars) + 1,
                    64,
                    input_length=MAX_LENGTH,
                    trainable=True,
                    mask_zero=True))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(.5))
model.add(Dense(len(top_tags), activation='softmax'))
print (model.output_shape)

In [41]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

In [42]:
filepath="./weights.twitter.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_categorical_accuracy', patience=3)
callbacks_list = [checkpoint, early_stop]

In [46]:
model.fit(train_X,
              train_y,
              validation_data=(dev_X, dev_y),
              shuffle=True,
              batch_size=32,
              epochs=10,
              verbose=True,
              callbacks=callbacks_list)

print('BiLSTM Baseline Training complete!')

Train on 212855 samples, validate on 25817 samples
Epoch 1/10
 24608/212855 [==>...........................] - ETA: 1804s - loss: 8.9141 - categorical_accuracy: 0.0040

KeyboardInterrupt: 

In [16]:
model.load_weights('./weights.twitter.best.hdf5')

In [18]:
pred = model.predict(test_X)
send_text('DONE making predictions.')

Messsage sent.


In [19]:
p = np.argmax(pred, axis=1)
y = np.argmax(test_y, axis=1)
p.shape, y.shape

((123802,), (123802,))

In [20]:
accuracy_score(p, y)

0.18271918062713041

In [24]:
pre_count = 0

for i, p in enumerate(pred):
    args = np.argsort(p)[::-1]
    if test_y[i][args[0]] == 1.:
        pre_count += 1

pre_count / pred.shape[0]
        

0.2033408184035799

In [58]:
tag_counts = defaultdict(lambda: 0)

for y in test_y:
    targets = np.nonzero(y)[0]
    #print (targets)
    for t in targets:
        tag_counts[t] += 1
        
freq_tags = set()
for k, v in tag_counts.items():
    if v > 19000:
        freq_tags.add(k)
        
len(freq_tags)
#indices_tag[299]
#len(tag_counts)

0

In [61]:
mean_ranks = []

for i, p in enumerate(pred):
    ranks = []
    args = np.argsort(p)[::-1]
    #print (p[args], args)
    #break
    targets = np.nonzero(test_y[i])[0]
    for t in targets:
        ranks.append(np.nonzero(args == t)[0][0])
    mean_ranks.extend(ranks)

np.mean(np.asarray(mean_ranks))

3.8947458528740975

In [62]:
recalls = []

for i, p in enumerate(pred):
    recall_count = 0
    args = list(np.argsort(p)[::-1][:10])
    targets = np.nonzero(test_y[i])[0]
    if len(targets) == 0:
        continue
    num_tags = len(targets)
    for t in targets:
        if t in args:
            recall_count += 1
    #mean_ranks.append(np.mean(np.asarray(ranks)))
    recalls.append(recall_count / num_tags)

np.mean(np.asarray(recalls))

0.93907085507503907