In [45]:
%matplotlib inline

import itertools
import spacy
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import keras.backend as K
import tensorflow as tf

from collections import defaultdict

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Embedding, Dropout, Reshape, TimeDistributed
from keras.utils import to_categorical, plot_model

from keras_tqdm import TQDMNotebookCallback, TQDMCallback

from pprint import pprint

from scipy import stats
from sklearn.metrics import cohen_kappa_score, accuracy_score, classification_report
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

from tqdm import tqdm

In [2]:
TWITTER_DATA_FILEPATH = '/home/james/corpora/twitter/trec2011_with_hashtags.pkl'
TUMBLR_DATA_FILEPATH = '/home/james/corpora/tumblr/halfday_text.pkl'

In [17]:
# Email Settings
import smtplib

wherefrom = 'j.m.fiacco@gmail.com'
whereto = '6037323260@msg.fi.google.com'
gmail_pw = 'thdqhntcjnqglhxk'

def send_text(text='Hello World!'):
    try:
        server = smtplib.SMTP("smtp.gmail.com", 587)
        server.ehlo()
        server.starttls()
        server.login(wherefrom, gmail_pw)
        ret = server.sendmail(wherefrom, whereto, ' '.join(['[[ASSIGNMENT]]', text]))
        server.quit()
        print ('Messsage sent.')
    except:
        print ('Message failed.')

# Data Processing

## Twitter

In [4]:
twitter_data = pd.read_pickle(TWITTER_DATA_FILEPATH)
twitter_data

Unnamed: 0,text,id_str,id,created_at,retweeted,retweet_count,favorited,requested_id,text_no_tags,user_id,user_screen_name,tags
0,#imSorry. Kasi hindi ko masuklian pagmamahal k...,31334887931777024,31334887931777024,Sat Jan 29 12:56:34 +0000 2011,False,0,False,31334887931777024,. Kasi hindi ko masuklian pagmamahal ko sayo. :(,41072582,Sassysofia,[#imSorry]
1,"RT @she3sha3y: #Jan25 #Egypt ""not baradei, not...",31334648130830336,31334648130830336,Sat Jan 29 12:55:36 +0000 2011,True,34,True,31334894172901376,"RT @she3sha3y: ""not baradei, not Ikhwan (broth...",29979814,monaeltahawy,"[#Jan25, #Egypt]"
2,#buik #barutahu kalo sebelum cinta fitri di pu...,31334897025024000,31334897025024000,Sat Jan 29 12:56:36 +0000 2011,False,0,False,31334897025024000,kalo sebelum cinta fitri di puter ada lagu in...,129381481,kadekdoi,"[#buik, #barutahu, #pangling]"
3,@1DPaynettes #Twitition @onedirection to gig i...,31334903031267328,31334903031267328,Sat Jan 29 12:56:37 +0000 2011,False,0,False,31334903031267328,@1DPaynettes @onedirection to gig in birmingha...,50865325,keelybennett12,[#Twitition]
4,#SS3SG Eunhyuk !!! <3 http://twitpic.com/3uhp5y,31334401195380736,31334401195380736,Sat Jan 29 12:54:38 +0000 2011,True,18,True,31334906613202944,Eunhyuk !!! <3 http://twitpic.com/3uhp5y,136221182,eunhaecouple,[#SS3SG]
5,From me too...RT @archanavijaya: Its sooo nice...,31334910400663552,31334910400663552,Sat Jan 29 12:56:39 +0000 2011,False,0,False,31334910400663552,From me too...RT @archanavijaya: Its sooo nice...,187203939,Barodian_,[#Mumbai]
6,@fernando_x Resumindo: #jogojusto fail #saraiv...,31334915085697024,31334915085697024,Sat Jan 29 12:56:40 +0000 2011,True,2,False,31334915085697024,@fernando_x Resumindo: fail,13802992,pabloprime,"[#jogojusto, #saraivawins]"
7,Parketpolitie int 750 euro aan openstaande boe...,31334919284195328,31334919284195328,Sat Jan 29 12:56:41 +0000 2011,False,0,False,31334919284195328,Parketpolitie int 750 euro aan openstaande boe...,86923067,VeendamNL,[#Veendam]
8,[Update] #SS3SG Shake It Up http://yfrog.com/h...,31334930269081601,31334930269081601,Sat Jan 29 12:56:44 +0000 2011,True,19,True,31334930269081601,[Update] Shake It Up http://yfrog.com/hsz86hyj...,150212472,DonghaeBiased,[#SS3SG]
9,#anispa #joqr スフィアきた！！,31334948820484096,31334948820484096,Sat Jan 29 12:56:48 +0000 2011,False,0,False,31334948820484096,スフィアきた！！,111845155,8000kei,"[#anispa, #joqr]"


In [5]:
twitter_text = twitter_data['text_no_tags']
twitter_tags = twitter_data['tags']
len(twitter_text), len(twitter_tags)

(1238020, 1238020)

In [6]:
twitter_lengths = twitter_text.apply(lambda s: len(s))
print ('Twitter Max Length (characters):', np.max(twitter_lengths))
print ('Twitter 99th Percentile:', np.percentile(twitter_lengths, 99))
print ('Twitter 140 character percentile:', '{:.3f}'.format(stats.percentileofscore(twitter_lengths, 140)))
num_tags = twitter_tags.apply(lambda t: len(t))
print ('Twitter Max Tags:', np.max(num_tags))

Twitter Max Length (characters): 216
Twitter 99th Percentile: 133.0
Twitter 140 character percentile: 99.324
Twitter Max Tags: 10


In [7]:
# Extract character set

chars = set()
for text in twitter_text:
    chars.update(set(text))
    
chars = sorted(chars)
chars = ['<PAD>'] + chars
print ('{} unique characters.'.format(len(chars)))

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

1286 unique characters.


In [8]:
MAX_LENGTH = 140

In [9]:
# Vectorize tweets

twitter_X = np.zeros((len(twitter_text), MAX_LENGTH), dtype=np.bool)
for i, text in tqdm(enumerate(twitter_text), total=len(twitter_text)):
    for t, char in enumerate(text):
        if t >= MAX_LENGTH:
            break
        twitter_X[i, t] = char_indices[char]

100%|██████████| 1238020/1238020 [00:24<00:00, 50959.29it/s]


In [10]:
# Extract tag set

tag_set = set()
for t in twitter_tags:
    tag_set.update(set(t))
    
tag_set = sorted(tag_set)
print ('{} unique tags.'.format(len(tag_set)))

tag_indices = dict((t, i) for i, t in enumerate(tag_set))
indices_tag = dict((i, t) for i, t in enumerate(tag_set))

748 unique tags.


In [11]:
# Vectorize tags

twitter_y = np.zeros((len(twitter_tags), len(tag_set)), dtype=np.bool)
for i, tags in tqdm(enumerate(twitter_tags), total=len(twitter_tags)):
    for tag in tags:
        twitter_y[i, tag_indices[tag]] = 1

100%|██████████| 1238020/1238020 [00:00<00:00, 1302916.64it/s]


In [12]:
sss = ShuffleSplit(n_splits=1, random_state=12345)

# Create X and y matricies
for train_idx, test_idx in sss.split(twitter_X, twitter_y):
    train_X, test_X = twitter_X[train_idx], twitter_X[test_idx]
    train_y, test_y = twitter_y[train_idx], twitter_y[test_idx]
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((1114218, 140), (1114218, 748), (123802, 140), (123802, 748))

# Build Model

## BiLSTM Baseline

In [13]:
model = Sequential()
model.add(Embedding(len(chars) + 1,
                    64,
                    input_length=MAX_LENGTH,
                    trainable=True,
                    mask_zero=True))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(.5))
model.add(Dense(len(tag_set), activation='softmax'))
print (model.output_shape)


(None, 748)


In [14]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

In [15]:
filepath="./weights.twitter.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_categorical_accuracy', patience=3)
callbacks_list = [checkpoint, early_stop]

In [16]:
model.fit(train_X,
              train_y,
              validation_data=(test_X, test_y),
              shuffle=True,
              batch_size=32,
              epochs=10,
              verbose=0,
              callbacks=callbacks_list)

send_text('BiLSTM Baseline Training complete!')

Epoch 00000: val_categorical_accuracy improved from -inf to 0.16518, saving model to ./weights.twitter.best.hdf5
Epoch 00001: val_categorical_accuracy did not improve
Epoch 00002: val_categorical_accuracy improved from 0.16518 to 0.17534, saving model to ./weights.twitter.best.hdf5
Epoch 00003: val_categorical_accuracy improved from 0.17534 to 0.17878, saving model to ./weights.twitter.best.hdf5
Epoch 00004: val_categorical_accuracy did not improve
Epoch 00005: val_categorical_accuracy did not improve
Epoch 00006: val_categorical_accuracy improved from 0.17878 to 0.18272, saving model to ./weights.twitter.best.hdf5
Epoch 00007: val_categorical_accuracy did not improve
Epoch 00008: val_categorical_accuracy did not improve
Epoch 00009: val_categorical_accuracy did not improve
Messsage sent.


In [16]:
model.load_weights('./weights.twitter.best.hdf5')

In [18]:
pred = model.predict(test_X)
send_text('DONE making predictions.')

Messsage sent.


In [19]:
p = np.argmax(pred, axis=1)
y = np.argmax(test_y, axis=1)
p.shape, y.shape

((123802,), (123802,))

In [20]:
accuracy_score(p, y)

0.18271918062713041

In [24]:
pre_count = 0

for i, p in enumerate(pred):
    args = np.argsort(p)[::-1]
    if test_y[i][args[0]] == 1.:
        pre_count += 1

pre_count / pred.shape[0]
        

0.2033408184035799

In [58]:
tag_counts = defaultdict(lambda: 0)

for y in test_y:
    targets = np.nonzero(y)[0]
    #print (targets)
    for t in targets:
        tag_counts[t] += 1
        
freq_tags = set()
for k, v in tag_counts.items():
    if v > 19000:
        freq_tags.add(k)
        
len(freq_tags)
#indices_tag[299]
#len(tag_counts)

0

In [61]:
mean_ranks = []

for i, p in enumerate(pred):
    ranks = []
    args = np.argsort(p)[::-1]
    #print (p[args], args)
    #break
    targets = np.nonzero(test_y[i])[0]
    for t in targets:
        ranks.append(np.nonzero(args == t)[0][0])
    mean_ranks.extend(ranks)

np.mean(np.asarray(mean_ranks))

3.8947458528740975

In [62]:
recalls = []

for i, p in enumerate(pred):
    recall_count = 0
    args = list(np.argsort(p)[::-1][:10])
    targets = np.nonzero(test_y[i])[0]
    if len(targets) == 0:
        continue
    num_tags = len(targets)
    for t in targets:
        if t in args:
            recall_count += 1
    #mean_ranks.append(np.mean(np.asarray(ranks)))
    recalls.append(recall_count / num_tags)

np.mean(np.asarray(recalls))

0.93907085507503907