In [None]:
import random
import spacy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import defaultdict
from time import time
from tqdm import tqdm

try:
    import cPickle as pickle
except ImportError:
    import pickle

#import dynet_config
#dynet_config.set(mem=1024, random_seed=12345)
#dynet_config.set_gpu()
import dynet as dy

In [None]:
# Load spaCy's 'en' pipeline; `nlp.pipe` is used further down to tokenise the
# tweets and to drop stop words (`is_stop`).
print("Loading spaCy")
nlp = spacy.load('en')
# Sanity check on the loaded pipeline before continuing.
assert nlp.path is not None
print ('Done.')

## Settings

In [None]:
# Hyper-parameters and special tokens shared by the cells below.
MAX_LEN = 52  # parsed token lists are truncated to this many tokens
EMBEDDING_DIM = 100  # width of the EMBED_A / EMBED_B / EMBED_C lookup tables
HIDDEN_DIM = 300  # NOTE(review): not referenced anywhere in the visible notebook
HOPS = 2  # default number of attention hops in calc_scores
UNK_THRESHOLD = 10000  # NOTE(review): not referenced anywhere in the visible notebook
LEARNING_RATE = 0.01  # passed to dy.AdagradTrainer
DROPOUT = 0.2  # NOTE(review): not referenced anywhere in the visible notebook
NUM_HISTORIES = 5  # a tweet is kept only if its user has exactly this many history rows
NUM_TAGS = 3883  # number of most frequent training tags kept; also the output size
BATCH_SIZE = 16  # tweets per training batch
TEST_SAMPLE = 1000  # dev tweets randomly sampled for each validation pass

# Special tokens added to the vocabulary after the training texts are parsed.
UNK = '<UNK>'
START = '<S>'
END = '</S>'

## Format data and align user histories

Note: the cells below cache their preprocessed results on disk and reload them when available, so the notebook does not have to recompute them (which can take a long time) after a restart.

In [None]:
# Tweet tables for the three splits. The alignment step below reads the
# 'user_screen_name', 'text_no_tags' and 'tags' columns from them.
train_df = pd.read_pickle('huang2016_train.pkl')
dev_df = pd.read_pickle('huang2016_valid.pkl')
test_df = pd.read_pickle('huang2016_test.pkl')

In [None]:
# Per-user history tables for the three splits. The alignment step below reads
# the 'user_screen_name' and 'text_no_tags' columns from them.
train_hist_df = pd.read_pickle('huang2016_user_histories_train.pkl')
dev_hist_df = pd.read_pickle('huang2016_user_histories_valid.pkl')
test_hist_df = pd.read_pickle('huang2016_user_histories_test.pkl')

In [None]:
def align_histories(data, histories):
    """Pair every tweet with the history tweets of its author.

    Tweets whose author does not have exactly NUM_HISTORIES rows in
    `histories` are dropped.

    Args:
        data: DataFrame with 'user_screen_name', 'text_no_tags' and 'tags' columns.
        histories: DataFrame with 'user_screen_name' and 'text_no_tags' columns.

    Returns:
        A tuple `(texts, tags, histories_by_slot)`. `texts` and `tags` are
        parallel lists over the kept tweets. `histories_by_slot` is transposed:
        one tuple per history position, each holding that position's text for
        every kept tweet (an empty list when no tweet is kept).
    """
    # Group the history texts by user once, instead of re-filtering the whole
    # `histories` frame for every row of `data`. groupby keeps the original row
    # order inside each group, so each user's list is in the same order as the
    # boolean-mask selection it replaces.
    texts_by_user = {
        user: list(user_texts)
        for user, user_texts in histories.groupby('user_screen_name', sort=False)['text_no_tags']
    }

    aligned_texts = []
    aligned_tags = []
    aligned_histories = []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        user_history = texts_by_user.get(row['user_screen_name'], ())
        if len(user_history) != NUM_HISTORIES:
            continue
        aligned_texts.append(row['text_no_tags'])
        aligned_tags.append(row['tags'])
        aligned_histories.append(user_history)

    return aligned_texts, aligned_tags, list(zip(*aligned_histories))

In [None]:
# Reuse the cached TRAIN alignment when it exists; otherwise align and cache it.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# this notebook wrote itself.
try:
    with open('huang2016_train.aligned.pkl', 'rb') as f:
        twitter_texts, twitter_tags, twitter_histories = pickle.load(f)
except Exception:
    # Any ordinary failure (missing, truncated or incompatible cache) means
    # "rebuild". `Exception` rather than a bare `except:` so that a kernel
    # interrupt during the load is not mistaken for a cache miss.
    twitter_texts, twitter_tags, twitter_histories = align_histories(train_df, train_hist_df)
    with open('huang2016_train.aligned.pkl', 'wb') as f:
        pickle.dump((twitter_texts, twitter_tags, twitter_histories), f)

In [None]:
# Reuse the cached DEV alignment when it exists; otherwise align and cache it.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# this notebook wrote itself.
try:
    with open('huang2016_valid.aligned.pkl', 'rb') as f:
        dev_texts, dev_tags, dev_histories = pickle.load(f)
except Exception:
    # Any ordinary failure (missing, truncated or incompatible cache) means
    # "rebuild". `Exception` rather than a bare `except:` so that a kernel
    # interrupt during the load is not mistaken for a cache miss.
    dev_texts, dev_tags, dev_histories = align_histories(dev_df, dev_hist_df)
    with open('huang2016_valid.aligned.pkl', 'wb') as f:
        pickle.dump((dev_texts, dev_tags, dev_histories), f)

In [None]:
# Reuse the cached TEST alignment when it exists; otherwise align and cache it.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# this notebook wrote itself.
try:
    with open('huang2016_test.aligned.pkl', 'rb') as f:
        test_texts, test_tags, test_histories = pickle.load(f)
except Exception:
    # Any ordinary failure (missing, truncated or incompatible cache) means
    # "rebuild". `Exception` rather than a bare `except:` so that a kernel
    # interrupt during the load is not mistaken for a cache miss.
    test_texts, test_tags, test_histories = align_histories(test_df, test_hist_df)
    with open('huang2016_test.aligned.pkl', 'wb') as f:
        pickle.dump((test_texts, test_tags, test_histories), f)

## Data processing

### Extract tags

In [None]:
def index_tags(tags_list, tag_set, tag_dict):
    """Map each tweet's tags to integer ids, dropping tags outside `tag_set`.

    Args:
        tags_list: iterable of per-tweet tag iterables.
        tag_set: collection of the tags to keep (any iterable of hashable tags).
        tag_dict: mapping from tag to id, indexed with `tag_dict[tag]`; a
            defaultdict may therefore assign new ids as a side effect.

    Returns:
        A list with one list of ids per entry of `tags_list`, in input order.
    """
    # The notebook passes a sorted *list* as `tag_set`; convert it once so the
    # membership test is a hash lookup instead of a linear scan per tag.
    allowed = frozenset(tag_set)
    return [[tag_dict[tag] for tag in tags if tag in allowed] for tags in tags_list]

In [None]:
# Build the tag inventory from the training tweets.

# How often each tag occurs in the training set.
tag_counts = defaultdict(int)
for tweet_tags in twitter_tags:
    for tag in tweet_tags:
        tag_counts[tag] += 1

# Keep the NUM_TAGS most frequent tags.
tags_by_frequency = sorted(tag_counts, key=tag_counts.get, reverse=True)
top_k_tags = set(tags_by_frequency[:NUM_TAGS])

# Sorted list of the retained tags that occur in the training tweets.
tag_set = sorted({
    tag
    for tweet_tags in twitter_tags
    for tag in tweet_tags
    if tag in top_k_tags
})
print ('{} unique tags.'.format(len(tag_set)))
#NUM_TAGS = len(tag_set)

#tag_indices = dict((t, i) for i, t in enumerate(tag_set))
#indices_tag = dict((i, t) for i, t in enumerate(tag_set))
# Ids are handed out in order of first lookup, i.e. first appearance in training.
tag_indexes = defaultdict(lambda: len(tag_indexes))
parsed_tags = index_tags(twitter_tags, tag_set, tag_indexes)

In [None]:
# Vectorize tags as a dense multi-hot matrix (DON'T USE)
#
# Kept for reference only: the training loop consumes the id lists in
# `parsed_tags`, not this matrix, and the dense array can be very large.

# Column of each tag = its position in the sorted `tag_set`.
tag_indices = {tag: col for col, tag in enumerate(tag_set)}

# Plain `bool` dtype: the `np.bool` alias no longer exists in current NumPy.
twitter_y = np.zeros((len(twitter_tags), len(tag_set)), dtype=bool)
for i, tags in tqdm(enumerate(twitter_tags), total=len(twitter_tags)):
    for tag in tags:
        col = tag_indices.get(tag)
        # Tags outside `tag_set` have no column and are skipped.
        if col is not None:
            twitter_y[i, col] = True

In [None]:
# Display the tag matrix built in the previous cell.
twitter_y

### Parse training data

In [None]:
# Load the parsed TRAIN data (vocabulary + index sequences) from the cache, or
# tokenise everything with spaCy and write the cache.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# this notebook wrote itself.
try:
    print ('Attempting to open preprecessed TRAIN data ... ', end='')
    t0=time()
    with open('parsed_twitter_train_data.pkl', 'rb') as f:
        vocab, parsed_texts, parsed_histories = pickle.load(f)
    print ('DONE. ({:.3f}s)'.format(time()-t0))

except Exception:
    # Any ordinary failure (missing, truncated or incompatible cache) means
    # "rebuild". `Exception` rather than a bare `except:` so that a kernel
    # interrupt during the load is not mistaken for a cache miss.
    print ('FAIL.')
    # Every previously unseen token gets the next free index on first lookup.
    vocab = defaultdict(lambda: len(vocab))
    print ('\tParsing texts ... ', end='')
    t0=time()
    # Per tweet: strip non-ASCII characters, lower-case, tokenise, drop stop
    # words, map tokens to indices and truncate to MAX_LEN.
    parsed_texts = [[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in twitter_texts], n_threads=3, batch_size=20000)]
    print ('DONE. ({:.3f}s)'.format(time()-t0))

    print ('\tParsing histories ... ', end='')
    t0=time()
    # `twitter_histories` holds one sequence per history slot; parse slot by
    # slot, then transpose so each entry holds the histories of one tweet.
    parsed_histories = [[[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in h], n_threads=3, batch_size=20000)] for h in twitter_histories]
    parsed_histories = list(zip(*parsed_histories))
    print ('DONE. ({:.3f}s)'.format(time()-t0))

    # Looking the special tokens up here adds them to the vocabulary *before*
    # it is pickled, so the cached copy contains them too.
    unk_idx = vocab[UNK]
    sos_idx = vocab[START]
    eos_idx = vocab[END]

    print ('\tSAVING parsed data ... ', end='')
    t0=time()
    with open('parsed_twitter_train_data.pkl', 'wb') as f:
        # Stored as a plain dict: the defaultdict's lambda cannot be pickled.
        pickle.dump((dict(vocab), parsed_texts, parsed_histories), f)
    print ('DONE. ({:.3f}s)'.format(time()-t0))

unk_idx = vocab[UNK]
sos_idx = vocab[START]
eos_idx = vocab[END]
# Set unknown words to be UNK --> note as written, the paper does not indicate that any training data is labeled as UNK...
# From here on an unseen token maps to `unk_idx` (and is stored in `vocab` with that value).
vocab = defaultdict(lambda: unk_idx, vocab)
idx_to_vocab = {v: k for k, v in vocab.items()}

# Taken before dev/test are parsed, so it counts training-time entries only.
VOCAB_SIZE = len(vocab)

### Parse dev and test data

In [None]:
# Load the parsed DEV/TEST data from the cache, or tokenise it with spaCy and
# write the cache.
# NOTE: the cached indices are only valid for the training vocabulary they were
# built with -- delete this cache whenever the TRAIN cache is regenerated.
# NOTE: pickle.load can execute arbitrary code -- only load cache files that
# this notebook wrote itself.
try:
    print ('Attempting to open preprecessed DEV and TEST data ... ', end='')
    t0=time()
    with open('parsed_twitter_test_dev_data.pkl', 'rb') as f:
        parsed_dev_texts, parsed_test_texts, parsed_dev_histories, parsed_test_histories = pickle.load(f)
    print ('DONE. ({:.3f}s)'.format(time()-t0))

except Exception:
    # Any ordinary failure (missing, truncated or incompatible cache) means
    # "rebuild". `Exception` rather than a bare `except:` so that a kernel
    # interrupt during the load is not mistaken for a cache miss.
    print ('FAIL.')
    print ('\tParsing texts ... ', end='')
    t0=time()
    # Same preprocessing as for the training texts; tokens missing from the
    # training vocabulary map to `unk_idx` through the defaultdict.
    parsed_dev_texts = [[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in dev_texts], n_threads=3, batch_size=20000)]
    parsed_test_texts = [[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in test_texts], n_threads=3, batch_size=20000)]
    print ('DONE. ({:.3f}s)'.format(time()-t0))

    print ('\tParsing histories ... ', end='')
    t0=time()
    # Histories are stored per slot; parse slot by slot, then transpose so each
    # entry holds the histories of one tweet.
    parsed_dev_histories = [[[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in h], n_threads=3, batch_size=20000)] for h in dev_histories]
    parsed_test_histories = [[[vocab[str(w)] for w in t if not w.is_stop][:MAX_LEN] for t in nlp.pipe([x.encode('ascii', 'ignore').decode('ascii').lower() for x in h], n_threads=3, batch_size=20000)] for h in test_histories]
    parsed_dev_histories = list(zip(*parsed_dev_histories))
    parsed_test_histories = list(zip(*parsed_test_histories))
    print ('DONE. ({:.3f}s)'.format(time()-t0))

    print ('\tSAVING parsed data ... ', end='')
    t0=time()
    with open('parsed_twitter_test_dev_data.pkl', 'wb') as f:
        pickle.dump((parsed_dev_texts, parsed_test_texts, parsed_dev_histories, parsed_test_histories), f)
    print ('DONE. ({:.3f}s)'.format(time()-t0))

## Construct the model

In [None]:
# Initialize dynet model
# NOTE(review): keep the parameter creation order below stable -- presumably a
# saved 'sota.model' is matched against it when it is loaded back; confirm
# against the DyNet docs before reordering.
model = dy.ParameterCollection()

# The paper uses AdaGrad
trainer = dy.AdagradTrainer(model, learning_rate=LEARNING_RATE)

# Embedding parameters
# Three lookup tables, one EMBEDDING_DIM-wide row per vocabulary index.
EMBED_A = model.add_lookup_parameters((VOCAB_SIZE, EMBEDDING_DIM))
EMBED_B = model.add_lookup_parameters((VOCAB_SIZE, EMBEDDING_DIM))
EMBED_C = model.add_lookup_parameters((VOCAB_SIZE, EMBEDDING_DIM))

# User interest encoder parameters
# Used by the sentence-level attention in calc_scores.
p_W_o = model.add_parameters((NUM_HISTORIES, EMBEDDING_DIM))
p_W_s = model.add_parameters((NUM_HISTORIES, EMBEDDING_DIM))
p_W_ms = model.add_parameters(NUM_HISTORIES)

# Final prediction layer
# W_f / b_f map the concatenation [o_0; o_h] (2 * EMBEDDING_DIM values) to
# NUM_TAGS values; theta_s is a further NUM_TAGS x NUM_TAGS matrix applied
# (transposed) on top of that in calc_scores.
p_theta_s = model.add_parameters((NUM_TAGS, NUM_TAGS))
p_W_f = model.add_parameters((NUM_TAGS, EMBEDDING_DIM*2))
p_b_f = model.add_parameters(NUM_TAGS)

In [None]:
def calc_scores(tweet, histories, h=HOPS):
    """Score every tag for one tweet, attending over the author's history tweets.

    Must be called inside a live computation graph (after `dy.renew_cg()`).

    Args:
        tweet: sequence of vocabulary indices of the tweet. Callers skip empty
            tweets before calling.
        histories: sequence of history tweets, each a sequence of vocabulary
            indices. Callers skip examples with an empty history.
        h: number of attention hops over the histories.

    Returns:
        A DyNet expression holding the softmax over the tag scores.
    """
    W_o     = dy.parameter(p_W_o)
    W_s     = dy.parameter(p_W_s)
    W_ms    = dy.parameter(p_W_ms)
    W_f     = dy.parameter(p_W_f)
    theta_s = dy.parameter(p_theta_s)
    b_f     = dy.parameter(p_b_f)

    # Tweet encoder: sum of the tweet's word embeddings.
    o_0 = dy.esum([dy.lookup(EMBED_A, x) for x in tweet])

    # History embeddings, one matrix per history with one column per word.
    # They do not depend on the hop state, so they are built once instead of
    # once per hop.
    m_emb = [dy.concatenate([dy.lookup(EMBED_B, x) for x in t], 1) for t in histories]
    # The output side uses its own table, EMBED_C. It was previously looked up
    # in EMBED_B, which made c_emb a copy of m_emb and left EMBED_C unused (and
    # therefore untrained in any model saved before this change).
    c_emb = [dy.concatenate([dy.lookup(EMBED_C, x) for x in t], 1) for t in histories]

    o_h = o_0

    # User interest encoder
    for _ in range(h):
        # Word-level attention: weight each history word by its match with the
        # current state, then sum the weighted output embeddings per history.
        p = [dy.softmax(dy.transpose(o_h) * m, 1) for m in m_emb]
        s = [dy.sum_dim(dy.cmult(p_k, c), [1], True) for p_k, c in zip(p, c_emb)]

        # Sentence-level attention
        m_s = [dy.tanh(W_o * o_h + W_s * s_i) for s_i in s]
        # NOTE(review): each softmax below is taken over one history's score on
        # its own, which looks like it always yields weight 1 -- confirm whether
        # normalising across the histories was intended.
        p_s = [dy.softmax(dy.transpose(W_ms) * m, 0) for m in m_s]

        u = dy.esum([dy.cmult(p_k, s_k) for p_k, s_k in zip(p_s, s)])

        o_h = o_h + u

    # Final prediction from the initial and the final state.
    out = dy.softmax(dy.transpose(theta_s) * (W_f * dy.concatenate([o_0, o_h]) + b_f))

    return out
    

In [None]:
# Training loop with periodic validation on a random sample of the dev set.

train = list(zip(parsed_texts, parsed_tags, parsed_histories))
# NOTE: `dev_tags` is overwritten with tag ids here, so this cell is not
# idempotent -- reload the aligned dev data before running it a second time.
dev_tags = index_tags(dev_tags, tag_set, tag_indexes)
dev = list(zip(parsed_dev_texts, dev_tags, parsed_dev_histories))

NUM_EPOCHS = 60   # passes over the training data
EVAL_EVERY = 100  # run a validation pass every this many batches


def _is_usable(words, tags, hists):
    """True when the example has words, at least one known tag and only non-empty histories."""
    return (len(tags) > 0 and len(hists) > 0 and len(words) > 0
            and all(len(hist) > 0 for hist in hists))


def _example_loss(words, tags, hists):
    """Summed negative log-probability of the example's tags, as a DyNet expression."""
    scores = calc_scores(words, hists)
    return dy.esum([-dy.log(scores[tag]) for tag in tags])


def _sampled_dev_loss(sample_size, show_progress=False):
    """Mean loss per evaluated tweet over a random sample of `dev` (nan if none is usable)."""
    # Never ask for more examples than exist; random.sample would raise.
    sample = random.sample(dev, min(sample_size, len(dev)))
    if show_progress:
        sample = tqdm(sample, total=len(sample))
    total_loss = 0.
    evaluated = 0
    for words, tags, hists in sample:
        if not _is_usable(words, tags, hists):
            continue
        dy.renew_cg()
        total_loss += _example_loss(words, tags, hists).value()
        evaluated += 1
    # Average over the tweets actually scored, not over the sample/dev size.
    return total_loss / evaluated if evaluated else float('nan')


print ('Using batch size of {}.'.format(BATCH_SIZE))

for ITER in range(NUM_EPOCHS):
    # Train
    random.shuffle(train)

    batches = [train[i:i + BATCH_SIZE] for i in range(0, len(train), BATCH_SIZE)]

    # Per-epoch accumulators (previously reset on every batch, so the epoch
    # summary always reported a loss of 0 and the time of the last batch only).
    train_loss = 0.0
    trained_tweets = 0
    t0 = time()

    for i, batch in enumerate(tqdm(batches, total=len(batches))):
        dy.renew_cg()

        losses = [_example_loss(words, tags, hists)
                  for words, tags, hists in batch
                  if _is_usable(words, tags, hists)]

        # Batch update; skipped when every example of the batch was filtered
        # out, because dy.esum cannot sum an empty list.
        if losses:
            batch_loss = dy.esum(losses)/BATCH_SIZE
            train_loss += batch_loss.value() * BATCH_SIZE
            batch_loss.backward()
            trainer.update()
            trained_tweets += len(losses)

        # Do frequent tests to get a loss graph
        if i % EVAL_EVERY == 0:
            tt = time()
            val_loss = _sampled_dev_loss(TEST_SAMPLE)
            print("val_loss={:.4f}, time={:.3f}".format(val_loss, time()-tt))
    print("Iteration %r: train loss/tweet=%.4f, time=%.2fs" % (ITER, train_loss / max(trained_tweets, 1), time() - t0))

    print("Iteration %r: val loss=%.4f" % (ITER, _sampled_dev_loss(TEST_SAMPLE, show_progress=True)))

In [None]:
# Write the parameters of `model` to 'sota.model' in the working directory.
model.save("sota.model")

In [None]:
# Fill `model` with the parameter values stored in 'sota.model'.
# NOTE(review): presumably `model` must already hold the same parameters,
# created in the same order as when the file was saved -- confirm.
model.populate('sota.model')