In [1]:
import tensorflow as tf
import zipfile
import lxml.etree
import re
import urllib.request
import os
import numpy as np


In [2]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB.
# The archive is first written under a temporary ".part" name and only renamed once the
# transfer has finished, so an interrupted download cannot leave behind a truncated zip
# that the isfile() check below would later mistake for the complete dataset.
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip.part")
    os.replace('ted_en-20160408.zip.part', 'ted_en-20160408.zip')

## Data Preparation

In [5]:
def parse(doc):
    """Collect one (content, keywords) pair per ``//file`` node of ``doc``.

    For every node matched by the XPath ``//file``, the first result of
    ``./content/text()`` and the first result of ``./head/keywords/text()``
    are taken and returned together as a tuple, in document order.
    """
    pairs = []
    for file_node in doc.xpath("//file"):
        content = file_node.xpath("./content/text()")[0]
        keywords = file_node.xpath("./head/keywords/text()")[0]
        pairs.append((content, keywords))
    return pairs

def label(keywords):
    """Encode a comma-separated keyword string as a one-hot vector of length 8.

    Keywords are compared after stripping whitespace and lower-casing. The
    position of the single 1 is a 3-bit code: "technology" contributes 4,
    "entertainment" contributes 2 and "design" contributes 1, so index 0
    means none of the three keywords is present and index 7 means all are.
    """
    present = {kw.strip().lower() for kw in keywords.split(",")}

    index = 0
    if "technology" in present:
        index += 4
    if "entertainment" in present:
        index += 2
    if "design" in present:
        index += 1

    one_hot = np.zeros(8)
    one_hot[index] = 1
    return one_hot

def tokenised_sentences(text):
    """Split a transcript into sentences and each sentence into lowercase tokens.

    Processing steps:
      1. Parenthesised spans, e.g. "(Laughter)", are removed.
      2. On each line, a leading prefix of at most 20 colon-free characters
         followed by a colon is dropped (presumably a speaker label).
      3. The remainder is split into sentences on '.'.
      4. Each sentence is lower-cased, every run of characters outside
         [a-z0-9] becomes a single space, and the result is split on whitespace.

    Sentences that contain no tokens are dropped. Filtering only on the raw
    string being non-empty let whitespace- or punctuation-only fragments
    (such as those produced by "...") through as empty token lists.

    Args:
        text: the transcript as a single string; lines are separated by '\\n'.

    Returns:
        A list of sentences, each a non-empty list of token strings.
    """
    without_parens = re.sub(r'\([^)]*\)', '', text)
    tokenised = []
    for line in without_parens.split('\n'):
        # This pattern always matches: the prefix group is optional and `.*` may be empty.
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        for sentence in m.group('postcolon').split('.'):
            tokens = re.sub(r"[^a-z0-9]+", " ", sentence.lower()).split()
            if tokens:
                tokenised.append(tokens)
    return tokenised

# Read the TED XML straight out of the archive. The member file is opened in its own
# `with` block so that its handle is closed as soon as parsing finishes instead of
# being left for the garbage collector.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

# Every talk becomes a pair: (tokenised sentences of the transcript, one-hot label).
all_data_raw = [(tokenised_sentences(content), label(keywords)) for (content, keywords) in parse(doc)]
# The parsed tree is not used again after the talks are extracted; drop the reference.
del doc

# Sanity check: the first five tokenised sentences of the first talk.
print(all_data_raw[0][0][:5])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing'], ['consider', 'facit'], ['i', 'm', 'actually', 'old', 'enough', 'to', 'remember', 'them']]


## Word Embeddings

In [6]:
# Number of dimensions of each word vector; passed to Word2Vec as its `size` argument.
embedding_size = 50

In [7]:
from gensim.models import Word2Vec

# Word2Vec trains on sentences, so flatten the per-talk sentence lists into one list.
all_sentences = [sentence for content, _ in all_data_raw for sentence in content]

# NOTE(review): `size=` is the keyword used by gensim releases before 4.0 (renamed
# `vector_size` in 4.0) — confirm against the installed gensim version.
word2vec = Word2Vec(all_sentences, size=embedding_size, window=5, min_count=5, workers=12)

# Query the trained vectors through `.wv`: calling most_similar() directly on the
# model is deprecated in gensim and was removed in 4.0.
word2vec.wv.most_similar("computer")

[('machine', 0.7851342558860779),
 ('robot', 0.7790449857711792),
 ('software', 0.7522289156913757),
 ('simulation', 0.7189420461654663),
 ('video', 0.7187880277633667),
 ('device', 0.714058518409729),
 ('chip', 0.7101123929023743),
 ('3d', 0.7047430276870728),
 ('keyboard', 0.6985288858413696),
 ('animation', 0.6957977414131165)]

Split into training/validation/test sets:

In [10]:
# Sequential split of the talks: the first 1585 for training, the next 250 for
# validation, and whatever remains for testing.
n_train, n_validation = 1585, 250
validation_end = n_train + n_validation

training_sentences = all_data_raw[:n_train]
validation = all_data_raw[n_train:validation_end]
test = all_data_raw[validation_end:]

## Model

We need to pad every talk to the maximum word count, because TensorFlow requires all training inputs to have the same length:

In [18]:
def _flatten_talk(talk_sentences):
    """Concatenate a talk's tokenised sentences into one flat list of words."""
    flat_words = []
    for sentence_tokens in talk_sentences:
        flat_words.extend(sentence_tokens)
    return flat_words

# One (flat word list, label) pair per training talk.
training_words = [(_flatten_talk(talk_sentences), talk_label)
                  for talk_sentences, talk_label in training_sentences]
print(training_words[0][0][:40])

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new', 'to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']


In [29]:
# Word count of the longest training talk.
max_length = max(len(words) for words, _ in training_words)
print("max length: %d" % max_length)
# All-zero vector with one entry per embedding dimension.
pad_word = np.zeros(embedding_size)

# wrapper on word2vec model that handles unknown words
def w2v(word):
    """Return the embedding of ``word``, or ``pad_word`` when the word is unknown.

    Lookups go through ``word2vec.wv``: indexing the model object itself and
    testing membership on it are deprecated in gensim and were removed in 4.0.

    Args:
        word: a single token string.

    Returns:
        The vector stored for ``word`` in the trained model, or the module-level
        ``pad_word`` vector if the model's vocabulary does not contain it.
    """
    vectors = word2vec.wv
    return vectors[word] if word in vectors else pad_word

def _padded_vectors(words):
    """Map each word through w2v and right-pad with pad_word up to max_length entries."""
    vectors = [w2v(word) for word in words]
    vectors.extend([pad_word] * (max_length - len(words)))
    return vectors

# One (padded vector list, original word count, label) triple per training talk.
training_vectors = [(_padded_vectors(words), len(words), talk_label)
                    for words, talk_label in training_words]

# First and last vector of the first talk.
print(training_vectors[0][0][:1])
print(training_vectors[0][0][-1:])

max length: 5387
[array([ 1.7053777 ,  0.44375089,  2.40147877,  1.56469464, -0.5336141 ,
       -0.68148601,  0.82912147,  2.25279331, -0.95593244,  1.33701634,
        0.76937181, -3.19962335,  1.60973144,  0.19883487,  1.22950065,
       -0.51542664,  0.34617937, -0.08367246, -0.81200767,  1.57739055,
        0.71759522,  1.76497483,  1.08457708,  2.28248453,  0.81796366,
       -0.9854821 , -1.71136069,  1.10090268,  1.52877855, -1.37763691,
       -3.15039611,  0.15048994,  0.86681312, -1.33970571,  0.74007791,
       -0.23219641, -0.67506731, -2.35610747, -2.12424803, -1.2421999 ,
       -1.05194223,  1.17314637,  0.26060024,  1.90340781,  1.84374464,
       -0.12940843,  0.61424488, -1.1746037 , -0.72992647, -0.30587038], dtype=float32)]
[array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  

Metaparameters:

In [51]:
# Step size handed to the Adam optimizer in the training cell.
learning_rate = 0.001
# Number of full passes over the training data.
training_epochs = 15
# Number of examples sliced off for each optimisation step.
batch_size = 50

Network parameters:

In [52]:
# Width of the embedding layer. This re-assigns the `embedding_size` already set in
# the Word Embeddings section, to the same value.
embedding_size = 50
# Number of units in the tanh hidden layer.
hidden_size = 10

We keep the embeddings separate, even though they are just weights, the same as in the subsequent layers:

In [53]:
# Trainable embedding matrix of shape [zhv_size, embedding_size], randomly initialised.
# NOTE(review): `zhv_size` is not defined in any visible cell — presumably the input
# vocabulary size; confirm where it is set, otherwise this fails on a fresh kernel.
# NOTE(review): this rebinds `w2v`, which an earlier cell defined as a function.
w2v = tf.Variable(tf.random_normal([zhv_size, embedding_size]))

Model:

In [54]:
def embedding(text, w2v):
    """Project ``text`` through the embedding matrix: the matrix product text x w2v."""
    embedded = tf.matmul(text, w2v)
    return embedded

def model(text, weights, biases, w2v):
    """Build the classifier graph: embedding, one tanh hidden layer, linear output.

    Args:
        text: input tensor, multiplied by ``w2v`` via ``embedding``.
        weights: dict with hidden-layer matrix 'W' and output-layer matrix 'V'.
        biases: dict with hidden-layer bias 'b' and output-layer bias 'c'.
        w2v: embedding matrix.

    Returns:
        The output-layer values; no softmax is applied here.
    """
    embedded = embedding(text, w2v)
    hidden_input = tf.add(tf.matmul(embedded, weights['W']), biases['b'])
    hidden = tf.tanh(hidden_input)
    output = tf.add(tf.matmul(hidden, weights['V']), biases['c'])
    return output

Network state:

In [55]:
# parameters that are fixed for the task:
# Number of output classes; matches the length-8 one-hot vectors built by label().
n_classes = 8

# Input placeholders: a batch of rows of width zhv_size and their one-hot labels.
# NOTE(review): `zhv_size` is not defined in any visible cell — confirm where it is set.
# NOTE(review): `label` rebinds the name of the label() function defined in the
# Data Preparation section; re-running that section's cells after this one will fail.
text = tf.placeholder(tf.float32, [None, zhv_size])
label = tf.placeholder(tf.float32, [None, n_classes])

# Randomly initialised layer matrices: 'W' maps embedding -> hidden, 'V' maps hidden -> classes.
weights = {
    'W': tf.Variable(tf.random_normal([embedding_size, hidden_size])),
    'V': tf.Variable(tf.random_normal([hidden_size, n_classes]))
}
# Randomly initialised bias vectors: 'b' for the hidden layer, 'c' for the output layer.
biases = {
    'b': tf.Variable(tf.random_normal([hidden_size])),
    'c': tf.Variable(tf.random_normal([n_classes])),
}

Train:

In [61]:
# Output-layer values of the network for the `text` placeholder.
pred = model(text, weights, biases, w2v)

# Mean softmax cross-entropy between the network output and the one-hot labels,
# minimised with Adam at the configured learning rate.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=label))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(training_epochs):
        avg_cost = 0.
        # NOTE(review): `training` is not defined in any visible cell — presumably a
        # list of (input row, one-hot label) pairs; confirm where it is built.
        total_batch = int(np.ceil(len(training) / batch_size))
        for i in range(total_batch):
            # TODO: sample the batch
            # Batches are consecutive slices in a fixed order; the last may be shorter.
            batch = training[i * batch_size : (i+1) * batch_size]
            # The comprehension variables `text` and `label` are local to the
            # comprehensions (Python 3), so the feed_dict keys below still refer to
            # the placeholders.
            batch_text = np.array([text for (text, label) in batch])
            batch_label = np.array([label for (text, label) in batch])
            _, c = sess.run([optimizer, cost], feed_dict = { text: batch_text, label: batch_label})
            # Every batch contributes equally, so a shorter final batch carries the
            # same weight as a full one in the reported average.
            avg_cost += c / total_batch
        print("Epoch: ", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
        
    # Accuracy: fraction of examples whose arg-max output equals the arg-max of the label.
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(label, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, "float"))
    # NOTE(review): the visible definition of `test` holds (tokenised sentences, label)
    # pairs, which does not look like input for the `text` placeholder — confirm that
    # `test` was rebuilt in the same format as `training` before this cell ran.
    test_text = np.array([text for (text, label) in test])
    test_label = np.array([label for (text, label) in test])
    print("test set accuracy: ", accuracy.eval({text: test_text, label: test_label}))

Epoch:  0001 cost= 2.130254768
Epoch:  0002 cost= 1.490307493
Epoch:  0003 cost= 1.352485389
Epoch:  0004 cost= 1.279339911
Epoch:  0005 cost= 1.216089798
Epoch:  0006 cost= 1.162066670
Epoch:  0007 cost= 1.113673551
Epoch:  0008 cost= 1.069420913
Epoch:  0009 cost= 1.028165305
Epoch:  0010 cost= 0.989132142
Epoch:  0011 cost= 0.951763185
Epoch:  0012 cost= 0.915668884
Epoch:  0013 cost= 0.880585162
Epoch:  0014 cost= 0.846335384
Epoch:  0015 cost= 0.812796473
test set accuracy:  0.329218
