In [21]:
import tensorflow as tf
import zipfile
import lxml.etree
import re
import urllib.request
import os
import numpy as np


In [7]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB.
# The archive is first written to a temporary ".part" file and only renamed into place
# once the transfer has completed, so an interrupted download cannot leave behind a
# truncated zip that the isfile() guard would accept on the next run.
if not os.path.isfile('ted_en-20160408.zip'):
    partial_path = 'ted_en-20160408.zip.part'
    try:
        urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename=partial_path)
        os.replace(partial_path, 'ted_en-20160408.zip')
    finally:
        # After a successful rename the partial file no longer exists; if anything
        # failed, remove whatever incomplete data was written.
        if os.path.exists(partial_path):
            os.remove(partial_path)

## Data Preparation

In [16]:
def parse(doc):
    """Collect one (content, keywords) pair for every <file> element in the document.

    For each element matched by "//file", the first text node of ./content and the
    first text node of ./head/keywords are taken.

    Args:
        doc: an object exposing an ``xpath`` method (here, the parsed lxml tree).

    Returns:
        A list of (content_text, keywords_text) tuples, in document order.

    Raises:
        IndexError: if a <file> element has no content or keywords text node.
    """
    pairs = []
    for talk in doc.xpath("//file"):
        transcript = talk.xpath("./content/text()")[0]
        keyword_str = talk.xpath("./head/keywords/text()")[0]
        pairs.append((transcript, keyword_str))
    return pairs

def label(keywords):
    """Encode a talk's keyword string as a 3-bit class id in the range 0..7.

    The comma-separated keywords are stripped and lower-cased, then each topic of
    interest contributes one bit: "technology" -> 4, "entertainment" -> 2,
    "design" -> 1.

    Args:
        keywords: comma-separated keyword string.

    Returns:
        The sum of the bits of the topics present (0 when none are present).
    """
    tags = {kw.strip().lower() for kw in keywords.split(",")}
    code = 0
    for bit, topic in ((4, "technology"), (2, "entertainment"), (1, "design")):
        if topic in tags:
            code += bit
    return code

def normalise(text):
    """Turn a raw transcript into a flat list of lower-case alphanumeric tokens.

    Steps:
      1. Remove parenthesised spans such as "(Laughter)".
      2. On each line, drop an optional leading "prefix:" of at most 20 non-colon
         characters (e.g. a speaker name) and keep the rest.
      3. Split the rest on '.' and discard empty pieces.
      4. Join everything, lower-case it, replace every run of characters outside
         a-z / 0-9 with a single space, and split on whitespace.

    Args:
        text: the raw transcript string.

    Returns:
        A list of token strings.
    """
    without_parens = re.sub(r'\([^)]*\)', '', text)

    sentences = []
    for line in without_parens.split('\n'):
        match = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        spoken = match.group('postcolon')
        sentences += [piece for piece in spoken.split('.') if piece]

    flattened = ' '.join(sentences).lower()
    return re.sub(r"[^a-z0-9]+", " ", flattened).split()

# Parse the XML straight out of the zip archive. The archive member is opened in its
# own `with` block so that its file handle is closed explicitly once parsing is done.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)

# One (token list, class id) pair per talk.
all_data_raw = [(normalise(content), label(keywords)) for (content, keywords) in parse(doc)]
# The parsed tree is no longer needed once the pairs have been extracted.
del doc


Split into training/validation/test sets:

In [9]:
# Slice the dataset by position: the first 1585 examples are used for training, the
# next 250 for validation, and everything after that for testing.
# NOTE(review): `all_data` is not defined in any cell visible above (only `all_data_raw`,
# and further down `all_data_ohbom`), so this cell presumably relies on state left over
# from an earlier kernel session and would raise NameError on a fresh
# Restart & Run All — confirm which list was intended.
training = all_data[:1585]
validation = all_data[1585:1585+250]
test = all_data[1585+250:]

Define vocabulary:

In [17]:
from collections import Counter

# Tally how often each token occurs across all normalised transcripts.
word_counts = Counter()
for (tokens, _lbl) in all_data_raw:
    word_counts.update(tokens)

# Drop the 454 least frequent words so that some out-of-vocabulary (unknown) words
# are guaranteed to appear in the training and test sets.
ranked_words = [word for (word, _cnt) in word_counts.most_common()]
vocabulary = ranked_words[:-454]

# Show the vocabulary size together with its most and least frequent entries.
print(len(vocabulary))
print(vocabulary[:20])
print(vocabulary[-20:])

54000
['the', 'and', 'to', 'of', 'a', 'that', 'i', 'in', 'it', 'you', 'we', 'is', 's', 'this', 'so', 'they', 'was', 'for', 'are', 'have']
['underplanted', 'monzon', 'disempowered', 'benartzi', 'serenades', 'barberton', 'weedicides', 'prepotent', 'globalfamilyreunion', 'behan', 'telemetered', 'postulation', 'diminutive', 'timberland', 'startin', 'spatio', 'perishable', 'muay', 'heschel', 'inducement']


Map each document to a bag-of-words vector (the mean of its words' one-hot vectors):

In [41]:
# Position of each vocabulary word in the vector, following the vocabulary's order.
word2index = {word: idx for (idx, word) in enumerate(vocabulary)}

# The vector has one slot per vocabulary word plus a final slot shared by every
# unknown (out-of-vocabulary) word.
zhv_size = len(vocabulary) + 1 # +1 for unknown token
unknown_index = zhv_size - 1

def add_word_to_ohvec(word, vec):
    """Increment, in place, the slot of `vec` that corresponds to `word`.

    Words missing from `word2index` are counted in the `unknown_index` slot.

    Args:
        word: the token to count.
        vec: the mutable count vector to update.
    """
    slot = word2index.get(word, unknown_index)
    vec[slot] += 1

def doc2bomvec(doc):
    """Convert a tokenised document into its averaged bag-of-words vector.

    Each token adds 1 to its slot (unknown tokens share the last slot), and the
    resulting counts are divided by the number of tokens, so the entries of a
    non-empty document sum to 1.

    Args:
        doc: a sequence of token strings (it must support ``len``).

    Returns:
        A numpy array of length `zhv_size`. For an empty document the all-zero
        vector is returned, rather than dividing by zero and producing NaNs.
    """
    vec = np.zeros(zhv_size)
    if len(doc) == 0:
        # Nothing to count; avoid the 0/0 division below.
        return vec
    for word in doc:
        add_word_to_ohvec(word, vec)
    vec /= len(doc)
    return vec

# Sanity check: the first 20 entries of the first talk's vector.
print(doc2bomvec(all_data_raw[0][0])[:20])

# Vectorise every talk, skipping those whose transcript normalised to zero tokens.
all_data_ohbom = [
    (doc2bomvec(tokens), target)
    for (tokens, target) in all_data_raw
    if len(tokens) > 0
]
print(len(all_data_ohbom))

[ 0.04692082  0.02492669  0.02272727  0.01979472  0.01612903  0.01392962
  0.021261    0.01392962  0.01832845  0.00659824  0.02859238  0.02052786
  0.01686217  0.00293255  0.01099707  0.021261    0.00366569  0.00513196
  0.01173021  0.00733138]
2078


## Model

**x** = embedding(*text*)<br>
**h** = tanh(**Wx** + **b**)<br>
**u** = **Vh** + **c**<br>
**p** = softmax(**u**)<br>
if testing:<br>
&nbsp;&nbsp;&nbsp;&nbsp;prediction = arg max<sub>y’</sub> p<sub>y’</sub><br>
else: # training, with y as the given gold label<br>
&nbsp;&nbsp;&nbsp;&nbsp;loss = -log(p<sub>y</sub>)  # cross entropy criterion

In [None]:
# NOTE(review): this cell has never been executed (In [None]) and looks unfinished.
# Neither `vocabulary_size` nor `embedding_size` is defined in the cells visible above;
# presumably `vocabulary_size` is meant to match `zhv_size` — confirm before running.
vocabulary_size


# Embedding matrix with one row per vocabulary entry and `embedding_size` columns,
# initialised from tf.random_uniform with bounds -1.0 and 1.0.
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# float32 input placeholder declared with shape [None, ].
# NOTE(review): how this placeholder is meant to feed the embedding is not shown here.
x = tf.placeholder(tf.float32, [None, ])
