# TED - Classification

In [2]:
import urllib.request
import zipfile
import lxml.etree

import numpy as np
import os
from random import shuffle
import re

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Pre-Processing

In [3]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [4]:
# For now, we're only interested in the subtitle text, so let's extract that from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
texts = doc.xpath('//content/text()')
keywords = doc.xpath('//keywords/text()')
del doc

In [5]:
print(len(texts))
print(texts[0][:250])

2085
Here are two reasons companies fail: they only do more of the same, or they only do what's new.
To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but 


In [24]:
def keywordsToLabel(keywords):
    label = ['o','o','o']
    keywords_lower = [k.strip().lower() for k in keywords.split(',')]
    
    if 'technology' in keywords_lower:
        label[0] = 'T'
    if 'entertainment' in keywords_lower:
        label[1] = 'E'
    if 'design' in keywords_lower:
        label[2] = 'D'
    
    return "".join(label)

def labelToOneHot(label):
    lookup = {
        "ooo": np.array([1,0,0,0,0,0,0,0]),
        "Too": np.array([0,1,0,0,0,0,0,0]),
        "oEo": np.array([0,0,1,0,0,0,0,0]),
        "ooD": np.array([0,0,0,1,0,0,0,0]),
        "TEo": np.array([0,0,0,0,1,0,0,0]),
        "ToD": np.array([0,0,0,0,0,1,0,0]),
        "oED": np.array([0,0,0,0,0,0,1,0]),
        "TED": np.array([0,0,0,0,0,0,0,1])
    }
    
    return lookup[label]

In [21]:
labels = [keywordsToLabel(k) for k in keywords]

In [8]:
# Stats about labels
from collections import Counter

label_counter = Counter(labels)
print(label_counter)

Counter({'ooo': 1134, 'Too': 389, 'oEo': 173, 'ooD': 158, 'ToD': 137, 'TEo': 37, 'TED': 33, 'oED': 24})


### Neural Network

In [101]:
import tensorflow as tf

class TEDClassifier:
    def __init__(self, num_classes, embedding_size, batch_size):
        self.num_classes = num_classes
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        
        # Placeholders
        self.input_x = tf.placeholder(tf.float32, [None, self.embedding_size], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')

        # Hidden Layer
        self.W = tf.Variable(tf.zeros([self.embedding_size, self.embedding_size]))
        self.b = tf.Variable(tf.zeros([self.embedding_size]))
        
        self.h = tf.tanh(tf.matmul(self.input_x, self.W) + self.b)
        
        # Output Layer
        self.V = tf.Variable(tf.zeros([self.embedding_size, self.num_classes]))
        self.c = tf.Variable(tf.zeros([self.num_classes]))
        
        # TODO: Do I need softmax?
        self.y = tf.matmul(self.h, self.V) + self.c
        
        # Loss function
        self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self.y, self.input_y))

    
    def fit(self, X, Y):
        init = tf.global_variables_initializer()

        tf.add_to_collection("y", self.y)
        saver = tf.train.Saver()
        
        with tf.Session() as sess:
            # Initialise variables
            sess.run(init)
        
            
            n = X.shape[0] # number of datapoints
        
            train_step = tf.train.AdamOptimizer(1e-4).minimize(self.cross_entropy)
            correct_prediction = tf.equal(tf.argmax(self.y,1), tf.argmax(self.input_y,1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            sess.run(tf.global_variables_initializer())
        
        
            i = 0
            while i < n:
                j = min(i + self.batch_size, n)
                batch = (X[i:j], Y[i:j])

                if i%100 == 0:
                    train_accuracy = accuracy.eval(feed_dict={
                        self.input_x:batch[0], self.input_y: batch[1]})
                    print("step %d, training accuracy %g"%(i, train_accuracy))
                train_step.run(feed_dict={self.input_x: batch[0], self.input_y: batch[1]})

                # Next batch
                i = j
            
            saver.save(sess, 'ted-model')
    
    def predict(self, X):
        # Restore Session:
        sess = tf.Session()
        new_saver = tf.train.import_meta_graph('ted-model.meta')
        new_saver.restore(sess, tf.train.latest_checkpoint('./'))
        self.y = tf.get_collection('y')[0]
        with tf.Session() as sess:
            classification = sess.run(self.y, feed_dict={self.input_x: X})
    

    

### Read Embeddings

In [10]:
with zipfile.ZipFile('../../glove.6B.zip', 'r') as z:
    embeddings = z.open('glove.6B.50d.txt', 'r').readlines()
  


In [11]:
embedding_dict = {token[0]: np.array(token[1:], dtype='f')
                  for token in [line.decode("utf-8").split()
                  for line in embeddings] }

In [12]:
def similarity(a, b):
    x = embedding_dict[a]
    y = embedding_dict[b]
    
    return np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

similarity('man', 'woman')

0.88603377

In [13]:
def bag_of_means(words, embedding_dict):
    acc = np.zeros(50)
    count = 0
    for word in words:
        if not word in embedding_dict:
            continue
        acc = acc + embedding_dict[word]
        count += 1
    return acc / count

### Prepare the Data

In [84]:
X = np.array([bag_of_means(words,embedding_dict) for words in keywords])
Y = np.array([labelToOneHot(label) for label in labels])

print(X.shape)
print(Y.shape)

# Shuffle the data
# np.random.seed(0)
shuffle = np.random.permutation(X.shape[0])
X = X[shuffle]
Y = Y[shuffle]

# Split in Training, Cross-Val, and Test Set
n_train = 1585
n_val = 250
n_test = 250

X_train = X[:n_train]
X_val = X[n_train:n_train + n_val]
X_test = X[n_train + n_val:]

Y_train = Y[:n_train]
Y_val = Y[n_train:n_train + n_val]
Y_test = Y[n_train + n_val:] 

(2085, 50)
(2085, 8)


### Train the Model

In [98]:
NUM_CLASSES = 8
EMBEDDING_SIZE = 50
BATCH_SIZE = 50

cls = TEDClassifier(NUM_CLASSES, EMBEDDING_SIZE, BATCH_SIZE)
cls.fit(X_train,Y_train)


step 0, training accuracy 0.42
step 100, training accuracy 0.66
step 200, training accuracy 0.64
step 300, training accuracy 0.3
step 400, training accuracy 0.52
step 500, training accuracy 0.66
step 600, training accuracy 0.58
step 700, training accuracy 0.58
step 800, training accuracy 0.58
step 900, training accuracy 0.58
step 1000, training accuracy 0.48
step 1100, training accuracy 0.7
step 1200, training accuracy 0.54
step 1300, training accuracy 0.52
step 1400, training accuracy 0.54
step 1500, training accuracy 0.56


In [99]:
cls.predict(X_test)

FailedPreconditionError: Attempting to use uninitialized value Variable_95
	 [[Node: Variable_95/read = Identity[T=DT_FLOAT, _class=["loc:@Variable_95"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_95)]]

Caused by op 'Variable_95/read', defined at:
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-98-1f592525efaa>", line 5, in <module>
    cls = TEDClassifier(NUM_CLASSES, EMBEDDING_SIZE, BATCH_SIZE)
  File "<ipython-input-97-544e5a29644f>", line 21, in __init__
    self.c = tf.Variable(tf.zeros([self.num_classes]))
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 224, in __init__
    expected_shape=expected_shape)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/ops/variables.py", line 370, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1424, in identity
    result = _op_def_lib.apply_op("Identity", input=input, name=name)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "//anaconda/envs/ox-dl-py3/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value Variable_95
	 [[Node: Variable_95/read = Identity[T=DT_FLOAT, _class=["loc:@Variable_95"], _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_95)]]
