# TED - Classification

In [109]:
import urllib.request
import zipfile
import lxml.etree

import numpy as np
import os
from random import shuffle
import re

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

### Pre-Processing

In [110]:
# Fetch the TED transcript archive (roughly 75MB, so this may take a minute)
# unless a copy is already sitting next to the notebook.
TED_ARCHIVE = 'ted_en-20160408.zip'
TED_ARCHIVE_URL = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"

archive_present = os.path.isfile(TED_ARCHIVE)
if not archive_present:
    urllib.request.urlretrieve(TED_ARCHIVE_URL, filename=TED_ARCHIVE)

In [111]:
# For now, we're only interested in the subtitle text and the keywords, so let's
# extract those from the XML.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    # Open the archive member in its own `with` so its handle is closed
    # deterministically instead of being left for the garbage collector.
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)
texts = doc.xpath('//content/text()')
keywords = doc.xpath('//keywords/text()')
# The parsed tree is no longer needed once the strings are extracted.
del doc

# texts[i] and keywords[i] are paired by position further down (labels are
# derived from keywords), so a length mismatch would silently mislabel talks.
assert len(texts) == len(keywords), (
    "content/keywords count mismatch: %d texts vs %d keyword entries"
    % (len(texts), len(keywords)))

In [112]:
# Quick sanity check on the extraction: number of transcripts, then a short
# preview (first 250 characters) of the first one.
transcript_count = len(texts)
print(transcript_count)
print(texts[0][0:250])

2085
Here are two reasons companies fail: they only do more of the same, or they only do what's new.
To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but 


In [113]:
def keywordsToLabel(keywords):
    """Map a comma-separated keyword string to a three-character label.

    Each position of the result corresponds to one topic, in the fixed order
    technology / entertainment / design. A position holds the topic's letter
    ('T', 'E' or 'D') when that keyword is present and 'o' otherwise, e.g.
    "technology, design" -> "ToD". Matching ignores case and surrounding
    whitespace but requires the whole keyword to match.
    """
    present = {part.strip().lower() for part in keywords.split(',')}
    topics = (('technology', 'T'), ('entertainment', 'E'), ('design', 'D'))

    return "".join(letter if topic in present else 'o'
                   for topic, letter in topics)

In [114]:
# One label string per keyword entry, in the same order as `keywords`.
labels = list(map(keywordsToLabel, keywords))

In [115]:
# Stats about labels: how often each label string occurs.
from collections import Counter

label_counter = Counter()
label_counter.update(labels)
print(label_counter)

Counter({'ooo': 1134, 'Too': 389, 'oEo': 173, 'ooD': 158, 'ToD': 137, 'TEo': 37, 'TED': 33, 'oED': 24})


### Neural Network

In [116]:
import tensorflow as tf

class TEDClassifier:
    """Feed-forward classifier with one tanh hidden layer (TensorFlow 1.x graph API).

    The graph maps an input of width `embedding_size` through a square tanh
    hidden layer to `num_classes` logits and is trained with softmax
    cross-entropy. Constructing an instance adds its placeholders, variables
    and loss to the current default graph.
    """

    def __init__(self, num_classes, embedding_size, batch_size):
        """Build the model graph.

        Args:
            num_classes: width of the output layer and of each row fed to `input_y`.
            embedding_size: width of each row fed to `input_x`; also the hidden layer width.
            batch_size: maximum number of rows used per training step in `fit`.
        """
        self.num_classes = num_classes
        self.embedding_size = embedding_size
        self.batch_size = batch_size

        # Placeholders
        self.input_x = tf.placeholder(tf.float32, [None, self.embedding_size], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')

        # Hidden Layer
        # Weights start from small random values. With all-zero weights,
        # h == tanh(0) == 0 and V == 0, so the gradients of W, b and V are all
        # exactly zero and only the output bias c could ever be trained.
        self.W = tf.Variable(tf.truncated_normal(
            [self.embedding_size, self.embedding_size], stddev=0.1))
        self.b = tf.Variable(tf.zeros([self.embedding_size]))

        self.h = tf.tanh(tf.matmul(self.input_x, self.W) + self.b)

        # Output Layer
        self.V = tf.Variable(tf.truncated_normal(
            [self.embedding_size, self.num_classes], stddev=0.1))
        self.c = tf.Variable(tf.zeros([self.num_classes]))

        # Raw logits. No explicit softmax here: the loss op below applies the
        # softmax itself, and argmax over logits equals argmax over softmax.
        self.y = tf.matmul(self.h, self.V) + self.c

        # Loss function
        # Keyword arguments are used because the positional (logits, labels)
        # form is rejected by TensorFlow 1.x.
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.y))

    def fit(self, X, Y, sess=None):
        """Train with Adam for one pass over the data in mini-batches.

        Every call builds a new optimizer and re-runs the global variable
        initializer, so training always restarts from freshly initialised
        weights.

        Args:
            X: sliceable sequence of input rows (e.g. a 2-D numpy array whose
                rows are datapoints); slices are fed to `input_x`.
            Y: sliceable sequence of label rows aligned with `X`; slices are
                fed to `input_y`.
            sess: TensorFlow session to run in. Defaults to the current
                default session (e.g. one created by `tf.InteractiveSession()`).

        Raises:
            RuntimeError: if no session is passed and no default session exists.
            ValueError: if `batch_size` is smaller than 1 (the loop could not advance).
        """
        if sess is None:
            sess = tf.get_default_session()
        if sess is None:
            raise RuntimeError(
                "fit() needs a TensorFlow session: pass sess=... or create a default session")
        if self.batch_size < 1:
            raise ValueError("batch_size must be at least 1, got %r" % (self.batch_size,))

        n = len(X)  # number of datapoints (rows), not the feature dimension

        train_step = tf.train.AdamOptimizer(1e-4).minimize(self.cross_entropy)
        correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        # Initialise only after the optimizer exists so that the variables it
        # creates are initialised as well.
        sess.run(tf.global_variables_initializer())

        step = 0
        i = 0
        while i < n:
            j = min(i + self.batch_size, n)
            feed = {self.input_x: X[i:j], self.input_y: Y[i:j]}

            # Report accuracy on the current batch every 100 training steps.
            if step % 100 == 0:
                train_accuracy = sess.run(accuracy, feed_dict=feed)
                print("step %d, training accuracy %g" % (step, train_accuracy))
            sess.run(train_step, feed_dict=feed)

            # Advance to the next batch; this must happen inside the loop or
            # it never terminates.
            i = j
            step += 1

    def predict(self, X):
        """Placeholder for inference; not implemented yet and returns None."""
        pass
    

    

### Read Embeddings

In [117]:
# Read the raw (bytes) lines of the 50-dimensional GloVe vectors from the archive.
with zipfile.ZipFile('../../glove.6B.zip', 'r') as z:
    # Open the member in its own `with` so the handle is closed as soon as
    # the lines have been read.
    with z.open('glove.6B.50d.txt', 'r') as glove_file:
        embeddings = glove_file.readlines()
  


In [118]:
# Build the word -> vector lookup. Each raw line is decoded and split on
# whitespace; the first field is the word and the remaining fields are parsed
# as a float32 array.
embedding_dict = dict(
    (fields[0], np.array(fields[1:], dtype='f'))
    for fields in (raw_line.decode("utf-8").split() for raw_line in embeddings)
)

In [119]:
def similarity(a, b):
    """Return the cosine similarity of the embeddings of words `a` and `b`.

    Both words are looked up in the module-level `embedding_dict`; a word
    missing from it raises KeyError.
    """
    vec_a, vec_b = embedding_dict[a], embedding_dict[b]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)

    return np.dot(vec_a, vec_b) / norm_product

similarity('man', 'woman')

0.88603377

In [129]:
def bag_of_means(words, embedding_dict, embedding_size=None):
    """Average the embedding vectors of the words that have an embedding.

    Args:
        words: iterable of tokens; tokens missing from `embedding_dict` are skipped.
        embedding_dict: mapping from token to its embedding vector.
        embedding_size: length of the returned vector. When None it is taken
            from a vector in `embedding_dict`, falling back to 50 if the
            mapping is empty.

    Returns:
        A float64 numpy array of length `embedding_size` holding the mean of
        the found vectors, or an all-zero vector when none of the words has
        an embedding (instead of dividing by zero and returning NaNs).
    """
    if embedding_size is None:
        sample = next(iter(embedding_dict.values()), None)
        embedding_size = 50 if sample is None else len(sample)

    acc = np.zeros(embedding_size)
    count = 0
    for word in words:
        vector = embedding_dict.get(word)
        if vector is None:
            continue
        acc = acc + vector
        count += 1

    # No known word: return the zero vector rather than 0/0 == NaN.
    if count == 0:
        return acc
    return acc / count