# Preparing data (JSON export)

In [1]:
import pandas as pd
import re, datetime, nltk, string
from nltk.corpus import stopwords
from collections import Counter,defaultdict
import math
import ast
import sys
import json
from elasticsearch import Elasticsearch
import datetime

In [2]:
# Load first CSV file containing transcripts
transcripts = pd.DataFrame.from_csv('ted_transcripts.csv').reset_index()

In [3]:
# Removing irrelevant comments from transcripts 
transcripts['transcript'] = transcripts['transcript'].apply(lambda x: re.sub(r'\([^()]*\)', ' ', x))

In [4]:
def tokenizeString(transcript):
    '''
    Return dictionary with tokens as keys and term frequencies as values based on the input of a string.
    '''
    tokens = defaultdict(int)
    for word in transcript.split():
        word = word.lower().translate(None, string.punctuation)
        if word not in stopwords.words('english') and len(word) > 3:
            tokens[word] += 1
            
    return tokens

# Adding column with tokens and TF(t,d)
%time transcripts['tokenized'] = transcripts.apply(lambda row: tokenizeString(row['transcript']), axis=1)



CPU times: user 10min, sys: 1min 4s, total: 11min 5s
Wall time: 11min 21s


In [5]:
# Adding column with tokens only
%time transcripts['tokens'] = transcripts.apply(lambda row: [key for key, value in row['tokenized'].iteritems() for i in range(value)], axis=1)

CPU times: user 643 ms, sys: 19.3 ms, total: 662 ms
Wall time: 693 ms


In [6]:
def ExtractVocabulary(df):
    '''
    Function to extract the vocabulary based on the input of a dataframe.
    '''
    V = []
    for row in df.itertuples():
        V += row[3]
    return list(set(V))
    
def ComputeDocFreq(df):
    '''
    Function to compute the document frequency for every term in a dataframe.
    '''
    voc = ExtractVocabulary(df)
    doc_freq = defaultdict(int)
    for token in voc:
        for row in df.itertuples():
            if token in row[3]:
                doc_freq[token] += 1
    return doc_freq

def CreateWordCloud(tokens, N):
    output = {}
    for token, tf in tokens.iteritems():
        output[token] = tf * (math.log(N/docf[token],10))
    return ", ".join(dict(sorted(output.iteritems(), key=lambda x:-x[1])[:50]).keys())

# Using TF-IDF a column with wordclouds for every document are 
# added with the 50 most important words for that doc
docf = ComputeDocFreq(transcripts)
N = transcripts.shape[0]

%time transcripts['word_cloud'] = transcripts.apply(lambda row: CreateWordCloud(row['tokenized'],N), axis=1)

CPU times: user 1.25 s, sys: 5.27 ms, total: 1.26 s
Wall time: 1.27 s


In [7]:
# After word cloud is computed, the column tokenizedis 
# not necessary anymore, so it is dropped from the dataframe
transcripts = transcripts.drop('tokenized', 1)

In [8]:
# This is what the transcripts dataframe looks like right now
transcripts.head()

Unnamed: 0,transcript,url,tokens,word_cloud
0,"Good morning. How are you? It's been great, ha...",https://www.ted.com/talks/ken_robinson_says_sc...,"[secondly, highlytalented, since, switched, da...","highlytalented, gillian, hears, hierarchy, roy..."
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...,"[monterey, consider, consider, consider, chain...","branding, calculator, washington, carbonneutra..."
2,"Hello voice mail, my old friend. I've called ...",https://www.ted.com/talks/david_pogue_says_sim...,"[code, code, code, code, code, code, whoever, ...","cupertino, code, features, links, buttons, gad..."
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...,"[surrounded, grassroots, grassroots, dollar, a...","poor, neighborhood, grassroots, lowincome, com..."
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...,"[neighbors, incomewe, concept, concept, search...","expectancy, population, statistical, searchabl..."


<hr>

In [9]:
# Loading the csv with the metadata for all the TED talks
metadata = pd.DataFrame.from_csv('ted_main.csv').reset_index()

In [10]:
# In order to make the dataset searchable, some columns are edited/converted
# Changing the format of film_date and published_date to a format Elastic Search can understand
metadata['film_date']      = metadata['film_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))
metadata['published_date'] = metadata['published_date'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d'))

# Converting the duration from seconds to a more readable format MM:SS
metadata['duration']       = metadata['duration'].apply(lambda x: str(x/60) + ':' + str(x%60))

# Filtering only the three ratings with the highest score, the others are not relevant 
metadata['ratings']        = metadata['ratings'].apply(lambda x: ", ".join(str(x) for x in [rating[0] for rating in Counter({item['name']: item['count'] for item in ast.literal_eval(x)}).most_common(3)]) )

# Converting the tags to a string, so Elastic Search will interpret it correctly
metadata['tags']           = metadata['tags'].apply(lambda x: ", ".join(str(x) for x in ast.literal_eval(x)))

# We decided to drop the column related talks, since this only contains speaker names and URLS and it will only
# provide noise when ranking documents / performing searches
metadata = metadata.drop('related_talks', 1)

In [11]:
# This is what the metadata dataframe looks like right now
metadata.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,19:24,TED2006,2006-02-25,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,2006-06-27,"Inspiring, Funny, Persuasive",Author/educator,"children, creativity, culture, dance, educatio...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,16:17,TED2006,2006-02-25,43,Al Gore,Al Gore: Averting the climate crisis,1,2006-06-27,"Funny, Informative, Inspiring",Climate advocate,"alternative energy, cars, climate change, cult...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,21:26,TED2006,2006-02-24,26,David Pogue,David Pogue: Simplicity sells,1,2006-06-27,"Funny, Informative, Inspiring",Technology columnist,"computers, entertainment, interface design, me...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",18:36,TED2006,2006-02-26,35,Majora Carter,Majora Carter: Greening the ghetto,1,2006-06-27,"Inspiring, Courageous, Persuasive",Activist for environmental justice,"MacArthur grant, activism, business, cities, e...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,19:50,TED2006,2006-02-22,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,2006-06-27,"Informative, Fascinating, Jaw-dropping",Global health expert; data visionary,"Africa, Asia, Google, demo, economics, global ...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


<hr>

In [12]:
# Both dataframes need to be merges. Luckily they both contain the URL of the TEDtalk, 
# so they can be merges based on this column
tedTalksData = pd.merge(metadata, transcripts, on = "url")

# This is what the final dataset looks like
tedTalksData.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,speaker_occupation,tags,title,url,views,transcript,tokens,word_cloud
0,4553,Sir Ken Robinson makes an entertaining and pro...,19:24,TED2006,2006-02-25,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,2006-06-27,"Inspiring, Funny, Persuasive",Author/educator,"children, creativity, culture, dance, educatio...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,"Good morning. How are you? It's been great, ha...","[secondly, highlytalented, since, switched, da...","highlytalented, gillian, hears, hierarchy, roy..."
1,265,With the same humor and humanity he exuded in ...,16:17,TED2006,2006-02-25,43,Al Gore,Al Gore: Averting the climate crisis,1,2006-06-27,"Funny, Informative, Inspiring",Climate advocate,"alternative energy, cars, climate change, cult...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...","[monterey, consider, consider, consider, chain...","branding, calculator, washington, carbonneutra..."
2,124,New York Times columnist David Pogue takes aim...,21:26,TED2006,2006-02-24,26,David Pogue,David Pogue: Simplicity sells,1,2006-06-27,"Funny, Informative, Inspiring",Technology columnist,"computers, entertainment, interface design, me...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"Hello voice mail, my old friend. I've called ...","[code, code, code, code, code, code, whoever, ...","cupertino, code, features, links, buttons, gad..."
3,200,"In an emotionally charged talk, MacArthur-winn...",18:36,TED2006,2006-02-26,35,Majora Carter,Majora Carter: Greening the ghetto,1,2006-06-27,"Inspiring, Courageous, Persuasive",Activist for environmental justice,"MacArthur grant, activism, business, cities, e...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550,If you're here today — and I'm very happy that...,"[surrounded, grassroots, grassroots, dollar, a...","poor, neighborhood, grassroots, lowincome, com..."
4,593,You've never seen data presented like this. Wi...,19:50,TED2006,2006-02-22,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,2006-06-27,"Informative, Fascinating, Jaw-dropping",Global health expert; data visionary,"Africa, Asia, Google, demo, economics, global ...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869,"About 10 years ago, I took on the task to teac...","[neighbors, incomewe, concept, concept, search...","expectancy, population, statistical, searchabl..."


In [13]:
# The dataset is exported to a JSON file 
tedTalksData.to_json('tedTalksData.json', orient='index')

# Elastic Search & Kibana

In [22]:
# Initialize Elastic Search
HOST = 'http://localhost:9200/'
es = Elasticsearch(hosts=[HOST])

In [27]:
# Delete index if it already exists to prevent conflicts
es.indices.delete(index='ted-talks', ignore=[400, 404]) 

{u'acknowledged': True}

In [28]:
# Create index with proper mapping
! curl -X PUT "http://localhost:9200/ted-talks/" -d '{ "settings" : { "number_of_shards" : 1 }, "mappings" : { "talks" : { "properties" : { "comments" : { "type" : "long" }, "description": { "type": "text" }, "duration" : { "type" : "text" }, "event" : { "type" : "text", "index": "no" }, "film_date" : { "type" : "date" }, "languages" : { "type" : "long" }, "main_speaker" : { "type" : "text", "norms": { "enabled": false } }, "name" : { "type" : "text", "norms": { "enabled": false } }, "num_speaker" : { "type" : "long" }, "published_date" : { "type" : "date" }, "ratings" : { "type": "text", "norms": { "enabled": false }, "fielddata": true, "fields": { "raw": { "type": "keyword" } } }, "speaker_occupation" : { "type" : "text", "norms": { "enabled": false } }, "tags" : { "type": "text", "fielddata": true, "fields": { "raw": { "type": "keyword" } } }, "title" : { "type" : "text", "index": "no" }, "url" : { "type" : "text" }, "views" : { "type" : "long" }, "transcript": { "type": "text", "index":"no" }, "tokens": { "type": "text", "fields": { "raw": { "type": "keyword" } } }, "word_cloud" : { "type": "text", "fielddata": true, "fields": { "raw": { "type": "keyword" } } } } } } }'


{"acknowledged":true,"shards_acknowledged":true,"index":"ted-talks"}

In [29]:
# Load data from JSON file and import in index
with open('tedTalksData.json') as data_file:    
    data = json.load(data_file)
    
for index, row in data.iteritems():
    es.index(index='ted-talks', doc_type='talks', id=index, body=row)

In [36]:
# Count the number of items in the index to check if data is loaded correctly
added = es.count(index='ted-talks')['count']
check = tedTalksData.shape[0]

print "%s rows should have been added: %s" % (check, (check == added))

2467 rows should have been added: True
