In [1]:
import pandas

## Intro: loading datasets 

In [2]:
# transcript text plus the talk url for each row
transcripts = pandas.read_csv('transcripts.csv')

In [3]:
transcripts.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [4]:
# talk metadata table (speaker, event, tags, ratings, related_talks, ...)
ted = pandas.read_csv('ted_main.csv')

In [5]:
ted.iloc[0]

comments                                                           4553
description           Sir Ken Robinson makes an entertaining and pro...
duration                                                           1164
event                                                           TED2006
film_date                                                    1140825600
languages                                                            60
main_speaker                                               Ken Robinson
name                          Ken Robinson: Do schools kill creativity?
num_speaker                                                           1
published_date                                               1151367060
ratings               [{'id': 7, 'name': 'Funny', 'count': 19645}, {...
related_talks         [{'id': 865, 'hero': 'https://pe.tedcdn.com/im...
speaker_occupation                                      Author/educator
tags                  ['children', 'creativity', 'culture', 'dan

## 1. Unpacking tags list

In [6]:
# check the type of tags column 
type(ted.tags[0])

str

In [7]:
import ast

In [8]:
# sanity check: literal_eval turns a stringified list cell (here the first
# related_talks entry) into a real Python list
type(ast.literal_eval(ted.related_talks[0]))

list

In [9]:
# tags is stored as the string repr of a list; parse every cell into a real
# Python list and keep the result in a new column for later lookups
ted['tags_list'] = ted.tags.apply(ast.literal_eval)

In [10]:
type(ted.tags_list[0])

list

## 2. Which keywords are tagged most often across talks?

In [11]:
from collections import defaultdict

In [12]:
# inverted index: tag keyword -> positions (in ted.tags_list order) of the
# talks that carry that keyword
tag_keywords_mapping = defaultdict(list)
for row_pos, talk_tags in enumerate(ted.tags_list):
    for tag in talk_tags:
        tag_keywords_mapping[tag].append(row_pos)

In [13]:
# number of distinct tag keywords across the whole dataset
len(tag_keywords_mapping)

416

In [14]:
# count the frequency of each keyword, then show the 20 most-tagged keywords
tag_keywords = list(tag_keywords_mapping)
tag_keywords_freq = [len(tag_keywords_mapping[kw]) for kw in tag_keywords]

tag_freq_df = pandas.DataFrame(
    {'keyword': tag_keywords, 'freq': tag_keywords_freq},
    columns=['keyword', 'freq'],
)
tag_freq_df.sort_values(by='freq', ascending=False).head(20)

Unnamed: 0,keyword,freq
14,technology,727
12,science,567
11,global issues,501
2,culture,486
293,TEDx,450
54,design,418
25,business,348
16,entertainment,299
37,health,236
148,innovation,229


In [15]:
#TODO: build bubble chart

## 3. Retrieving related talk ids

In [16]:
print(ted.related_talks[0])

[{'id': 865, 'hero': 'https://pe.tedcdn.com/images/ted/172559_800x600.jpg', 'speaker': 'Ken Robinson', 'title': 'Bring on the learning revolution!', 'duration': 1008, 'slug': 'sir_ken_robinson_bring_on_the_revolution', 'viewed_count': 7266103}, {'id': 1738, 'hero': 'https://pe.tedcdn.com/images/ted/de98b161ad1434910ff4b56c89de71af04b8b873_1600x1200.jpg', 'speaker': 'Ken Robinson', 'title': "How to escape education's death valley", 'duration': 1151, 'slug': 'ken_robinson_how_to_escape_education_s_death_valley', 'viewed_count': 6657572}, {'id': 2276, 'hero': 'https://pe.tedcdn.com/images/ted/3821f3728e0b755c7b9aea2e69cc093eca41abe1_2880x1620.jpg', 'speaker': 'Linda Cliatt-Wayman', 'title': 'How to fix a broken school? Lead fearlessly, love hard', 'duration': 1027, 'slug': 'linda_cliatt_wayman_how_to_fix_a_broken_school_lead_fearlessly_love_hard', 'viewed_count': 1617101}, {'id': 892, 'hero': 'https://pe.tedcdn.com/images/ted/e79958940573cc610ccb583619a54866c41ef303_2880x1620.jpg', 'speak

In [17]:
# related_talks is stored as the string repr of a list of dicts; parse every
# cell into real Python objects and keep them in a new column
ted['related_talks_list'] = ted.related_talks.apply(ast.literal_eval)

In [18]:
# try the id extraction by hand on the first record
for talk in ted.related_talks_list[0]:
    print(talk['id'])

865
1738
2276
892
1232
2616


In [19]:
def get_related_talk_id(list_of_dicts):
    """Return the ``'id'`` value of every dict in ``list_of_dicts``, in order.

    Raises ``KeyError`` if any dict has no ``'id'`` key.
    """
    return [entry['id'] for entry in list_of_dicts]

In [20]:
get_related_talk_id(ted.related_talks_list[0])

[865, 1738, 2276, 892, 1232, 2616]

In [21]:
# keep only the ids of each talk's related talks (6 per talk in the rows shown);
# many of the recommended talk ids are not in our database
ted['related_talk_ids'] = ted.related_talks_list.apply(get_related_talk_id)
ted.related_talk_ids.head(10)

0      [865, 1738, 2276, 892, 1232, 2616]
1      [243, 547, 2093, 2784, 2339, 2331]
2      [1725, 2274, 172, 2664, 436, 1546]
3    [1041, 1892, 2078, 2873, 2840, 2839]
4        [2056, 2296, 620, 974, 140, 912]
5         [229, 70, 97, 1094, 2861, 2279]
6             [22, 94, 856, 112, 71, 308]
7         [750, 2092, 2183, 359, 231, 31]
8         [71, 2011, 234, 113, 301, 1327]
9           [94, 676, 2011, 301, 86, 308]
Name: related_talk_ids, dtype: object

### * Correlations within related talks

In [22]:
# Pool the related-talk ids of every talk that the first row points to, then
# compare the pooled count with the number of distinct ids.
# NOTE(review): the related 'id' values are used here as row labels of
# ted.related_talk_ids -- confirm that talk ids and row labels share one key space.
related_to_first = []
for n in ted.related_talk_ids[0]:
    # skip ids without a matching row label (replaces the hard-coded last row 2549)
    if n in ted.related_talk_ids.index:
        related_to_first.extend(ted.related_talk_ids[n])
print(len(related_to_first))
print(len(set(related_to_first)))

30
28


* The pooled list has 30 ids and its set has 28, so only 2 of them are duplicates: the related talks of the first entry's related talks barely overlap. Next, build a function that computes the overlap as a percentage rather than a count, because some related ids are out of range for our table, so the absolute numbers can differ a lot from talk to talk.

In [23]:
def comp_overlapping_percentage(related_talk_ids, related_lookup=None):
    """Return the share of duplicated ids among the related talks of related talks.

    Every id in ``related_talk_ids`` that is a key of ``related_lookup`` is
    followed one hop: the ids stored under that key are pooled into one list.
    The result is ``(len(pool) - len(set(pool))) / len(pool)``, i.e. the
    fraction of pooled entries that repeat an id already in the pool.

    Parameters
    ----------
    related_talk_ids : iterable of int
        Ids to follow one hop further.
    related_lookup : object with a dict-style ``.get(key)``, optional
        Maps a key to the list of related ids stored for it; a ``dict`` or a
        pandas ``Series`` both work.  Defaults to ``ted.related_talk_ids``.

    Returns
    -------
    float or int
        Duplicate fraction in ``[0, 1)``; ``0`` when none of the ids resolves.

    Notes
    -----
    Ids missing from ``related_lookup`` are skipped; this replaces the former
    hard-coded ``n <= 2549`` row bound.
    NOTE(review): with the default lookup the related 'id' values are treated
    as row labels of ``ted`` -- confirm both share one key space.
    """
    if related_lookup is None:
        # resolved at call time so the notebook-level frame is still the default
        related_lookup = ted.related_talk_ids

    pooled = []
    for n in related_talk_ids:
        neighbours = related_lookup.get(n)
        if neighbours is not None:
            pooled.extend(neighbours)

    if not pooled:
        return 0
    return (len(pooled) - len(set(pooled))) / float(len(pooled))

In [24]:
comp_overlapping_percentage(ted.related_talk_ids[0])

0.06666666666666667

In [25]:
# duplicate share of second-hop related ids, one value per talk
percentage_list = ted.related_talk_ids.apply(comp_overlapping_percentage)

In [26]:
percentage_list.mean()

0.007144624207358342

* A very low overlap percentage means that talks are rarely suggested back by their corresponding related talks. This also indicates that TED's recommendations tend to suggest the latest talks rather than ones already viewed.

In [27]:
# check the last 10 records
ted.related_talk_ids[-10:]

2540    [2655, 2609, 2671, 1005, 2563, 1335]
2541       [2491, 2439, 2654, 253, 194, 251]
2542    [2873, 2863, 2589, 2803, 2788, 2732]
2543        [2490, 2632, 2416, 1223, 437, 6]
2544      [2873, 2560, 2875, 154, 151, 2462]
2545    [2596, 2813, 1368, 2038, 2819, 2791]
2546     [2491, 2656, 2677, 421, 2235, 2476]
2547     [2346, 2825, 2852, 1376, 355, 1908]
2548    [2512, 1378, 2837, 2802, 2217, 2796]
2549     [2682, 2839, 1501, 1429, 1865, 213]
Name: related_talk_ids, dtype: object