In [1]:
import pandas

## Intro: loading datasets 

In [2]:
# Load the talk transcripts table (columns shown below: transcript, url).
transcripts = pandas.read_csv(filepath_or_buffer='transcripts.csv')

In [3]:
# Preview the first five transcript rows.
transcripts.head(n=5)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [4]:
# Load the main talk metadata table.
ted = pandas.read_csv(filepath_or_buffer='ted_main.csv')

In [5]:
# Show every field of the first talk as a single Series.
ted.iloc[0, :]

comments                                                           4553
description           Sir Ken Robinson makes an entertaining and pro...
duration                                                           1164
event                                                           TED2006
film_date                                                    1140825600
languages                                                            60
main_speaker                                               Ken Robinson
name                          Ken Robinson: Do schools kill creativity?
num_speaker                                                           1
published_date                                               1151367060
ratings               [{'id': 7, 'name': 'Funny', 'count': 19645}, {...
related_talks         [{'id': 865, 'hero': 'https://pe.tedcdn.com/im...
speaker_occupation                                      Author/educator
tags                  ['children', 'creativity', 'culture', 'dan

## 1. Unpacking tags list

In [6]:
# Inspect the Python type of the first entry in the tags column
# (tells us whether the list was parsed or is still raw text).
type(ted['tags'][0])

str

In [7]:
import ast

In [8]:
# Check that ast.literal_eval turns a stringified list (here the first
# related_talks entry) into a real Python list.
type(ast.literal_eval(ted['related_talks'][0]))

list

In [9]:
# Parse each stringified tag list in `tags` into a real Python list and store it
# in a new `tags_list` column, so later cells can iterate over individual tags.
# ast.literal_eval is passed directly; wrapping it in a lambda adds nothing.
ted['tags_list'] = ted.tags.apply(ast.literal_eval)

In [10]:
# Confirm the parsed column now holds list objects.
type(ted['tags_list'][0])

list

## 2. Which keyword is most frequently used to tag talks?

In [11]:
from collections import defaultdict

In [12]:
# Tally how many talks carry each tag keyword, then show the ten largest tallies.
tag_keywords_counter = defaultdict(int)
for talk_tags in ted.tags_list:
    for tag in talk_tags:
        tag_keywords_counter[tag] += 1

sorted(
    tag_keywords_counter.items(),
    key=lambda pair: pair[1],  # order by count
    reverse=True,
)[:10]

[('technology', 727),
 ('science', 567),
 ('global issues', 501),
 ('culture', 486),
 ('TEDx', 450),
 ('design', 418),
 ('business', 348),
 ('entertainment', 299),
 ('health', 236),
 ('innovation', 229)]

In [13]:
# Alternative way to count the frequency of each keyword:
# map every tag keyword to the row positions of the talks that carry it.
tag_keywords_mapping = defaultdict(list)
for talk_pos, talk_tags in enumerate(ted.tags_list):
    for tag in talk_tags:
        tag_keywords_mapping[tag].append(talk_pos)

In [14]:
# Number of distinct tag keywords across the entire dataset.
len(tag_keywords_mapping)

416

In [15]:
# Build a keyword/frequency table and show the 10 most-tagged keywords.
tag_keywords = list(tag_keywords_mapping)
# frequency of a keyword = number of talk positions recorded for it
tag_keywords_freq = [len(tag_keywords_mapping[name]) for name in tag_keywords]

tag_df = pandas.DataFrame({'keyword': tag_keywords, 'freq': tag_keywords_freq})
tag_df.sort_values(by='freq', ascending=False).head(10)

Unnamed: 0,keyword,freq
14,technology,727
12,science,567
11,global issues,501
2,culture,486
293,TEDx,450
54,design,418
25,business,348
16,entertainment,299
37,health,236
148,innovation,229


In [16]:
# Total number of tag assignments summed over all keywords.
total_tag_count = sum(tag_keywords_freq)
total_tag_count

19154

In [17]:
# Mean number of appearances per keyword.
keyword_count = len(tag_keywords)
sum(tag_keywords_freq) / keyword_count

46.04326923076923

In [18]:
#TODO: build bubble chart

## 3. Examine correlations between tag keywords

In [19]:
# within a certain talk, count the appearances of each keyword

In [20]:
from collections import Counter

In [21]:
# Prototype the co-occurrence count for a single keyword ('technology'):
# gather the tags of every talk tagged 'technology', then tally them.
tech_correlation = []
for talk_tags in ted.tags_list:
    if 'technology' in talk_tags:
        # Extend in place: rebuilding the list with `+` on every match
        # re-copies everything collected so far.
        tech_correlation.extend(talk_tags)

tech_counter = Counter(tech_correlation)

In [22]:
# Drop the keyword's own count so only co-occurring tags remain. The None
# default keeps this cell re-runnable: a bare pop() raises KeyError once the
# key has already been removed.
tech_counter.pop('technology', None)
# Ten tags that most often appear alongside 'technology'.
tech_counter.most_common(10)

[('science', 249),
 ('design', 205),
 ('innovation', 130),
 ('business', 113),
 ('future', 112),
 ('TEDx', 108),
 ('global issues', 104),
 ('invention', 95),
 ('computers', 86),
 ('culture', 84)]

In [23]:
# Number of distinct tags still present in tech_counter.
len(tech_counter)

370

In [24]:
# For every tag, accumulate the full tag list of each talk it appears in
# (the tag itself included; it is removed when the tallies are built).
correlation = defaultdict(list)
for talk_tags in ted.tags_list:
    for anchor_tag in talk_tags:
        correlation[anchor_tag] += talk_tags


In [25]:
# For each keyword (in the iteration order of `correlation`), tally the tags
# collected for it, drop the keyword's own count, and keep the result as a
# plain dict ordered from most to least frequent.
counter_list = []
for keyword, co_tags in correlation.items():
    co_counts = Counter(co_tags)
    co_counts.pop(keyword)  # remove present keyword count
    ranked = co_counts.most_common()
    counter_list.append(dict(ranked))


In [26]:
# Attach each keyword's co-occurrence dict by keyword rather than by row position.
# counter_list was built by iterating `correlation`, so zipping the two pairs every
# keyword with its own tallies. Assigning a bare Series instead only lines up when
# tag_df's row order happens to match the key order of `correlation`.
correlation_by_keyword = dict(zip(correlation, counter_list))
tag_df['correlation_counter'] = tag_df['keyword'].map(correlation_by_keyword.get)

In [27]:
# Fraction of all talks that carry each keyword.
tag_df['probability'] = tag_df['freq'].div(len(ted))

In [28]:
# Ten keywords with the highest probability.
tag_df.sort_values(by='probability', ascending=False).head(10)

Unnamed: 0,keyword,freq,correlation_counter,probability
14,technology,727,"{'science': 249, 'design': 205, 'innovation': ...",0.285098
12,science,567,"{'technology': 249, 'biology': 149, 'health': ...",0.222353
11,global issues,501,"{'culture': 131, 'technology': 104, 'business'...",0.196471
2,culture,486,"{'global issues': 131, 'technology': 84, 'busi...",0.190588
293,TEDx,450,"{'technology': 108, 'science': 93, 'global iss...",0.176471
54,design,418,"{'technology': 205, 'art': 87, 'invention': 73...",0.163922
25,business,348,"{'technology': 113, 'global issues': 92, 'econ...",0.136471
16,entertainment,299,"{'culture': 81, 'music': 77, 'technology': 76,...",0.117255
37,health,236,"{'science': 117, 'medicine': 103, 'health care...",0.092549
148,innovation,229,"{'technology': 130, 'science': 103, 'future': ...",0.089804


* 'correlation_counter' is the number of times two keywords appear together in a talk's tag list