In [1]:
# Imports
import pandas as pd
import json
import numpy as np

In [2]:
# Read in data: every non-blank line of the file is parsed as its own JSON document
data = []
# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding.
with open('talk_data.json', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would otherwise reject with a JSONDecodeError.
        if line.strip():
            data.append(json.loads(line))

In [3]:
# Unpack every talk record into parallel per-column lists (one entry per talk)
titles, upload_dates, links = [], [], []
topics, transcripts, views = [], [], []
durations, speakers, descriptions = [], [], []
unique_topics = set()

for talk_data in data:
    # The first key of the record is used as the title; its value holds the fields
    title = str(list(talk_data)[0])
    titles.append(title)
    metadata = talk_data[title]

    upload_dates.append(str(metadata['posted_date']))
    links.append(str(metadata['talk_link']))

    # Keep the per-talk topic list and grow the global set of distinct topics
    talk_topics = metadata['talk_topics']
    topics.append(talk_topics)
    unique_topics.update(talk_topics)

    # Concatenate the transcript sections, each one followed by a single space
    transcript = ''.join(section + ' ' for section in metadata['transcript'])
    transcripts.append(str(transcript))

    views.append(int(metadata['view_count']))
    # Stored duration divided by 60 (presumably seconds -> minutes; confirm against the data source)
    durations.append(float(metadata['duration']) / 60)
    speakers.append(str(metadata['speaker']))
    descriptions.append(str(metadata['description']))

In [4]:
# Assemble the DataFrame from the extracted column lists
df = pd.DataFrame({
    'title': titles,
    'upload_date': upload_dates,
    'link': links,
    'topics': topics,
    'transcript': transcripts,
    'views': views,
})
# log(1 + views), stored right after the raw view counts
df['log_views'] = np.log(1 + df['views'])
# Remaining columns are appended afterwards so the column order is unchanged
for column_name, column_values in (
    ('duration', durations),
    ('speaker', speakers),
    ('description', descriptions),
):
    df[column_name] = column_values

df.head()

Unnamed: 0,title,upload_date,link,topics,transcript,views,log_views,duration,speaker,description
0,How do viruses jump from animals to humans?,2019-08-08,https://www.ted.com/talks/ben_longdon_how_do_v...,"[TED-Ed, human body, animals, science, biology...","At a Maryland country fair in 2017,the prize p...",144067,11.878041,4.783333,Ben Longdon,"At a Maryland country fair in 2017, farmers re..."
1,From pacifist to spy: WWII's surprising secret...,2019-08-07,https://www.ted.com/talks/shrabani_basu_from_p...,"[animation, war, activism, TED-Ed, women, femi...",Noor Inayat Khan was in the midst of a despera...,178556,12.092663,4.2,Shrabani Basu,"In May 1940, with the German army ready to occ..."
2,How to use family dinner to teach politics,2019-07-23,https://www.ted.com/talks/hajer_sharief_how_to...,"[politics, children, social change, women, soc...","Twenty years ago,my family introduced a system...",1086005,13.898017,11.35,Hajer Sharief,Everyone should participate in decision-making...
3,Neoliberalism's time has passed. We need a new...,2019-07-26,https://www.ted.com/talks/george_monbiot_the_n...,"[politics, social change, democracy, community...",Do you feel trappedin a broken economic model?...,1176004,13.977634,15.25,George Monbiot,"To get out of the mess we're in, we need a new..."
4,Why governments should prioritize well-being,2019-07-29,https://www.ted.com/talks/nicola_sturgeon_why_...,"[politics, social change, economics, leadershi...","Just over a mile away from here,in Edinburgh's...",984998,13.800396,10.0,Nicola Sturgeon,"In 2018, Scotland, Iceland and New Zealand est..."


In [5]:
# Save the assembled talk table to CSV (default to_csv settings)
og_dataset_path = 'og_dataset.csv'
df.to_csv(og_dataset_path)

In [6]:
# Sorted list of every distinct topic; its order defines the topic column order
topics_list = sorted(unique_topics)

In [7]:
# Zero-filled matrix: one row per DataFrame row, one column per distinct topic
n_talks, n_topics = len(df), len(topics_list)
topics_matrix = np.zeros((n_talks, n_topics))

In [8]:
# Map each topic to its column position once, instead of calling
# topics_list.index() (a linear scan of the list) for every topic of every talk.
topic_to_col = {topic_name: col_index for col_index, topic_name in enumerate(topics_list)}

# Set a 1 in row i for each topic attached to talk i
for i, talk_topics in enumerate(topics):
    for topic in talk_topics:
        topics_matrix[i, topic_to_col[topic]] = 1

In [9]:
# Sanity check: each row's sum must equal the number of topics listed for that talk
row_totals = topics_matrix.sum(axis=1)
for row_idx in range(len(df)):
    assert len(topics[row_idx]) == row_totals[row_idx]

In [10]:
# Add one indicator column per topic to df.
# All topic columns are built as a single frame and joined in one concat;
# inserting them one at a time in a loop fragments the DataFrame and makes
# pandas emit a PerformanceWarning once many columns have been added.
topic_columns = pd.DataFrame(topics_matrix, columns=topics_list, index=df.index)
# Drop any same-named columns first so that re-running this cell replaces the
# topic columns instead of duplicating them.
df = pd.concat([df.drop(columns=topics_list, errors='ignore'), topic_columns], axis=1)

In [11]:
# Remove the raw topic lists and the raw view counts from the table.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
df = df.drop(columns=['topics', 'views'], errors='ignore')
df.head()

Unnamed: 0,title,upload_date,link,transcript,log_views,duration,speaker,description,3D printing,AI,...,wikipedia,wind energy,women,women in business,work,work-life balance,world cultures,writing,wunderkind,youth
0,How do viruses jump from animals to humans?,2019-08-08,https://www.ted.com/talks/ben_longdon_how_do_v...,"At a Maryland country fair in 2017,the prize p...",11.878041,4.783333,Ben Longdon,"At a Maryland country fair in 2017, farmers re...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,From pacifist to spy: WWII's surprising secret...,2019-08-07,https://www.ted.com/talks/shrabani_basu_from_p...,Noor Inayat Khan was in the midst of a despera...,12.092663,4.2,Shrabani Basu,"In May 1940, with the German army ready to occ...",0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,How to use family dinner to teach politics,2019-07-23,https://www.ted.com/talks/hajer_sharief_how_to...,"Twenty years ago,my family introduced a system...",13.898017,11.35,Hajer Sharief,Everyone should participate in decision-making...,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Neoliberalism's time has passed. We need a new...,2019-07-26,https://www.ted.com/talks/george_monbiot_the_n...,Do you feel trappedin a broken economic model?...,13.977634,15.25,George Monbiot,"To get out of the mess we're in, we need a new...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Why governments should prioritize well-being,2019-07-29,https://www.ted.com/talks/nicola_sturgeon_why_...,"Just over a mile away from here,in Edinburgh's...",13.800396,10.0,Nicola Sturgeon,"In 2018, Scotland, Iceland and New Zealand est...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Save the table with per-topic columns to CSV (default to_csv settings)
exploded_dataset_path = 'dataset_exploded_topics.csv'
df.to_csv(exploded_dataset_path)