-
Notifications
You must be signed in to change notification settings - Fork 0
/
TEDExtract.py
160 lines (136 loc) · 4.86 KB
/
TEDExtract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
'''
Copyright 2018, University of Freiburg.
Chair of Algorithms and Data Structures.
Markus Näther <naetherm@informatik.uni-freiburg.de>
'''
import urllib
import codecs
import os
import glob
import http
from time import sleep
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
from six.moves import cPickle
import http.client
drop_out_lines = [
"Subscribe to receive email notifications whenever new talks are published.",
"Thanks! Please check your inbox for a confirmation email. ",
"If you want to get even more from TED, like the ability to save talks to watch later, sign up for a TED account now. ",
"TED.com translations are made possible by volunteer translators. Learn more about the Open Translation Project."
]
class TEDExtract(object):
'''
Simple ted extraction tool. Returns csv files for all transcripts of
all ted talks (defined by the number of pages)
'''
def __init__(self, args):
'''
Constructor.
'''
self.args = args
# Dictionary containing all talks
self.all_talks = {}
self.header = { 'User-Agent' : 'TEDExtract script by naetherm' }
self.conn = http.client.HTTPSConnection('www.ted.com')
def run(self):
'''
Receive all information from the ted page.
Parse all pages and save their transcripts in own csv files.
'''
if os.path.isfile(os.path.join(self.args.output, 'talk_list.pkl')):
with open(os.path.join(self.args.output, 'talk_list.pkl'), 'rb') as fin:
self.all_talks = cPickle.load(fin)
else:
# Collect all talks from all pages
for p in range(1, self.args.max_pages + 1):
path = '/talks?page={}'.format(p)
self._fetch_talk_list(path)
with open(os.path.join(self.args.output, 'talk_list.pkl'), 'wb') as fout:
cPickle.dump(self.all_talks, fout)
# DEBUG
# Loop through all talks and download the content for all available languages
for i in self.all_talks:
self._fetch_talk_content(i)
self.conn.close()
def _fetch_talk_list(self, path):
'''
This method is used to receive all
'''
print("Reading talks of \'{}\'".format(path))
content = self._get_content2(path)
soup = BeautifulSoup(content)
talks = soup.find_all("a", class_='ga-link')
for i in talks:
if i.attrs['href'].find('/talks/') == 0 and self.all_talks.get(i.attrs['href']) != 1:
self.all_talks[i.attrs['href']] = 1
def _get_content2(self, uri):
'''
Reading and returning the content of the provided uri.
'''
self.conn.request('GET', uri, headers=self.header)
resp = None
# do try to fetch the content of the uri
while True:
try:
resp = self.conn.getresponse()
except ConnectionError as e:
print("Received an error from server, wait for {} seconds.".format(
self.args.delay))
sleep(self.args.delay)
else:
break
return resp.read()
def _fetch_talk_content(self, talk):
'''
This method will read all transcriptions of a specific talk, do some
cleanup (removing line breaks, etc.) and save everything within a
separate csv file.
'''
# Extract the talk name
talkname = talk[7:]
if os.path.isfile(os.path.join(self.args.output, talkname + '.csv')):
print("Already downloaded, skip {}".format(talk))
else:
# The data frame object for saving all languages
req = self._get_content2(talk + '/transcript')
print("Reading transcriptions of {}".format(talk + '/transcript'))
soup = BeautifulSoup(req)
df = pd.DataFrame()
for i in soup.findAll('link'):
if i.get('href') != None and i.attrs['href'].find('?language=') != -1:
lang = i.attrs['hreflang']
path = i.attrs['href']
print("Path: {}".format(path))
r1 = self._get_content2(path)
soup1 = BeautifulSoup(r1)
text_talk = []
for i in soup1.findAll('p',class_=''):
temo = i.text.replace('\n',' ')
temo = temo.replace('\t','')
temo = temo.replace('\r','')
temo = temo.replace(' ',' ')
if temo[0] == ' ':
temo = temo[1:]
if temo in drop_out_lines:
pass
else:
text_talk.append(temo)
# Split the fetched data into sentences. It could be the case that there are multiple
# sentences within one 'p' environment
final_text_talk = []
for line in text_talk:
sent_tokenize_list = sent_tokenize(line)
for sent in sent_tokenize_list:
final_text_talk.append(sent)
df1 = pd.DataFrame()
df1[lang] = final_text_talk
df = pd.concat([df,df1],axis=1)
df.to_csv(os.path.join(
self.args.output,
talkname + '.csv'),
sep='\t',
encoding='utf-8')