/
scraper.py
336 lines (297 loc) · 10.1 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import hashlib
import time
import datetime
import random
from urlparse import urljoin
import requests
import pyquery
import feedparser
from django.utils import timezone
from django.db.models import Count
from django.db.utils import DataError
from peterbecom.podcasttime.models import (
Podcast,
Episode,
PodcastError,
NotAnImageError,
)
from peterbecom.podcasttime.utils import (
download,
parse_duration_ffmpeg,
get_image_url,
)
class BadPodcastEntry(Exception):
    """Raised when a feed entry is missing data we cannot proceed without
    (e.g. no usable 'published_parsed' timestamp)."""
def itunes_lookup(itunes_id):
    """Look up a single podcast by its iTunes id.

    Returns the parsed JSON payload from the iTunes lookup endpoint.
    """
    params = {'id': itunes_id}
    response = requests.get('https://itunes.apple.com/lookup', params)
    return response.json()
def itunes_search(term, **options):
    """Search iTunes for podcasts matching *term*.

    Any extra keyword arguments are passed along as query parameters;
    'term' and 'entity' are always set (entity is forced to 'podcast').
    Returns the parsed JSON payload.
    """
    params = dict(options)
    params['term'] = term
    params['entity'] = 'podcast'
    response = requests.get('https://itunes.apple.com/search', params)
    return response.json()
def download_some_episodes(max_=5, verbose=False):
    """Fetch episodes for a batch of podcasts, neediest first.

    Priority order:
      1. podcasts with zero episodes (tried at double the batch size),
      2. podcasts that have never been fetched (last_fetch is NULL),
      3. podcasts with the stalest last_fetch timestamps.
    """
    # 1) Podcasts that have no episodes at all yet.
    no_episodes = Podcast.objects.all().annotate(
        subcount=Count('episode')
    ).filter(subcount=0)
    for podcast in no_episodes.order_by('?')[:max_ * 2]:
        if verbose:
            print (podcast.name, podcast.last_fetch)
        download_episodes(podcast)

    # 2) Podcasts whose feed has never been fetched.
    never_fetched = Podcast.objects.filter(
        last_fetch__isnull=True
    ).order_by('?')
    for podcast in never_fetched[:max_]:
        if verbose:
            print (podcast.name, podcast.last_fetch)
        download_episodes(podcast)

    # 3) Podcasts with the oldest fetch times.
    stalest = Podcast.objects.filter(
        last_fetch__isnull=False
    ).order_by('last_fetch')
    for podcast in stalest[:max_]:
        if verbose:
            print (podcast.name, podcast.last_fetch)
        download_episodes(podcast)
def download_episodes(podcast, verbose=True):
    """Fetch and store every episode for *podcast*.

    Thin error-recording wrapper around _download_episodes(): on any
    failure a PodcastError row is created for the podcast and the
    original exception is re-raised unchanged.
    """
    try:
        _download_episodes(podcast, verbose=verbose)
    except Exception:
        # Record the failure against the podcast, then propagate.
        PodcastError.create(podcast)
        raise
def _download_episodes(podcast, verbose=True):
    """Parse the podcast's RSS/Atom feed and upsert one Episode per entry.

    Downloads podcast.url, parses it with feedparser, works out a duration
    and a guid for each entry, then updates or creates the matching
    Episode row.  Finally stamps podcast.last_fetch.

    NOTE(review): *verbose* is accepted but never consulted; all the
    print diagnostics below run unconditionally.

    Raises BadPodcastEntry when an entry has no usable published date;
    any DataError on save is recorded via PodcastError and re-raised.
    """
    xml = download(podcast.url)
    d = feedparser.parse(xml)

    def get_duration(entry):
        # Best-effort duration (in seconds) for a feed entry, or None when
        # it cannot be determined (callers then skip the entry).
        if not entry.get('itunes_duration'):
            # No <itunes:duration>: fall back to probing the audio link
            # with ffmpeg.
            try:
                for link in entry['links']:
                    if (
                        link['type'] == 'audio/mpeg' or
                        link['href'].lower().endswith('.mp3')
                    ):
                        return parse_duration_ffmpeg(
                            link['href']
                        )
            except KeyError:
                # No 'links' (or a link missing 'type'/'href').  If the
                # entry has an enclosure we deliberately blow up so the
                # case gets noticed; otherwise skip the entry.
                try:
                    print entry.enclosure
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print "SKIPPING", entry
                    return
        elif entry['itunes_duration'].count(':') >= 1:
            # Duration given as [HH:]MM:SS (possibly with stray junk).
            try:
                itunes_duration = entry['itunes_duration']
                # a bug in bad podcasts
                itunes_duration = itunes_duration.replace('>', '')
                itunes_duration = itunes_duration.replace(';', '')
                itunes_duration = [
                    int(float(x)) for x in itunes_duration.split(':')
                    if x.strip()
                ]
            except ValueError:
                print "SKIPPING, BAD itunes_duration"
                print entry
                print 'itunes_duration=', repr(entry['itunes_duration'])
                return
            duration = 0
            # Reversed so index 0 is seconds, 1 minutes, 2 hours.
            itunes_duration.reverse()
            duration += itunes_duration[0]  # seconds
            if len(itunes_duration) > 1:
                duration += 60 * itunes_duration[1]  # minutes
            if len(itunes_duration) > 2:
                duration += 60 * 60 * itunes_duration[2]  # hours
            if duration > 24 * 60 * 60:
                # Implausibly long (> 24h): discard the stated duration
                # and retry via the audio-link fallback above.
                entry['itunes_duration'] = None
                return get_duration(entry)
            return duration
        else:
            # Plain number of seconds (no colons).
            if not entry['itunes_duration']:
                print "BUT!", xml.find('<itunes:duration')
                return
            try:
                return int(float(entry['itunes_duration']))
            except ValueError:
                # pprint(entry)
                print "SKIPPING itunes_duration not a number"
                print repr(entry['itunes_duration'])
                return

    for entry in d['entries']:
        if not entry.get('published_parsed'):
            print "Entry without a valid 'published_parsed'!"
            print entry
            raise BadPodcastEntry("Entry without a valid 'published_parsed'!")
        published = datetime.datetime.fromtimestamp(
            time.mktime(entry['published_parsed'])
        )
        # fromtimestamp() yields a naive datetime; force it to UTC so it
        # compares cleanly against Django's aware datetimes.
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        duration = get_duration(entry)
        if duration is None:
            # Couldn't determine a duration; skip this entry entirely.
            continue
        # Pick a stable identifier: guid, then id, then an MD5 of the
        # summary, and finally an MD5 of the title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print "No guid or id. Going to use the summary."
                try:
                    guid = hashlib.md5(
                        entry.summary.encode('utf-8')
                    ).hexdigest()
                except AttributeError:
                    print "No guid or id or summary. ",
                    print "Going to use the title."
                    guid = hashlib.md5(
                        entry.title.encode('utf-8')
                    ).hexdigest()
                    # raise
        # Diagnostic-only pass: report whether an existing episode's
        # duration/published differ from the freshly parsed values.
        # NOTE(review): this duplicates the lookup done just below.
        try:
            ep = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            if ep.duration != duration:
                print "DURATION CHANGED!!!"
            else:
                print "Duration unchanged"
            if ep.published != published:
                print "PUBLISHED CHANGED!!!"
            else:
                print "Published unchanged"
        except Episode.DoesNotExist:
            pass
        # Upsert: update the existing episode, or create a new one.
        try:
            episode = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            episode.duration = duration
            episode.published = published
            try:
                episode.save()
                print "SAVED",
            except DataError:
                print "FROM", podcast.url
                print "ENTRY"
                print entry
                print "TRIED TO SAVE DURATION", duration
                PodcastError.create(podcast, notes='Tried to save duration')
                raise
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
            )
            print "CREATED",
        print (
            episode.podcast.name,
            episode.guid,
            episode.duration,
            episode.published
        )
    # Remember when we last fetched this feed (drives the scheduling in
    # download_some_episodes).
    podcast.last_fetch = timezone.now()
    podcast.save()
def find_podcasts(baseurl, verbose=False):
    """Discover category ('/browse/') pages linked from *baseurl* and
    scrape a random sample of them for podcasts."""
    doc = pyquery.PyQuery(download(baseurl))
    category_urls = [
        urljoin(baseurl, anchor.attrib['href'])
        for anchor in doc('ul.nav ul.dropdown-menu li a')
        if '/browse/' in anchor.attrib['href']
    ]
    max_ = 10
    random.shuffle(category_urls)
    for category_url in category_urls[:max_]:
        _scrape_index(
            category_url,
            verbose=verbose,
            max_=max_,
        )
def _scrape_index(url, verbose=False, max_=1000):
    """Scrape a category index page for shows and upsert Podcast rows.

    Collects (name, show_url) pairs from the page's thumbnails, drops
    shows whose name already exists, then for a random sample of at
    most *max_* shows resolves the RSS feed and image and saves the
    podcast.
    """
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    links = doc('.thumbnails a')
    shows = []
    for link in links:
        show_url = link.attrib['href']
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        # The show's display name lives in an <h4> inside the anchor.
        for h4 in link.find('h4'):
            name = h4.text_content()
            shows.append((name, show_url))
    existing_names = Podcast.objects.all().values_list('name', flat=True)
    # XXX might not keep this
    shows = [
        (n, u) for (n, u) in shows
        if n not in existing_names
    ]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print "Skipping", name, show_url
            continue
        image_url = get_image_url(rss_url)
        if not image_url:
            print "Skipping (no image)", name, rss_url
            continue
        assert '://' in image_url, image_url
        # print "IMAGE_URL", image_url
        try:
            podcast = Podcast.objects.get(name=name)
            podcast.url = rss_url
            podcast.image_url = image_url
            podcast.save()
            created = False
        except Podcast.DoesNotExist:
            # NOTE(review): this raise makes everything below it in this
            # branch unreachable -- new podcasts are never actually
            # created.  Looks like a deliberate temporary tripwire;
            # confirm and remove it (or the dead code) when intended.
            raise Exception("ABOUT TO CREATE: %s" % name)
            podcast = Podcast.objects.create(
                name=name,
                url=rss_url,
                image_url=image_url,
            )
            created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # Image download failures are non-fatal; record and move on.
            if verbose:
                print "Got an error trying to download the image :("
                print "IGNORING AND MOVING ON"
            PodcastError.create(podcast)
        if verbose:
            if created:
                print "CREATED",
            else:
                print "NOT NEW",
            print repr(name)
def _scrape_show(url):
    """Find the 'Open RSS feed' link on a show page.

    Returns the absolute RSS URL, or None when no such link exists.
    """
    doc = pyquery.PyQuery(download(url))
    for anchor in doc('.sidebar-nav a'):
        for heading in pyquery.PyQuery(anchor).find('h4'):
            if heading.text_content() == 'Open RSS feed':
                return urljoin(url, anchor.attrib['href'])
def fix_podcast_images(max_, verbose=False):
    """Work-in-progress: repair podcasts that have an image_url (and an
    itunes_lookup payload) but no downloaded image.

    Currently only prints diagnostics for a random sample of candidates
    and then raises NotImplementedError -- no images are actually fixed
    yet.  *verbose* is accepted but unused.
    """
    podcasts = Podcast.objects.filter(
        image__isnull=True,
        image_url__isnull=False,
        itunes_lookup__isnull=False,
    )
    for podcast in podcasts.order_by('?')[:max_]:
        print repr(podcast.name)
        print podcast.image
        print podcast.image_url
        print podcast.itunes_lookup['artworkUrl600']
        print
    # Placeholder until the actual fixing logic is written.
    raise NotImplementedError