In [1]:
import os.path
import shelve

import feedparser
import requests
import parsel
from pydub import AudioSegment as AS

In [2]:
asc_url = 'http://www.npr.org/sections/allsongs/163479981/our-show'
response = requests.get(asc_url)
sel = parsel.Selector(text=response.content.decode('utf-8'))

In [3]:
show_url = 'http://www.npr.org/sections/allsongs/2016/02/09/466146044/new-mix-shearwater-lily-madeleine-eskimeaux-more'

In [4]:
def xcls(cls):
    return 'contains(concat(" ", @class, " "), " {} ")'.format(cls)

In [23]:
class Show(object):
    def __init__(self, url):
        self.url = url
        self.bad = False
        self.response = requests.get(url)
        self.sel = parsel.Selector(self.response.content.decode('utf-8'))
        
        self.extract_timestamp()
        if self.bad:
            return
        self.songs = self.count_songs()
    
    def extract_timestamp(self):
        base_url = self.sel.xpath('//a[@class="download"]/@href').extract_first()
        if base_url is None:
            self.bad = True
            return
        assert 'asc_wholeshow' in base_url
        x = base_url.find('npr/asc/') + len('npr/asc/')
        y = base_url.find('_asc_')
        self.url_timestamp = base_url[x:y]
        self.small_timestamp = self.url_timestamp[self.url_timestamp.rfind('/')+1:]
        
    def song_url(self, n):
        return ('http://public.npr.org/anon.npr-mp3/npr/asc/{}_asc_{:02d}.mp3'
                .format(self.url_timestamp, n))
    
    def count_songs(self):
        songs = []
        selectors = self.sel.xpath('//*[{}]//*[{}]'.format(
                xcls('playlistwrap'), 
                xcls('playlistitem'),
        ))
        for n, s in enumerate(selectors, start=1):
            def subsel(cls):
                text = (s.xpath('.//*[{}]/text()'.format(xcls(cls)))
                         .extract_first())
                if text is not None:
                    return text.strip()
                return text
            
            song = {
                '_title': (s
                    .xpath('.//h4/a/text()')
                    .extract_first()
                    .strip()),
                'song': subsel('song'),
                'artist': subsel('artist'),
                'album': subsel('album'),
                'url': self.song_url(n),
            }
            if song['song'] is None:
                song['song'] = song.pop('_title')
            elif song['artist'] is None:
                song['artist'] = song.pop('_title')
            
            url = song['url']
            song['save_as'] = os.path.join('songs', url[url.rfind('/')+1:])
            
            songs.append(song)
        return songs
        
    def __iter__(self):
        yield from self.songs
        
    def download(self, song):
        r = requests.get(song['url'], stream=True)
        with open(song['save_as'], 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024): 
                if chunk:
                    f.write(chunk)
        
    def scrape(self):
        for song in self:
            self.download(song)

    def assemble(self):
        stitched = sum(AS.from_file(s['save_as']) for s in self)
        stitched.export(os.path.join('shows', self.small_timestamp + '_asnp.mp3'))

In [9]:
show = Show(show_url)

In [10]:
show.assemble()

("unsupported operand type(s) for +: 'int' and 'AudioSegment'",)

In [144]:
show.url_timestamp

'2016/02/20160209'

In [150]:
ss.export(os.path.join('shows', show.small_timestamp + '_asnp.mp3'))

<_io.BufferedRandom name='shows/20160209.mp3'>

In [18]:
bad_shows = [
    'http://www.npr.org/sections/allsongs/2016/02/05/465551231/all-songs-1-andrew-bird-gets-personal',
    'http://www.npr.org/sections/allsongs/2016/01/15/463163048/all-songs-1-glen-hansard-plays-our-sweet-16-celebration',
]
good_shows = [
    'http://www.npr.org/sections/allsongs/2016/01/26/463484599/our-top-discoveries-at-globalfest-2016',
    'http://www.npr.org/sections/allsongs/2016/02/09/466146044/new-mix-shearwater-lily-madeleine-eskimeaux-more',
]

for su in bad_shows:
    show = Show(su)
    assert show.bad
for su in good_shows:
    show = Show(su)
    assert not show.bad

In [31]:
import feedparser

rss_url = 'http://www.npr.org/rss/rss.php?id=163479981'
# podcast_url = 'http://www.npr.org/templates/rss/podlayer.php?id=15709577'
rss = feedparser.parse(rss_url)
# podcast = feedparser.parse(podcast_url)

links = [ent['link'] for ent in rss['entries']]

for link in links[:5]:
   show = Show(link)
   print(show.bad)