In [1]:
import pandas as pd
import json
import re
import isodate
import requests
from time import sleep
from dateutil import parser
from email import utils
from datetime import datetime
from rfeed import Item, Feed, iTunesItem, Guid, Enclosure

In [2]:
# Load the cached YouTube API video list.
# The file is decoded explicitly as UTF-8 (the encoding JSON interchange uses)
# instead of the platform default, so the result does not depend on the locale
# of the machine running the notebook.
with open('../../metadata/youtube-api/videos_list_yutt.json', encoding='utf-8') as f:
    videos_list_yutt = json.load(f)

In [3]:
# Keep only the entries whose title mentions "Dhammapada", in source order,
# and collect their titles in a parallel list.
videos = []
titles = []
for video in videos_list_yutt:
    title = video['snippet']['title']
    if 'Dhammapada' in title:
        videos.append(video)
        titles.append(title)

In [4]:
# Every verse number extracted from the titles, in video order.
numbers = []
# Verse number -> video entry whose title mentions that number. If two titles
# mention the same number, the later video in `videos` wins.
mapping = {}
# Title prefixes that spell the verse number out as a word (or omit "Verse"),
# mapped to the digit form the number extraction below relies on.
# The first entry keeps the space after the colon so the rewritten title does
# not run straight into the text that follows it.
replacements = {
    'Dhammapada 1: ': 'Dhammapada Verse 1: ',
    'Dhammapada Verse Two': 'Dhammapada Verse 2',
    'Dhammapada Verses Three and Four': 'Dhammapada Verses 3 & 4',
    'Dhammapada Verse Five': 'Dhammapada Verse 5',
    'Dhammapada Verse Six': 'Dhammapada Verse 6',
    'Dhammapada Verses Seven & Eight': 'Dhammapada Verses 7 & 8',
    'Dhammapada Verses Nine and Ten': 'Dhammapada Verses 9 & 10'
}

for i, title in enumerate(titles):
    for old_title, new_title in replacements.items():
        if title.startswith(old_title):
            # Rewrite only the matched prefix (count=1) and store the normalised
            # title on the video entry itself so later cells see it.
            title = title.replace(old_title, new_title, 1)
            videos[i]['snippet']['title'] = title
            break

    m = re.findall(r'([0-9]+)', title)

    # How the digit groups of a title are interpreted:
    #   3 groups -> three individually listed verses
    #   2 groups -> an inclusive range first..last
    #   1 group  -> a single verse
    # A title with no digits or more than three groups contributes nothing here.
    new_numbers = []
    if len(m) == 3:
        new_numbers = [int(x) for x in m]
    elif len(m) == 2:
        new_numbers = list(range(int(m[0]), int(m[1]) + 1))
    elif len(m) == 1:
        new_numbers = [int(m[0])]

    numbers.extend(new_numbers)

    for n in new_numbers:
        mapping[n] = videos[i]

In [5]:
sorted(set(range(1, 224)).difference(numbers)) # numbers 1..223 that no title produced

[]

In [6]:
# Verse texts, split into blocks on lines that contain only '---'.
# Decoded explicitly as UTF-8 rather than with the platform default encoding,
# so the text is read the same way on every machine.
# NOTE(review): assumes verses.txt is stored as UTF-8 — confirm.
with open('verses.txt', encoding='utf-8') as f:
    verses = f.read().split('\n---\n')

In [7]:
# Verse numbers that are exempt from the title checks below.
exceptions = [207, 210, 198, 189, 190, 191, 184, 138, 139, 88]

# Check 1: every number appearing in a verse block must be mapped to a video
# whose title contains that number.
for verse in verses:
    for digits in re.findall(r'[0-9]+', verse):
        num = int(digits)
        if num not in exceptions:
            assert str(num) in mapping[num]['snippet']['title']

# Check 2: verse block i must belong to video i, and the smallest number of
# each block must be lower than that of the block before it.
last_nums = [227] # greatest number + 1
for i, verse in enumerate(verses):
    nums = sorted(int(digits) for digits in re.findall(r'[0-9]+', verse))
    for num in nums:
        if num not in exceptions:
            assert str(num) in videos[i]['snippet']['title']

    # The order check is skipped for blocks made up solely of exempt numbers.
    if not set(nums).issubset(exceptions):
        assert nums[0] < last_nums[0]

    last_nums = nums

In [8]:
# Lookup table from the `link` column to the `keywords` column of the
# subtitle keyword file.
df = pd.read_json('../../metadata/subtitles/top_keywords_from_subtitles.json')
keywords = dict(zip(df['link'], df['keywords']))

In [37]:
def create_desc(verse, verse_english, video, tags):
    """Build the HTML episode description, wrapped in a CDATA section.

    Args:
        verse: Verse text; each newline in it is turned into ``<br>``.
        verse_english: English text, inserted unchanged.
        video: URL of the video, used both as link target and as link text.
        tags: Iterable of keyword strings, shown comma-separated.

    Returns:
        The description markup as a string ending in a newline.
    """
    layout = '\n'.join((
        '<![CDATA[%s',
        '<br>',
        '%s',
        '<br>',
        '<ul>',
        '<li><b>Original YouTube-Video</b>: <a href="%s" target="_blank">%s</a></li>',
        '<li><b>For more information on the Dhammapada</b>: <a href="http://www.buddhanet.net/e-learning/buddhism/dhammapada.htm" target="_blank">http://www.buddhanet.net/e-learning/buddhism/dhammapada.htm</a></li>',
        '<li><b>Automatic Keywords</b>: %s</li>',
        '</ul>]]>',
        '',
    ))
    verse_html = verse.replace('\n', '<br>')
    keyword_text = ', '.join(tags)
    return layout % (verse_html, verse_english, video, video, keyword_text)

In [38]:
# Cached audio file sizes, loaded from file-sizes.json.
files_sizes = {}
with open('file-sizes.json') as f:
    files_sizes = json.load(f)

# The commented-out snippet below is kept as a record of how the cache was
# filled: a HEAD request per video reading Content-Length, keyed by video id.
#for v in videos:
#    audioUrl = 'https://almedia.blob.core.windows.net/audio/%s.mp3' % v['id']
#    
#    try:
#        files_sizes[v['id']] = int(requests.head(audioUrl).headers['Content-Length'])
#    except:
#        print('Error', v['id'])
#    
#    sleep(0.5)
#
#with open('file-sizes.json', 'w') as f:
#    f.write(json.dumps(files_sizes))

In [39]:
# English verse texts; the cell that builds `verses_english` looks them up by
# the verse number as a string.
# Decoded explicitly as UTF-8 (the encoding JSON interchange uses) instead of
# the platform default.
with open('verses_english.json', encoding='utf-8') as f:
    dhp_english = json.load(f)

In [40]:
# One English text per entry of `verses`, in the same order.
verses_english = []

for v in verses:
    # A line counts as a verse-number marker when nothing but digits remains
    # after trimming whitespace and removing every '.' character.
    candidates = [line.strip().replace('.', '') for line in v.split('\n')]
    nums = [int(c) for c in candidates if re.match(r'^[0-9]+$', c)]

    # A "Verse N:" heading followed by its English text for each number,
    # with blank lines in between.
    translation = ''.join(
        'Verse %i:' % n + '\n\n' + dhp_english[str(n)] + '\n\n'
        for n in nums
    )
    verses_english.append(translation)

In [41]:
# One dict per episode, built from the video entry at the same index in
# `videos`, `verses` and `verses_english`.
items = []

for i, video in enumerate(videos):
    # The first entry in `videos` gets the highest episode number, the last gets 1.
    episode = len(videos) - i
    guid = video['id']
    link = 'https://youtu.be/' + guid
    # At most five keywords per episode; a video without an entry in `keywords`
    # gets the single placeholder tag 'None'.
    # NOTE(review): the placeholder also ends up in 'categories' — confirm intended.
    tags = keywords.get(link, 'None').split(',')[:5]
    desc = create_desc(verses[i], verses_english[i], link, tags)
    # total_seconds() covers the whole duration; `.seconds` holds only the part
    # below one day and would under-report anything 24 hours or longer.
    duration = int(isodate.parse_duration(video['contentDetails']['duration']).total_seconds())
    # Publication time rendered by email.utils.format_datetime (RFC 2822 style).
    pubDate = utils.format_datetime(parser.parse(video['snippet']['publishedAt']))
    audioUrl = 'https://almedia.blob.core.windows.net/audio/%s.mp3' % guid
    audioType = 'audio/mpeg'
    # 0 when no size is recorded for this video id.
    audioFileSize = files_sizes.get(guid, 0)

    items.append({
        'episode': episode,
        'guid': guid,
        'title': video['snippet']['title'].replace('&', '&amp;'),
        'description': desc,
        'pubDate': pubDate,
        'duration': duration,
        'audioUrl': audioUrl,
        'audioType': audioType,
        'audioFileSize': audioFileSize,
        'categories': tags
    })

In [42]:
# Persist the episode list as pretty-printed JSON (2-space indent).
with open('dhammapada-items.json', 'w') as f:
    json.dump(items, f, indent=2)

In [27]:
#pd.DataFrame(items)[['title', 'description', 'audioUrl', 'audioFileSize', 'audioType', 'duration', 'guid', 'pubDateS', 'pubDateHour', 'pubDateMin', 'categories']].to_excel('test123.xlsx')

In [25]:
#feed = Feed(
#    title = "Sample RSS Feed",
#    link = "http://www.example.com/rss",
#    description = "This is an example of how to use rfeed to generate an RSS 2.0 feed",
#    language = "en-US",
#    lastBuildDate = datetime.now(),
#    items = items,
#)
#with open('test-feed.xml', 'w') as f:
#    f.write(feed.rss())