In [64]:
import pandas as pd
import json
import re
import isodate
import requests
from time import sleep
from dateutil import parser
from email import utils
from datetime import datetime
from rfeed import Item, Feed, iTunesItem, Guid, Enclosure
from collections import defaultdict
from more_itertools import unique_everseen

In [5]:
# Parse the JSON video listing stored under the youtube-api metadata directory.
with open('../../metadata/youtube-api/videos_list_yutt.json') as listing_file:
    videos_list_yutt = json.load(listing_file)

In [6]:
# Keep only the entries whose title mentions "Dhammapada", and collect their
# titles in a parallel list (same order, same length as `videos`).
videos = []
titles = []
for entry in videos_list_yutt:
    entry_title = entry['snippet']['title']
    if 'Dhammapada' in entry_title:
        videos.append(entry)
        titles.append(entry_title)

In [7]:
# `numbers` accumulates every verse number found in a title;
# `mapping` points each verse number at the video entry that covers it.
numbers = []
mapping = {}

# Title prefixes that spell verse numbers out in words (or use a different
# layout), mapped to the digit-based prefix the parsing below expects.
replacements = {
    'Dhammapada 1: ': 'Dhammapada Verse 1:',
    'Dhammapada Verse Two': 'Dhammapada Verse 2',
    'Dhammapada Verses Three and Four': 'Dhammapada Verses 3 & 4',
    'Dhammapada Verse Five': 'Dhammapada Verse 5',
    'Dhammapada Verse Six': 'Dhammapada Verse 6',
    'Dhammapada Verses Seven & Eight': 'Dhammapada Verses 7 & 8',
    'Dhammapada Verses Nine and Ten': 'Dhammapada Verses 9 & 10'
}

for idx, raw_title in enumerate(titles):
    title = raw_title

    # Apply the first matching prefix replacement and persist the normalised
    # title on the video entry itself (the `titles` list is left untouched).
    matched_prefix = next(
        (prefix for prefix in replacements if raw_title.startswith(prefix)),
        None,
    )
    if matched_prefix is not None:
        title = raw_title.replace(matched_prefix, replacements[matched_prefix])
        videos[idx]['snippet']['title'] = title

    found = re.findall(r'([0-9]+)', title)

    # How the digit groups are read depends on how many there are:
    #   3 -> three individual verse numbers
    #   2 -> an inclusive range first..last
    #   1 -> a single verse
    # Any other count contributes no verse numbers.
    if len(found) == 3:
        new_numbers = [int(digits) for digits in found]
    elif len(found) == 2:
        first, last = (int(digits) for digits in found)
        new_numbers = list(range(first, last + 1))
    elif len(found) == 1:
        new_numbers = [int(found[0])]
    else:
        new_numbers = []

    numbers.extend(new_numbers)
    mapping.update((verse_no, videos[idx]) for verse_no in new_numbers)

In [8]:
# Verse numbers in 1..223 that no title yielded -- presumably titles where the
# number is still written out in words; an empty list means none are missing.
sorted(set(range(1, 224)) - set(numbers))

[]

In [9]:
# Read the verse file and cut it into chunks at every '---' separator line.
with open('verses.txt') as verses_file:
    verses_text = verses_file.read()

verses = verses_text.split('\n---\n')

In [10]:
# Verse numbers that are exempt from the title checks below.
exceptions = [207, 210, 198, 189, 190, 191, 184, 138, 139, 88]

# Check 1: every number appearing in a verse chunk must show up in the title
# of the video that `mapping` associates with that number.
for verse in verses:
    for token in re.findall(r'[0-9]+', verse):
        num = int(token)
        if num in exceptions:
            continue
        assert str(num) in mapping[num]['snippet']['title']

# Check 2: verse chunk i must line up with video i, and the smallest number of
# each chunk must be lower than the smallest number of the chunk before it.
last_nums = [227] # greatest number + 1
for position, verse in enumerate(verses):
    nums = sorted(int(token) for token in re.findall(r'[0-9]+', verse))

    for num in nums:
        if num not in exceptions:
            assert str(num) in videos[position]['snippet']['title']

    # The order check is skipped for chunks made up solely of exempt numbers.
    if set(nums).difference(exceptions):
        assert nums[0] < last_nums[0]

    last_nums = nums

In [11]:
# Lookup table from the `link` column to the `keywords` column of the
# subtitle-keyword export (a later row overrides an earlier one with the same link).
df = pd.read_json('../../metadata/subtitles/top_keywords_from_subtitles.json')
keywords = dict(zip(df['link'], df['keywords']))

In [103]:
def create_desc(verse, verse_english, video, chapter):
    """Build the CDATA-wrapped HTML description of a single episode.

    Parameters
    ----------
    verse : str
        Pali verse text; every line break is rendered as ``<br>``.
    verse_english : str
        Accepted for signature compatibility; it does not appear in the output.
    video : str
        URL of the original video, used both as link target and as link text.
    chapter : str
        Chapter name, appended to the end of the keyword list.

    Returns
    -------
    str
        The filled-in HTML fragment, ending with a newline.
    """
    pali_html = verse.replace('\n', '<br>')
    layout = """<![CDATA[<p>Dhammapada verse in Pali:</p>
<p>%s</p>
<p>A series on the Dhammapada, a set of 423 verse teachings given by the Buddha, including explanation of the Pali verse, a synopsis of the background story and application of the teaching to our practice by Yuttadhammo Bhikkhu.</p>
<p>Yuttadhammo Bhikkhu is a Canadian-born Theravada Buddhist monk, ordained in 2001 under the guidance of Venerable Ajaan Tong Sirimangalo. He has taught meditation courses in the Mahasi Sayadaw tradition since 2003 and gives online teachings via YouTube, as well as Dhamma talks to both intensive meditators and by invitation to the general public.</p>
<br>
<p>Original Youtube Video: <a href="%s" target="_blank">%s</a></p>
<p>Our podcasts: <a href=" https://podcast.sirimangalo.org/" target="_blank"> https://podcast.sirimangalo.org/</a></p>
<p>How To Meditate Booklet: <a href=" https://htm.sirimangalo.org/" target="_blank"> https://htm.sirimangalo.org/</a></p>
<p>To join our meditation community and sign up for an at-home meditation course, please visit <a href="https://meditation.sirimangalo.org/" target="_blank">https://meditation.sirimangalo.org/</a></p>
<p>Our Website: <a href="https://www.sirimangalo.org/" target="_blank">https://www.sirimangalo.org/</a></p>
<p>Supporting This Work: <a href="https://www.sirimangalo.org/support" target="_blank">https://www.sirimangalo.org/support</a></p>
<p>Keywords: Buddhism, Dhamma, Buddha, Monk, Bhikkhu, Sirimangalo, Yuttadhammo, Tipitaka, Pali, %s</p>]]>
"""
    # Slots, in order: verse markup, link href, link text, chapter keyword.
    return layout % (pali_html, video, video, chapter)

In [104]:
def create_summary(translated_verse):
    """Return the translated verse text followed by the translator credit."""
    credit_template = "%s (Translated by Acharya Buddharakkhita)"
    return credit_template % translated_verse

In [105]:
# Cached audio file sizes, read from the JSON file next to the notebook.
files_sizes = {}
with open('file-sizes.json') as sizes_file:
    files_sizes = json.load(sizes_file)
    
#for v in videos:
#    audioUrl = 'https://almedia.blob.core.windows.net/audio/%s.mp3' % v['id']
#    
#    try:
#        files_sizes[v['id']] = int(requests.head(audioUrl).headers['Content-Length'])
#    except:
#        print('Error', v['id'])
#    
#    sleep(0.5)
#
#with open('file-sizes.json', 'w') as f:
#    f.write(json.dumps(files_sizes))

In [106]:
# English translations, looked up later by verse number given as a string key.
with open('verses_english.json') as translations_file:
    dhp_english = json.load(translations_file)

In [107]:
# For every verse chunk, build one text holding the English translation of
# each verse number found in it (same order and length as `verses`).
verses_english = []

for verse_block in verses:
    # A line counts as a verse number when, after trimming whitespace and
    # removing dots, nothing but digits is left.
    cleaned_lines = (
        line.strip().replace('.', '') for line in verse_block.split('\n')
    )
    nums = [int(text) for text in cleaned_lines if re.match(r'^[0-9]+$', text)]

    translation = ''.join(
        'Verse %i:' % num + '\n\n' + dhp_english[str(num)] + '\n\n'
        for num in nums
    )

    verses_english.append(translation)

In [108]:
with open('chapters.txt') as chapters_file:
    chapter_lines = chapters_file.read().split('\n')

# Drop blank lines, then flip the order of the remaining chapter names.
chapters = [name for name in chapter_lines if name]
chapters.reverse()

In [109]:
# Build one feed item per video plus a lookup of audio metadata by audio URL.
items = []
audio_metadata = {}

# Seasons are numbered by the order in which chapter names first appear in
# `chapters`; episodes are counted separately within each season.
unique_chapters = list(unique_everseen(chapters))
current_episode = defaultdict(int)

# Reverse each list once up front instead of re-slicing on every iteration.
# `chapters` is used as-is because it was already reversed when it was loaded.
videos_reversed = videos[::-1]
verses_reversed = verses[::-1]
verses_english_reversed = verses_english[::-1]

for i, video in enumerate(videos_reversed):
    verse = verses_reversed[i]
    verse_english = verses_english_reversed[i]
    chapter = chapters[i]

    title = video['snippet']['title'].replace('&', '&amp;')
    link = 'https://youtu.be/' + video['id']
    # Up to five keywords for this video; computed but not emitted in the item.
    tags = keywords.get(link, 'None')
    tags = tags.split(',')[:5]
    # total_seconds() covers the whole duration; `.seconds` alone would drop
    # any whole-day component.
    duration = int(isodate.parse_duration(video['contentDetails']['duration']).total_seconds())
    pubDate = utils.format_datetime(parser.parse(video['snippet']['publishedAt']))
    guid = video['id']
    audioUrl = 'https://almedia.blob.core.windows.net/audio/%s.mp3' % video['id']
    audioType = 'audio/mpeg'
    # Size 0 marks a video with no recorded audio file size.
    audioFileSize = files_sizes.get(guid, 0)
    subtitle = '%s from the Chapter "%s"' % (title.split(':')[0], chapter)
    content_encoded = create_desc(verse, verse_english, link, chapter)
    summary = create_summary(verse_english)
    categories = ["Buddhism", "Dhamma", "Buddha", "Monk", "Bhikkhu", "Sirimangalo", "Yuttadhammo", "Tipitaka", "Pali", chapter]
    season = unique_chapters.index(chapter) + 1
    current_episode[season] += 1
    episode = current_episode[season]

    items.append({
        'guid': guid,
        'title': title,
        'subtitle': subtitle,
        'contentEncoded': content_encoded,
        'description': summary,
        'pubDate': pubDate,
        'duration': duration,
        'audioUrl': audioUrl,
        'audioType': audioType,
        'audioSize': audioFileSize,
        'categories': categories,
        'episode': episode,
        'season': season
    })

    # The guid must belong to the same video as the URL used as the key;
    # indexing the un-reversed `videos` list here would pair the URL with a
    # different video's id.
    audio_metadata[audioUrl] = {
        'guid': guid,
        'duration': duration,
        'type': audioType,
        'size': audioFileSize
    }


# Flip back so the items follow the original order of `videos`.
items = items[::-1]

In [110]:
# Persist the episode list as pretty-printed JSON.
with open('dhammapada-items.json', 'w') as items_file:
    json.dump(items, items_file, indent=2)

In [111]:
# Persist the per-audio-URL metadata as pretty-printed JSON.
with open('audio_metadata.json', 'w') as metadata_file:
    json.dump(audio_metadata, metadata_file, indent=2)

In [112]:
# Channel-level description, emitted as a CDATA-wrapped HTML fragment.
description = """
<![CDATA[
<p>A series on the Dhammapada, a set of 423 verse teachings given by the Buddha, including explanation of the Pali verse, a synopsis of the background story and application of the teaching to our practice by Yuttadhammo Bhikkhu.</p>
<p>Yuttadhammo Bhikkhu is a Canadian-born Theravada Buddhist monk, ordained in 2001 under the guidance of Venerable Ajaan Tong Sirimangalo. He has taught meditation courses in the Mahasi Sayadaw tradition since 2003 and gives online teachings via YouTube, as well as Dhamma talks to both intensive meditators and by invitation to the general public.</p>
<br>
<p>How To Meditate Booklet: <a href=" https://htm.sirimangalo.org/" target="_blank"> https://htm.sirimangalo.org/</a></p>
<p>To join our meditation community and sign up for an at-home meditation course, please visit <a href="https://meditation.sirimangalo.org/" target="_blank">https://meditation.sirimangalo.org/</a></p>
<p>Our Website: <a href="https://www.sirimangalo.org/" target="_blank">https://www.sirimangalo.org/</a></p>
<p>Supporting This Work: <a href="https://www.sirimangalo.org/support" target="_blank">https://www.sirimangalo.org/support</a></p>
<p>YouTube: <a href="https://www.youtube.com/yuttadhammo" target="_blank">https://www.youtube.com/yuttadhammo</a></p>
<p>Keywords: Buddhism, Dhamma, Buddha, Monk, Bhikkhu, Sirimangalo, Yuttadhammo, Tipitaka, Pali</p>
]]>
"""

# Channel-level metadata; `episodes` embeds the full item list built earlier.
channel = {
    # Use a timezone-aware local time: a naive datetime is formatted with a
    # "-0000" offset, which would label the local wall-clock time as UTC.
    "lastBuildDate": utils.format_datetime(datetime.now().astimezone()),
    "ownerName": "Sirimangalo International",
    "ownerEmail": "podcast@sirimangalo.org",
    "link": "https://podcast.sirimangalo.org/dhammapada/",
    "atomLink": "https://podcast.sirimangalo.org/dhammapada/rss.xml",
    "title": "Dhammapada",
    "description": description,
    "subtitle": "A series on the Dhammapada",
    # TODO(review): "fd" looks like a placeholder rather than an image URL -- confirm.
    "imageUrl": "fd",
    "category": "Religion &amp; Spirituality",
    "subcategory": "Buddhism",
    "author": "Yuttadhammo Bhikkhu",
    "language": "en",
    "type": "serial",
    "complete": "No",
    "episodes": items
}

In [113]:
# Persist the channel description (including all episodes) as pretty-printed JSON.
with open('dhammapada-channel.json', 'w') as channel_file:
    json.dump(channel, channel_file, indent=2)

In [114]:
#pd.DataFrame(items)[['title', 'description', 'audioUrl', 'audioFileSize', 'audioType', 'duration', 'guid', 'pubDateS', 'pubDateHour', 'pubDateMin', 'categories']].to_excel('test123.xlsx')

In [25]:
#feed = Feed(
#    title = "Sample RSS Feed",
#    link = "http://www.example.com/rss",
#    description = "This is an example of how to use rfeed to generate an RSS 2.0 feed",
#    language = "en-US",
#    lastBuildDate = datetime.now(),
#    items = items,
#)
#with open('test-feed.xml', 'w') as f:
#    f.write(feed.rss())