In [None]:
import xml.etree.ElementTree as ET
from mutagen.mp4 import MP4
from mutagen.mp3 import MP3
from datetime import datetime
import requests
from io import BytesIO
import pandas as pd
import logging
import seaborn as sns
import matplotlib.pyplot as plt
# Font overrides for matplotlib text.
# NOTE(review): SimHei / Heiti TC are presumably chosen so that CJK podcast
# titles render in the chart - confirm the fonts exist on the target machine.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.family': ['Heiti TC'],
})

# Raise the root logger to INFO so the logging.info(...) progress messages
# emitted while parsing are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
def _audio_length_seconds(enclosure_url, timeout=60):
    """Download one episode's audio file and return its duration in seconds.

    The decoder is picked with the same substring test on the URL that this
    notebook has always used ("mp3" or "m4a" anywhere in the URL).

    Args:
        enclosure_url: URL of the audio file.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the whole run.

    Raises:
        ValueError: the URL is missing or names neither supported format.
        requests.RequestException: network failure, timeout or HTTP error.
    """
    if not enclosure_url:
        raise ValueError('episode has no enclosureUrl')
    # Decide on the decoder *before* downloading so unsupported files cost nothing.
    if 'mp3' in enclosure_url:
        reader = MP3
    elif 'm4a' in enclosure_url:
        reader = MP4
    else:
        raise ValueError('unsupported audio format: %s' % enclosure_url)
    response = requests.get(enclosure_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the decoder.
    response.raise_for_status()
    return reader(BytesIO(response.content)).info.length


def parse_overcast(opml_file, year, previous_df=None):
    """Collect the episodes marked as played in ``year`` from an OPML export.

    Every ``outline`` element of type ``podcast-episode`` with ``played="1"``
    whose ``userUpdatedDate`` falls in ``year`` is recorded.  Episodes already
    present in the cache (matched on feed name + episode title) only get their
    ``updatedate`` refreshed; all others are downloaded once to measure their
    duration.

    Args:
        opml_file: Path (or file object) of the OPML export to parse.
        year: Calendar year (int) to keep, compared against the year of
            ``userUpdatedDate`` in its own UTC offset.
        previous_df: Optional path of a CSV written by an earlier run, used as
            a cache of already measured episodes.  A missing or empty file is
            treated as an empty cache.

    Returns:
        pandas.DataFrame with columns ``name``, ``url``, ``title``, ``length``
        (seconds), ``pubdate``, ``audio_url`` and ``updatedate``.  Cached rows
        that were not seen for ``year`` in this export keep ``updatedate`` empty.
    """
    columns = ['name', 'url', 'title', 'length', 'pubdate', 'audio_url', 'updatedate']
    date_format = '%Y-%m-%dT%H:%M:%S%z'

    tree = ET.parse(opml_file)

    df = None
    if previous_df is not None:
        try:
            df = pd.read_csv(previous_df)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # First run for this cache file: start from scratch instead of crashing.
            logging.warning('No usable cache at %s, starting empty', previous_df)
    if df is None:
        df = pd.DataFrame(columns=columns)
    else:
        # Dates are re-derived from the current export; rows that are not
        # re-confirmed below stay without a date.
        df['updatedate'] = None

    # (name, title) keys already cached, so the per-episode lookup is O(1).
    cached_keys = set(zip(df['name'], df['title']))
    # Episodes fetched during this run, keyed like the cache; insertion-ordered.
    fetched = {}

    name, url = None, None
    for node in tree.iter('outline'):
        node_type = node.attrib.get('type')
        if node_type == 'rss':
            # Feed header: the following episode nodes belong to this feed.
            name = node.attrib.get('text')
            url = node.attrib.get('xmlUrl')
            logging.info('%s(%s)', name, url)
            continue
        if node_type != 'podcast-episode' or node.attrib.get('played') != '1':
            continue

        title = node.attrib.get('title')
        enclosureurl = node.attrib.get('enclosureUrl')
        pubdate = node.attrib.get('pubDate')
        updatedate = node.attrib.get('userUpdatedDate')
        try:
            played_year = datetime.strptime(updatedate, date_format).year
        except (TypeError, ValueError):
            # Attribute missing (None) or in an unexpected format.
            logging.warning('Skipping %r: unusable userUpdatedDate %r', title, updatedate)
            continue
        if played_year != year:
            continue

        key = (name, title)
        if key in fetched:
            # Same episode listed twice in the export: keep the latest date seen.
            fetched[key]['updatedate'] = updatedate
            continue
        if key in cached_keys:
            # Plain boolean mask instead of a string-built query(), which broke
            # on titles containing quotes.
            cached = (df['name'] == name) & (df['title'] == title)
            df.loc[cached, 'updatedate'] = updatedate
            continue

        logging.info('  %s', title)
        try:
            length = _audio_length_seconds(enclosureurl)
        except Exception:
            # Best effort: one bad episode must not abort the run, but Ctrl-C
            # (KeyboardInterrupt) is deliberately not caught here.
            logging.exception('Error fetching %s', enclosureurl)
            continue
        fetched[key] = {
            'name': name,
            'url': url,
            'title': title,
            'length': length,
            'pubdate': pubdate,
            'audio_url': enclosureurl,
            # Stored right away so new episodes are not dropped by year filters.
            'updatedate': updatedate,
        }

    if fetched:
        # Append everything in one go rather than concatenating inside the loop.
        new_rows = pd.DataFrame(list(fetched.values()), columns=columns)
        df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
    return df

In [None]:
def plot_df(df, year, top_n=20, figsize=(200, 100), fontsize=80):
    """Print total listening time for ``year`` and chart the most-played feeds.

    Rows are kept when the year of ``updatedate`` equals ``year``; ``length``
    (seconds) is summed per ``name`` and converted to hours.  The total over
    all feeds is printed, then the ``top_n`` feeds are drawn as a bar chart
    with the hour value written on each bar.

    The input frame is left untouched (no helper column is added to it).

    Args:
        df: DataFrame with at least ``name``, ``length`` and ``updatedate``.
        year: Year to report on; coerced with ``int``.
        top_n: Number of feeds shown in the chart.
        figsize: Figure size handed to matplotlib, in inches.
        fontsize: Font size for the x tick labels and the bar labels.

    Returns:
        None.  When no row matches ``year`` only the (zero) total is printed
        and no figure is created.
    """
    # Years are taken after conversion to UTC (needed to mix different
    # offsets), so a play close to New Year can land in the neighbouring year.
    played_year = pd.to_datetime(df['updatedate'], utc=True).dt.year
    hours = (
        df.loc[played_year == int(year)]
        .groupby('name', as_index=False)['length']
        .sum()
    )
    hours['length'] = hours['length'] / 3600  # seconds -> hours
    hours = hours.sort_values(by='length', ascending=False)

    total_hours = hours['length'].sum()
    print('Total hours: %.1f (%.1f days)' % (total_hours, total_hours / 24))

    top = hours.iloc[:top_n]
    if top.empty:
        # Nothing to draw; ax.containers would be empty and indexing it fails.
        return

    _, ax = plt.subplots(figsize=figsize)
    sns.barplot(x='name', y='length', data=top, ax=ax)
    ax.tick_params(axis='x', labelrotation=70, labelsize=fontsize)
    ax.bar_label(ax.containers[0], fontsize=fontsize)

In [None]:
# Year to analyse; the cache file name below is derived from it so the two
# can never drift apart when the year is changed.
year = 2022
# OPML export that parse_overcast() reads.
export_filename = "overcast.opml"
# CSV of already measured episodes; read as the cache and rewritten after parsing.
dataframe_name = "pod_%d.csv" % year

In [None]:
# Parse the export for the configured year, using the CSV as the cache of
# episodes that were already measured.
df = parse_overcast(
    opml_file=export_filename,
    year=year,
    previous_df=dataframe_name,
)

In [None]:
# Write the result back to the cache file (without the index column) so the
# next run can pass it to parse_overcast() again.
df.to_csv(path_or_buf=dataframe_name, index=False)

In [None]:
# Print the yearly total and draw the per-feed bar chart.
plot_df(df=df, year=year)