Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
executable file 407 lines (336 sloc) 15.3 KB
#!/usr/bin/env python
import itertools as it, functools as ft, operator as op
import pathlib as pl, datetime as dt, urllib.parse as up, hashlib as hl, subprocess as sp
import xml.etree.ElementTree as etree
import os, sys, re, logging, time, calendar, base64, json, textwrap, unicodedata
import feedparser as fp # pip install --user feedparser
import http.client # feedparser fetches can raise errors from here
class LogMessage:
    """Log message wrapper which defers str.format()-style templating until str() is called."""

    def __init__(self, fmt, a, k):
        # fmt - template string, a - positional format args, k - keyword format args
        self.fmt = fmt
        self.a = a
        self.k = k

    def __str__(self):
        # Template is returned as-is when there is nothing to substitute into it
        if not (self.a or self.k):
            return self.fmt
        return self.fmt.format(*self.a, **self.k)
class LogStyleAdapter(logging.LoggerAdapter):
    """LoggerAdapter that wraps every message into LogMessage for {}-style formatting."""

    def __init__(self, logger, extra=None):
        super().__init__(logger, extra or dict())

    def log(self, level, msg, *args, **kws):
        """Emit msg at level, using args/kws as str.format() arguments for it.

        Only the "exc_info" keyword is passed through to the underlying logger,
        all other keywords end up as format arguments for the message.
        """
        if not self.isEnabledFor(level):
            return
        log_kws = dict()
        if 'exc_info' in kws:
            log_kws['exc_info'] = kws.pop('exc_info')
        msg, kws = self.process(msg, kws)
        message = LogMessage(msg, args, kws)
        self.logger._log(level, message, (), **log_kws)
def err_fmt(err):
    """Return "[ExceptionClassName] message" string for an exception instance."""
    err_type = err.__class__.__name__
    return '[{}] {}'.format(err_type, err)


def get_logger(name):
    """Return {}-style logger adapter for the "ytf2e.<name>" logger."""
    logger = logging.getLogger(f'ytf2e.{name}')
    return LogStyleAdapter(logger)


def str_norm(v):
    """Normalize string for loose comparisons: strip, NFKC-normalize and casefold it."""
    stripped = v.strip()
    return unicodedata.normalize('NFKC', stripped).casefold()
def str_hash(p):
    """Return short (12-char) urlsafe-base64 blake2s hash of str(p)."""
    digest = hl.blake2s(str(p).encode(), person=b'ytf2e.s1').digest()
    return base64.urlsafe_b64encode(digest).decode()[:12]


def tuple_hash(*data):
    """Return short string hash for a sequence of simple values.

    Values can be passed either as separate arguments or as a single tuple/list.
    Supported value types: None, int, str, tzinfo, datetime, timedelta,
    set, and nested tuples/lists of these (hashed recursively).

    Raises:
        ValueError: for a value of any other type.
    """
    if len(data) == 1 and isinstance(data[0], (tuple, list)): data = data[0]
    src = list()
    for v in data:
        if v is None: src.append('\ue003')
        elif isinstance(v, (int, str, dt.tzinfo)): src.append(str(v))
        elif isinstance(v, (tuple, list)): src.append(tuple_hash(v))
        elif isinstance(v, dt.datetime):
            # NOTE(review): conv_ts_utc is not defined in the visible part of this file,
            #  so this branch will raise NameError unless it is provided elsewhere
            src.append(conv_ts_utc(v).strftime('%Y-%m-%dT%H:%M:%S'))
        elif isinstance(v, dt.timedelta):
            # f-string prefix is required here, otherwise all timedeltas hash the same
            src.append(f'\ue002{v.total_seconds()}')
        elif isinstance(v, set): src.append(tuple_hash(sorted(v)))
        else: raise ValueError(type(v), v)
    # Values are wrapped in \ue001 (doubled when found inside) and joined by \ue000,
    #  so that different value sequences cannot produce same string to hash
    return str_hash('\ue000'.join(
        '\ue001{}\ue001'.format(v.replace('\ue001', '\ue001'*2)) for v in src ))
def str_repr(s, max_len=160, len_bytes=False, ext=' ...[{s_len}]'):
    """Return repr-escaped (unquoted) string for s, truncated to max_len if necessary.

    Args:
        s: value to represent - bytes are decoded as utf-8, non-strings go through str().
        max_len: length limit for the result, zero or negative value disables truncation.
        len_bytes: apply max_len to utf-8 encoded length instead of character count.
        ext: template for a suffix added on truncation, where s_len
            is replaced by "<max_len>/<original length>" string.
    """
    if isinstance(s, bytes): s = s.decode('utf-8', 'replace')
    if not isinstance(s, str): s = str(s)
    len_orig = f'{len(s):,d}'
    # Sample suffix is only used to estimate how much space to reserve for a real one
    ext_sample = ext.format(s_len='12/345')
    text = repr(s)[1:-1].replace("\\'", "'").replace('\\"', '"')
    if max_len <= 0 or len(text) <= max_len:
        return text
    suffix = ext.format(s_len=f'{max_len}/{len_orig}')
    if len_bytes:
        keep = max_len - len(ext_sample.encode())
        head = text.encode()[:keep].decode(errors='ignore')
    else:
        head = text[:max_len - len(ext_sample)]
    return head + suffix
def dd(text):
    """Dedent multiline text, strip outer newlines, add one at the end, replace tabs."""
    body = textwrap.dedent(text).strip('\n')
    return (body + '\n').replace('\t', ' ')


def fill(s, w=90, ind='', ind_next=' ', **k):
    """Wrap text s to width w via textwrap.fill().

    ind is an indent for the first line, ind_next is for all subsequent ones,
    with ind_next=None reusing ind for these. Extra keywords are passed to textwrap.fill().
    """
    if ind_next is None:
        ind_next = ind
    return textwrap.fill(s, w, initial_indent=ind, subsequent_indent=ind_next, **k)
class YTFeed:
    """Single YouTube channel feed, along with its check-scheduling state.

    State attributes (ts_last_check, delay_ewma, etag, seen_entries)
    have class-level defaults here and get overidden per-instance elsewhere.
    """

    title = chan = url = title_filter = None
    ts_last_check = 0  # 0 - feed was never checked
    delay_ewma = 24 * 3600  # seconds, used to schedule next check in ts_check_next
    delay_ewma_max = 30 * 24 * 3600  # seconds
    delay_ewma_a = 0.3
    etag = seen_entries = None

    def __init__(self, url, title=None, title_filter=None, chan=None):
        """Create feed from its URL.

        Args:
            url: feed URL, must have channel_id query parameter, unless chan is specified.
            title: feed title, defaults to "chan.<chan>".
                Whitespace in it is collapsed, and it is shortened to 40 characters.
            title_filter: optional compiled regexp for filter_check method.
            chan: channel id, parsed from URL by default.

        Raises:
            KeyError: if chan is not specified and URL has no channel_id parameter.
            ValueError: if URL has multiple channel_id parameters.
        """
        if not chan: chan, = up.parse_qs(up.urlparse(url).query)['channel_id']
        if not title: title = f'chan.{chan}'
        title = ' '.join(title.replace('\n', ' ').split())
        if len(title) > 40: title = f'{title[:38]}--'
        self.title, self.chan, self.url, self.title_filter = title, chan, url, title_filter

    def __repr__(self):
        chan = self.chan
        if self.title_filter:
            # Show regexp source for compiled patterns, or whatever else filter is as-is
            title_filter = getattr(self.title_filter, 'pattern', self.title_filter)
            chan = f'{chan} || {title_filter}'
        return f'YTFeed({self.title} [{chan}])'

    @classmethod
    def from_xml(cls, attrs):
        """Create feed from a mapping with "xmlUrl" and optional "title" keys."""
        return cls(attrs['xmlUrl'], attrs.get('title'))

    @classmethod
    def from_line(cls, line):
        """Create feed from "<url> [<title> [|| <title-filter-regexp>]]" text line.

        Raises:
            ValueError: if line is empty or has only whitespace in it.
        """
        title_filter, line = None, line.strip().split(None, 1)
        if not line: raise ValueError('Empty feed line')
        url, title = line if len(line) > 1 else (line[0], None)
        # Title is None for lines that only have URL in them
        if title and '||' in title:
            title, title_filter = map(str.strip, title.split('||', 1))
            title_filter = re.compile(title_filter)
        return cls(url, title, title_filter)

    def ts_check_next(self):
        """Return timestamp after which this feed should be checked again."""
        # /2 to run 2x fetches per (average-ish) interval between entries
        return self.ts_last_check + self.delay_ewma / 2

    def entry_id(self, e):
        """Return hashed id for feed entry e, or None if there is nothing to derive it from."""
        e_id = None
        # First non-empty key from this list is used as a source for the id
        for k in 'id', 'yt_videoid', 'title', 'link', 'published', 'modified', 'created':
            if e_id := e.get(k): break
        if e_id: e_id = tuple_hash('id.1', e_id)
        return e_id

    def filter_check(self, title):
        """Return True if entry title matches title_filter, or if there is no filter."""
        if not self.title_filter: return True
        return bool(self.title_filter.search(title))
class YTFeedIndex:
    """Collection of YTFeed objects, indexed by their channel id."""

    def __init__(self, *feed_idxs):
        # Any number of sources can be merged, with later ones overriding same-chan feeds
        self.idx = dict()
        for src in feed_idxs:
            self.add(src)

    def add(self, feeds):
        """Add a single YTFeed, another YTFeedIndex, or any iterable of YTFeed objects."""
        if isinstance(feeds, YTFeed):
            self.idx[feeds.chan] = feeds
            return
        if isinstance(feeds, YTFeedIndex):
            self.idx.update(feeds.idx)
            return
        for feed in feeds:
            self.idx[feed.chan] = feed

    def get(self, chan, fallback=None):
        """Return feed for the channel id, or fallback value if there is none."""
        return self.idx.get(chan, fallback)

    def __bool__(self):
        return bool(self.idx)

    def __contains__(self, f):
        # Membership is checked by channel id of the feed object
        return f.chan in self.idx

    def __len__(self):
        return len(self.idx)

    def __iter__(self):
        return iter(self.idx.values())
# Line format: {ts} :: {chan-id} {chan-name!r} :: {update-json}
state_log_name = 'updates.log'
state_log_max_size = 3 * 2**20  # bytes, larger log gets compacted by state_process


def state_process(state_dir, feeds):
    """Load last state for each known feed from the log file in state_dir.

    Last update found in the log for each feed in feeds is applied to that feed object.
    Updates for channels missing in feeds are ignored, unparseable lines are logged as errors.
    If log is larger than state_log_max_size, it is renamed to have .old suffix,
    and a new one is created in its place, with only last update for each feed.

    Args:
        state_dir: directory for the log file, created if missing. Empty value disables state.
        feeds: object with get(chan) method returning feed for channel id or a false value.

    Returns:
        Path of the state log file, or None if state_dir is empty.

    Raises:
        KeyError: if update in the log has no required key (any one except seen_entries).
    """
    if not state_dir: return
    state_dir, log = pl.Path(state_dir), get_logger('state')
    state_dir.mkdir(mode=0o700, parents=True, exist_ok=True)

    state_last, state_log = dict(), state_dir / state_log_name
    with state_log.open('a+') as src:  # a+ to create the file, if it is missing
        src.seek(0)
        for line in src:
            # Lookup is within same try-block to skip json records without "chan" as well
            try:
                update = json.loads(line.split(' :: ', 2)[-1])
                feed = feeds.get(update['chan'])
            except Exception as err:
                log.error('Failed to process feed-state entry: {} -- {!r}', err_fmt(err), line)
                continue
            if not feed:
                log.debug('Dropping update(s) for nx feed: {}', str_repr(line))
                continue
            state_last[feed.chan] = line, feed, update

    # State is applied before log rotation, so that errors here leave the log file intact
    for line, feed, update in state_last.values():
        for k in 'ts_last_check etag seen_entries delay_ewma'.split():
            try: setattr(feed, k, update[k])
            except KeyError:
                if k != 'seen_entries': raise

    if state_log.stat().st_size > state_log_max_size:
        state_log.rename(state_dir / f'{state_log_name}.old')
        with state_log.open('a') as dst:
            dst.writelines(line for line, feed, update in state_last.values())
    return state_log
def state_update(state_log, ts, feed, update):
    """Append "{ts} :: {chan-id} {chan-name!r} :: {update-json}" line to state_log file.

    Args:
        state_log: path-like object with open() method, for a file to append to.
        ts: posix timestamp of the update, formatted as local time with minute precision.
        feed: object with chan and title attributes.
        update: json-serializable update data.
    """
    ts_str = time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))
    update_json = json.dumps(update)
    entry = f'{ts_str} :: {feed.chan} {feed.title!r} :: {update_json}\n'
    with state_log.open('a') as dst:
        dst.write(entry)
class FeedFetchError(Exception):
    """Raised by feed_fetch on feedparser errors and unusable http response statuses."""


# Values for User-Agent and Accept http headers, sent with feed requests
feed_user_agent = f'yt-feed-to-email/0.1 feedparser/{fp.__version__}'
feed_accept_header = (
    'application/atom+xml,application/rdf+xml,'
    'application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2' )
def feed_fetch(log, url, etag):
    """Fetch and parse feed from URL via feedparser.

    Args:
        log: logger object, not used here at the moment.
        url: feed URL.
        etag: etag value from an earlier fetch to pass to feedparser, or None.

    Returns:
        (etag, entries) tuple for a fetched feed,
        or None if response status is 304, i.e. when feed was not modified.

    Raises:
        FeedFetchError: on http.client errors, feedparser errors
            without response status, missing status, or any status >= 300 other than 304.
    """
    try:
        feed = fp.parse(
            url, agent=feed_user_agent, etag=etag,
            request_headers=dict(Accept=feed_accept_header) )
        status = feed.get('status')
        bozo = feed.get('bozo')
        bozo_err = feed.get('bozo_exception')
    except http.client.HTTPException as err:
        status, bozo, bozo_err = 0, True, err_fmt(err)

    # Missing status is treated same as an error status here
    failed = (not status and bozo) or (status or 1000) >= 400
    if failed:
        raise FeedFetchError(f'feedparser error (status={status}): {url} - {bozo_err}')
    if status == 304:
        return
    if status >= 300:
        raise FeedFetchError(f'Unhandled 3xx response status {status}')
    return feed.get('etag'), feed.entries
def feed_process(log, feed, entries, ts, ts_since=None):
    """Process fetched feed entries into a list of notifications and updated feed state.

    Args:
        log: logger with {}-style debug/warning methods.
        feed: feed object, providing filter_check and entry_id methods,
            delay_ewma, delay_ewma_a, delay_ewma_max and seen_entries attributes.
        entries: parsed feed entries, with first old/seen one stopping delay calculation.
        ts: posix timestamp of the current check.
        ts_since: if not None, entries published at or before this timestamp are skipped.

    Returns:
        (emails, ewma_delay, feed_ids) tuple, where emails is a list of (entry-timestamp,
        subject, body) tuples for new entries, ewma_delay is an updated moving average of delay
        between entries, and feed_ids is an {entry-id: timestamp} dict of merged
        old and current entries, ordered from newest to oldest timestamp.

    Raises:
        ValueError: if id cannot be produced for some entry.
    """
    first_old_id = e_ts_prev = None
    ewma_delay, ewma_a = feed.delay_ewma, feed.delay_ewma_a
    emails, feed_ids, seen_ids = list(), dict(), feed.seen_entries or dict()

    for e in entries:
        if not feed.filter_check(e.title): continue
        e_id, e_ts = feed.entry_id(e), calendar.timegm(e.published_parsed)
        if not e_id: raise ValueError(f'Failed to get id for feed entry: {str_repr(e)}')
        feed_ids[e_id] = e_ts

        if not first_old_id: # skips all older entries in ewma calculation
            if e_ts_prev:
                # abs() because delay between entries must not depend on their order in the feed,
                #  as signed difference will be negative for a newest-first feed,
                #  pushing the average (and delay until next check with it) below zero
                ewma_delay = ewma_a * abs(e_ts - e_ts_prev) + (1 - ewma_a) * ewma_delay
            e_ts_prev = e_ts

        if ts_since is not None and e_ts <= ts_since:
            log.debug( 'Skipping old entry [id={} ts={}]: pub={}'
                ' author={} title={!r}', e_id, e_ts, e.published, e.author, e.title )
            if not first_old_id: first_old_id = e_id
            continue
        if e_id in seen_ids:
            log.debug( 'Skipping already-seen entry [id={} ts={}]: pub={}'
                ' author={} title={!r}', e_id, e_ts, e.published, e.author, e.title )
            if not first_old_id: first_old_id = e_id
            continue

        log.debug( 'Generating notification for entry [{}]:'
            ' ts={} author={!r} title={!r}', e_ts, e.published, e.author, e.title )
        body = dd(f'''
            Title: {e.title}
            Author: {e.author}
            Published: {e.published}
            Link: {e.link}''')
        if summary := e.get('summary'):
            body += '\nSummary:\n' + process_summary(summary)
        emails.append((e_ts, f'YT [{e.author}]: {e.title}', body))

    if e_ts_prev and not emails and (ts - e_ts_prev) > ewma_delay:
        # This makes empty checks bump delay up
        ewma_delay = ewma_a * (ts - e_ts_prev) + (1 - ewma_a) * ewma_delay
        ewma_delay = min(feed.delay_ewma_max, ewma_delay)
    elif not e_ts_prev: log.warning('Empty feed - check/remove it from list') # can be a bug too

    # Find oldest common e_id and merge all newer seen_ids into feed_ids
    # This is done to avoid notifications for flapping visible/hidden entries
    n = 0
    for n, e_id in enumerate(reversed(list(seen_ids))):
        if e_id in feed_ids: break
    if n > 0: seen_ids = dict((e_id, seen_ids[e_id]) for e_id in list(seen_ids)[:-n])
    feed_ids = dict(sorted( # (e_id, ts) with later ts taking prio for same id
        it.chain(seen_ids.items(), feed_ids.items()), key=op.itemgetter(1) ))
    feed_ids = dict(sorted(feed_ids.items(), key=op.itemgetter(1), reverse=True))

    return emails, ewma_delay, feed_ids
def process_summary(text, w=120, pre=' '):
    """Format entry summary text for a notification body.

    Each line of the text is wrapped to width w via fill() and prefixed by pre,
    with lines that repeat previous one (ignoring trailing spaces) being dropped.
    """
    chunks, prev = list(), None
    for raw in text.strip().split('\n'):
        current = raw.rstrip()
        if current == prev:
            continue
        prev = current
        wrapped = fill(raw, w) + '\n'
        chunks.append(f'{pre}{wrapped}')
    return ''.join(chunks)
def main(args=None):
    """Script entry point - check feeds and send email notifications for new entries.

    Args:
        args: list of command-line arguments, with None meaning sys.argv[1:].

    Returns:
        None, to be used as an exit code value.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Script to generate email notifications for YouTube OPML/RSS feed updates.')

    group = parser.add_argument_group('Feed sources')
    group.add_argument('-o', '--opml', metavar='file',
        help='YT OPML export from https://www.youtube.com/subscription_manager?action_takeout=1 link.')
    group.add_argument('-r', '--rss-list', metavar='file',
        help='File with YT RSS/Atom feed URLs, one per line, with optional comments after URLs.')
    group.add_argument('-c', '--opml-convert', action='store_true',
        help='Append all new feeds from specified -o/--opml file to -r/--rss-list and exit.')

    group = parser.add_argument_group('Notification options')
    group.add_argument('-e', '--email', metavar='addr', required=True,
        help='Email address to send video notifications to via local "mail" command.')
    group.add_argument('-d', '--email-delay',
        type=float, metavar='float', default=1.1,
        help='Delay between running notification-command in seconds.'
            ' Useful to make sure emails sort by date/time correctly within same channel,'
            ' and don\'t all have same exact timestamp. Default is %(default)ss (0 - disable).')

    group = parser.add_argument_group('State storage/init')
    group.add_argument('-s', '--state-dir', default='state', metavar='path',
        help='Directory to use for storing per-feed "last check" timestamps. Default: %(default)s')
    group.add_argument('-t', '--new-feed-time',
        type=float, metavar='posix-ts',
        help='Timestamp to fetch entries after for new/unknown feeds.'
            ' Default or 0 is to generate notifications for all entries in new feeds.')

    group = parser.add_argument_group('Check filtering and rate-limiting')
    group.add_argument('-n', '--feed-name', metavar='name',
        help='Name (part) of a specific feed to check regardless of timestamps.')
    group.add_argument('-m', '--max-checks', type=int, metavar='n',
        help='Limit on number of feeds to check in one run. Default or 0 - no limit.')
    group.add_argument('-f', '--force', action='store_true',
        help='Force-check feeds regardless of timestamps.')

    group = parser.add_argument_group('Debug options')
    group.add_argument('--debug', action='store_true', help='Verbose operation mode.')
    group.add_argument('--dry-run', action='store_true',
        help='Run same stuff, but do not send emails or update state.')

    opts = parser.parse_args(sys.argv[1:] if args is None else args)

    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.WARNING)
    log = get_logger('main')

    feeds_rss, feeds_opml = list(), list()

    if opts.opml:
        def _get_outlines(e, feeds=None):
            # Recursively collects feeds from all nested elements with xmlUrl attribute
            if feeds is None: feeds = list()
            for o in e:
                if o.attrib.get('xmlUrl'): feeds.append(YTFeed.from_xml(o.attrib))
                else: _get_outlines(o, feeds)
            return feeds
        opml = etree.fromstring(pl.Path(opts.opml).read_text())
        feeds_opml = _get_outlines(opml.find('body'))
        log.debug('Parsed OPML: feeds={}', len(feeds_opml))
        feeds_opml = YTFeedIndex(feeds_opml)

    if opts.rss_list:
        rss_list = pl.Path(opts.rss_list)
        if rss_list.exists():
            with rss_list.open() as src:
                # Blank lines have no URL to parse, so are skipped here
                feeds_rss = list(YTFeed.from_line(line) for line in src if line.strip())
            log.debug('Parsed RSS-list: feeds={}', len(feeds_rss))
        feeds_rss = YTFeedIndex(feeds_rss)

    if opts.opml_convert:
        if not opts.rss_list: parser.error('-c/--opml-convert requires -r/--rss-list option')
        n = 0
        with rss_list.open('a') as dst:
            for feed in feeds_opml:
                if feed in feeds_rss: continue
                dst.write(f'{feed.url} {feed.title}\n')
                print(f'Added feed: {feed}')
                n += 1
        print(f'-- added feeds: {n}')
        return

    feeds = YTFeedIndex(feeds_opml, feeds_rss)
    state_log = state_process(opts.state_dir, feeds)
    log.debug('Feed Index: feeds={} state-log={}', len(feeds), state_log)

    ts, ts_email, ts_new_feed = time.time(), 0, opts.new_feed_time or 0
    # Counts down on each processed feed, stopping the loop on reaching zero, 0 - no limit
    check_limit = opts.max_checks or 0
    feed_lookup = opts.feed_name and str_norm(opts.feed_name)

    for feed in feeds:
        if feed_lookup:
            if feed_lookup not in str_norm(feed.title): continue
        elif not opts.force and ts < feed.ts_check_next(): continue

        feed_log = get_logger(f'feed.{feed.chan}')
        feed_log.debug('Fetching feed: {} [ {} ]', feed.title, feed.url)
        try: res = feed_fetch(feed_log, feed.url, feed.etag)
        except FeedFetchError as err:
            feed_log.error('Failed to fetch feed: {}', err)
            continue
        if res is None:
            # feed_fetch returns None instead of a tuple for not-modified (304) responses
            feed_log.debug('Feed was not modified since last check')
            continue
        etag, entries = res
        if entries is None: continue

        ts_since = None
        if not feed.seen_entries:
            ts_since = feed.ts_last_check or ts_new_feed
        log.debug( 'Processing feed:'
            ' entries={} etag={!r} new-since={}', len(entries), etag, ts_since )
        emails, ewma_delay, seen_ids = \
            feed_process(feed_log, feed, entries, ts, ts_since)

        if emails and not opts.dry_run:
            log.debug( 'Sending notification emails:'
                ' count={} delay={:.1f}', len(emails), opts.email_delay )
            for e_ts, subject, body in sorted(emails):
                time.sleep(max(0, opts.email_delay - (time.time() - ts_email)))
                sp.run(
                    ['mail', '-s', subject, opts.email],
                    input=body.encode(), timeout=4*60, check=True )
                ts_email = time.time()

        update = dict(
            chan=feed.chan, delay_ewma=ewma_delay,
            ts_last_check=ts, etag=etag, seen_entries=seen_ids )
        # There is no state log if state-dir is set to an empty value
        if state_log is not None and not opts.dry_run:
            state_update(state_log, ts, feed, update)

        if check_limit:
            check_limit -= 1
            if check_limit <= 0:
                log.debug('Stopping due to -m/--max-checks limit')
                break

    log.debug('Finished')


if __name__ == '__main__': sys.exit(main())