Skip to content

Commit

Permalink
Subtitle extraction from streaming media manifests yt-dlp#247
Browse files Browse the repository at this point in the history
Authored by fstirlitz
Modified from: ytdl-org/youtube-dl#6144

Closes: #73
Fixes:
ytdl-org/youtube-dl#6106
ytdl-org/youtube-dl#14977
ytdl-org/youtube-dl#21438
ytdl-org/youtube-dl#23609
ytdl-org/youtube-dl#28132

Might also fix (untested):
ytdl-org/youtube-dl#15424
ytdl-org/youtube-dl#18267
ytdl-org/youtube-dl#23899
ytdl-org/youtube-dl#24375
ytdl-org/youtube-dl#24595
ytdl-org/youtube-dl#27899

Related:
ytdl-org/youtube-dl#22379
ytdl-org/youtube-dl#24517
ytdl-org/youtube-dl#24886
ytdl-org/youtube-dl#27215

Notes:
* The functions `extractor.common._extract_..._formats` are still kept for compatibility
* Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles`
* Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats
* AES support is untested
* The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players
    * Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`.
        Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file
    * The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit>
* Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools
    * Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg`
    * The webtt subs extracted from DASH can be parsed by <https://github.com/gpac/gpac>
    * But validity of the those extracted from ISM are untested
  • Loading branch information
pukkandan authored Apr 28, 2021
2 parents 1e387f2 + 47dcc18 commit 0227d0d
Show file tree
Hide file tree
Showing 21 changed files with 865 additions and 260 deletions.
14 changes: 14 additions & 0 deletions compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3018,10 +3018,24 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
return ctypes.WINFUNCTYPE(*args, **kwargs)


try:
compat_Pattern = re.Pattern
except AttributeError:
compat_Pattern = type(re.compile(''))


try:
compat_Match = re.Match
except AttributeError:
compat_Match = type(re.compile('').match(''))


__all__ = [
'compat_HTMLParseError',
'compat_HTMLParser',
'compat_HTTPError',
'compat_Match',
'compat_Pattern',
'compat_Struct',
'compat_b64decode',
'compat_basestring',
Expand Down
7 changes: 6 additions & 1 deletion downloader/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ def _read_ytdl_file(self, ctx):
assert 'ytdl_corrupt' not in ctx
stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
try:
ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
ytdl_data = json.loads(stream.read())
ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
if 'extra_state' in ytdl_data['downloader']:
ctx['extra_state'] = ytdl_data['downloader']['extra_state']
except Exception:
ctx['ytdl_corrupt'] = True
finally:
Expand All @@ -90,6 +93,8 @@ def _write_ytdl_file(self, ctx):
'index': ctx['fragment_index'],
},
}
if 'extra_state' in ctx:
downloader['extra_state'] = ctx['extra_state']
if ctx.get('fragment_count') is not None:
downloader['fragment_count'] = ctx['fragment_count']
frag_index_stream.write(json.dumps({'downloader': downloader}))
Expand Down
78 changes: 78 additions & 0 deletions downloader/hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import errno
import re
import io
import binascii
try:
from Crypto.Cipher import AES
Expand All @@ -27,7 +28,9 @@
parse_m3u8_attributes,
sanitize_open,
update_url_query,
bug_reports_message,
)
from .. import webvtt


class HlsFD(FragmentFD):
Expand Down Expand Up @@ -78,6 +81,8 @@ def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)

is_webvtt = info_dict['ext'] == 'vtt'

urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
man_url = urlh.geturl()
s = urlh.read().decode('utf-8', 'ignore')
Expand Down Expand Up @@ -142,6 +147,8 @@ def is_ad_fragment_end(s):
else:
self._prepare_and_start_frag_download(ctx)

extra_state = ctx.setdefault('extra_state', {})

fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
test = self.params.get('test', False)
Expand Down Expand Up @@ -308,13 +315,84 @@ def download_fragment(fragment):

return frag_content, frag_index

pack_fragment = lambda frag_content, _: frag_content

if is_webvtt:
def pack_fragment(frag_content, frag_index):
output = io.StringIO()
adjust = 0
for block in webvtt.parse_fragment(frag_content):
if isinstance(block, webvtt.CueBlock):
block.start += adjust
block.end += adjust

dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
cue = block.as_json

# skip the cue if an identical one appears
# in the window of potential duplicates
# and prune the window of unviable candidates
i = 0
skip = True
while i < len(dedup_window):
window_cue = dedup_window[i]
if window_cue == cue:
break
if window_cue['end'] >= cue['start']:
i += 1
continue
del dedup_window[i]
else:
skip = False

if skip:
continue

# add the cue to the window
dedup_window.append(cue)
elif isinstance(block, webvtt.Magic):
# take care of MPEG PES timestamp overflow
if block.mpegts is None:
block.mpegts = 0
extra_state.setdefault('webvtt_mpegts_adjust', 0)
block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
extra_state['webvtt_mpegts_adjust'] += 1
block.mpegts += 1 << 33
extra_state['webvtt_mpegts_last'] = block.mpegts

if frag_index == 1:
extra_state['webvtt_mpegts'] = block.mpegts or 0
extra_state['webvtt_local'] = block.local or 0
# XXX: block.local = block.mpegts = None ?
else:
if block.mpegts is not None and block.local is not None:
adjust = (
(block.mpegts - extra_state.get('webvtt_mpegts', 0))
- (block.local - extra_state.get('webvtt_local', 0))
)
continue
elif isinstance(block, webvtt.HeaderBlock):
if frag_index != 1:
# XXX: this should probably be silent as well
# or verify that all segments contain the same data
self.report_warning(bug_reports_message(
'Discarding a %s block found in the middle of the stream; '
'if the subtitles display incorrectly,'
% (type(block).__name__)))
continue
block.write_into(output)

return output.getvalue().encode('utf-8')

def append_fragment(frag_content, frag_index):
if frag_content:
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
try:
file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
file.close()
frag_content = pack_fragment(frag_content, frag_index)
self._append_fragment(ctx, frag_content)
return True
except EnvironmentError as ose:
Expand Down
54 changes: 42 additions & 12 deletions downloader/ism.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def write_piff_header(stream, params):
language = params.get('language', 'und')
height = params.get('height', 0)
width = params.get('width', 0)
is_audio = width == 0 and height == 0
stream_type = params['stream_type']
creation_time = modification_time = int(time.time())

ftyp_payload = b'isml' # major brand
Expand Down Expand Up @@ -77,7 +77,7 @@ def write_piff_header(stream, params):
tkhd_payload += u32.pack(0) * 2 # reserved
tkhd_payload += s16.pack(0) # layer
tkhd_payload += s16.pack(0) # alternate group
tkhd_payload += s88.pack(1 if is_audio else 0) # volume
tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0) # volume
tkhd_payload += u16.pack(0) # reserved
tkhd_payload += unity_matrix
tkhd_payload += u1616.pack(width)
Expand All @@ -93,19 +93,34 @@ def write_piff_header(stream, params):
mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box

hdlr_payload = u32.pack(0) # pre defined
hdlr_payload += b'soun' if is_audio else b'vide' # handler type
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name
if stream_type == 'audio': # handler type
hdlr_payload += b'soun'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'SoundHandler\0' # name
elif stream_type == 'video':
hdlr_payload += b'vide'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'VideoHandler\0' # name
elif stream_type == 'text':
hdlr_payload += b'subt'
hdlr_payload += u32.pack(0) * 3 # reserved
hdlr_payload += b'SubtitleHandler\0' # name
else:
assert False
mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box

if is_audio:
if stream_type == 'audio':
smhd_payload = s88.pack(0) # balance
smhd_payload += u16.pack(0) # reserved
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header
else:
elif stream_type == 'video':
vmhd_payload = u16.pack(0) # graphics mode
vmhd_payload += u16.pack(0) * 3 # opcolor
media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header
elif stream_type == 'text':
media_header_box = full_box(b'sthd', 0, 0, b'') # Subtitle Media Header
else:
assert False
minf_payload = media_header_box

dref_payload = u32.pack(1) # entry count
Expand All @@ -117,7 +132,7 @@ def write_piff_header(stream, params):

sample_entry_payload = u8.pack(0) * 6 # reserved
sample_entry_payload += u16.pack(1) # data reference index
if is_audio:
if stream_type == 'audio':
sample_entry_payload += u32.pack(0) * 2 # reserved
sample_entry_payload += u16.pack(params.get('channels', 2))
sample_entry_payload += u16.pack(params.get('bits_per_sample', 16))
Expand All @@ -127,7 +142,7 @@ def write_piff_header(stream, params):

if fourcc == 'AACL':
sample_entry_box = box(b'mp4a', sample_entry_payload)
else:
elif stream_type == 'video':
sample_entry_payload += u16.pack(0) # pre defined
sample_entry_payload += u16.pack(0) # reserved
sample_entry_payload += u32.pack(0) * 3 # pre defined
Expand Down Expand Up @@ -155,6 +170,18 @@ def write_piff_header(stream, params):
avcc_payload += pps
sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record
sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry
else:
assert False
elif stream_type == 'text':
if fourcc == 'TTML':
sample_entry_payload += b'http://www.w3.org/ns/ttml\0' # namespace
sample_entry_payload += b'\0' # schema location
sample_entry_payload += b'\0' # auxilary mime types(??)
sample_entry_box = box(b'stpp', sample_entry_payload)
else:
assert False
else:
assert False
stsd_payload += sample_entry_box

stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box
Expand Down Expand Up @@ -221,10 +248,13 @@ def real_download(self, filename, info_dict):

self._prepare_and_start_frag_download(ctx)

extra_state = ctx.setdefault('extra_state', {
'ism_track_written': False,
})

fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)

track_written = False
frag_index = 0
for i, segment in enumerate(segments):
frag_index += 1
Expand All @@ -236,11 +266,11 @@ def real_download(self, filename, info_dict):
success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
if not track_written:
if not extra_state['ism_track_written']:
tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
track_written = True
extra_state['ism_track_written'] = True
self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
Expand Down
10 changes: 6 additions & 4 deletions extractor/atresplayer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,19 @@ def _real_extract(self, url):
title = episode['titulo']

formats = []
subtitles = {}
for source in episode.get('sources', []):
src = source.get('src')
if not src:
continue
src_type = source.get('type')
if src_type == 'application/vnd.apple.mpegurl':
formats.extend(self._extract_m3u8_formats(
formats, subtitles = self._extract_m3u8_formats(
src, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
m3u8_id='hls', fatal=False)
elif src_type == 'application/dash+xml':
formats.extend(self._extract_mpd_formats(
src, video_id, mpd_id='dash', fatal=False))
formats, subtitles = self._extract_mpd_formats(
src, video_id, mpd_id='dash', fatal=False)
self._sort_formats(formats)

heartbeat = episode.get('heartbeat') or {}
Expand All @@ -115,4 +116,5 @@ def _real_extract(self, url):
'channel': get_meta('channel'),
'season': get_meta('season'),
'episode_number': int_or_none(get_meta('episodeNumber')),
'subtitles': subtitles,
}
14 changes: 10 additions & 4 deletions extractor/byutv.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _real_extract(self, url):

info = {}
formats = []
subtitles = {}
for format_id, ep in video.items():
if not isinstance(ep, dict):
continue
Expand All @@ -90,12 +91,16 @@ def _real_extract(self, url):
continue
ext = determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
m3u8_id='hls', fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
video_url, video_id, mpd_id='dash', fatal=False)
formats.extend(mpd_fmts)
subtitles = self._merge_subtitles(subtitles, mpd_subs)
else:
formats.append({
'url': video_url,
Expand All @@ -114,4 +119,5 @@ def _real_extract(self, url):
'display_id': display_id,
'title': display_id,
'formats': formats,
'subtitles': subtitles,
})
Loading

0 comments on commit 0227d0d

Please sign in to comment.