Subtitle extraction from streaming media manifests yt-dlp#247

Authored by fstirlitz Modified from: ytdl-org/youtube-dl#6144 Closes: #73 Fixes: ytdl-org/youtube-dl#6106 ytdl-org/youtube-dl#14977 ytdl-org/youtube-dl#21438 ytdl-org/youtube-dl#23609 ytdl-org/youtube-dl#28132 Might also fix (untested): ytdl-org/youtube-dl#15424 ytdl-org/youtube-dl#18267 ytdl-org/youtube-dl#23899 ytdl-org/youtube-dl#24375 ytdl-org/youtube-dl#24595 ytdl-org/youtube-dl#27899 Related: ytdl-org/youtube-dl#22379 ytdl-org/youtube-dl#24517 ytdl-org/youtube-dl#24886 ytdl-org/youtube-dl#27215 Notes: * The functions `extractor.common._extract_..._formats` are still kept for compatibility * Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles` * Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats * AES support is untested * The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players * Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`. Once the unnecessary headers are stripped out of this, it becomes a valid self-contained TTML file * The TTML subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit> * Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools * Unlike the TTML files, the XML fragments of these cannot be dumped using `ffmpeg` * The WebVTT subs extracted from DASH can be parsed by <https://github.com/gpac/gpac> * But the validity of those extracted from ISM is untested
nixxo · Apr 28, 2021 · 0227d0d · 0227d0d
2 parents 1e387f2 + 47dcc18
commit 0227d0d
Show file tree

Hide file tree

Showing 21 changed files with 865 additions and 260 deletions.
diff --git a/compat.py b/compat.py
@@ -3018,10 +3018,24 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
         return ctypes.WINFUNCTYPE(*args, **kwargs)
 
 
+try:
+    compat_Pattern = re.Pattern
+except AttributeError:
+    compat_Pattern = type(re.compile(''))
+
+
+try:
+    compat_Match = re.Match
+except AttributeError:
+    compat_Match = type(re.compile('').match(''))
+
+
 __all__ = [
     'compat_HTMLParseError',
     'compat_HTMLParser',
     'compat_HTTPError',
+    'compat_Match',
+    'compat_Pattern',
     'compat_Struct',
     'compat_b64decode',
     'compat_basestring',

diff --git a/downloader/fragment.py b/downloader/fragment.py
@@ -77,7 +77,10 @@ def _read_ytdl_file(self, ctx):
         assert 'ytdl_corrupt' not in ctx
         stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
         try:
-            ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+            ytdl_data = json.loads(stream.read())
+            ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index']
+            if 'extra_state' in ytdl_data['downloader']:
+                ctx['extra_state'] = ytdl_data['downloader']['extra_state']
         except Exception:
             ctx['ytdl_corrupt'] = True
         finally:
@@ -90,6 +93,8 @@ def _write_ytdl_file(self, ctx):
                 'index': ctx['fragment_index'],
             },
         }
+        if 'extra_state' in ctx:
+            downloader['extra_state'] = ctx['extra_state']
         if ctx.get('fragment_count') is not None:
             downloader['fragment_count'] = ctx['fragment_count']
         frag_index_stream.write(json.dumps({'downloader': downloader}))

diff --git a/downloader/hls.py b/downloader/hls.py
@@ -2,6 +2,7 @@
 
 import errno
 import re
+import io
 import binascii
 try:
     from Crypto.Cipher import AES
@@ -27,7 +28,9 @@
     parse_m3u8_attributes,
     sanitize_open,
     update_url_query,
+    bug_reports_message,
 )
+from .. import webvtt
 
 
 class HlsFD(FragmentFD):
@@ -78,6 +81,8 @@ def real_download(self, filename, info_dict):
         man_url = info_dict['url']
         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
 
+        is_webvtt = info_dict['ext'] == 'vtt'
+
         urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
         man_url = urlh.geturl()
         s = urlh.read().decode('utf-8', 'ignore')
@@ -142,6 +147,8 @@ def is_ad_fragment_end(s):
         else:
             self._prepare_and_start_frag_download(ctx)
 
+        extra_state = ctx.setdefault('extra_state', {})
+
         fragment_retries = self.params.get('fragment_retries', 0)
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
         test = self.params.get('test', False)
@@ -308,13 +315,84 @@ def download_fragment(fragment):
 
                 return frag_content, frag_index
 
+            pack_fragment = lambda frag_content, _: frag_content
+
+            if is_webvtt:
+                def pack_fragment(frag_content, frag_index):
+                    output = io.StringIO()
+                    adjust = 0
+                    for block in webvtt.parse_fragment(frag_content):
+                        if isinstance(block, webvtt.CueBlock):
+                            block.start += adjust
+                            block.end += adjust
+
+                            dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
+                            cue = block.as_json
+
+                            # skip the cue if an identical one appears
+                            # in the window of potential duplicates
+                            # and prune the window of unviable candidates
+                            i = 0
+                            skip = True
+                            while i < len(dedup_window):
+                                window_cue = dedup_window[i]
+                                if window_cue == cue:
+                                    break
+                                if window_cue['end'] >= cue['start']:
+                                    i += 1
+                                    continue
+                                del dedup_window[i]
+                            else:
+                                skip = False
+
+                            if skip:
+                                continue
+
+                            # add the cue to the window
+                            dedup_window.append(cue)
+                        elif isinstance(block, webvtt.Magic):
+                            # take care of MPEG PES timestamp overflow
+                            if block.mpegts is None:
+                                block.mpegts = 0
+                            extra_state.setdefault('webvtt_mpegts_adjust', 0)
+                            block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
+                            if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
+                                extra_state['webvtt_mpegts_adjust'] += 1
+                                block.mpegts += 1 << 33
+                            extra_state['webvtt_mpegts_last'] = block.mpegts
+
+                            if frag_index == 1:
+                                extra_state['webvtt_mpegts'] = block.mpegts or 0
+                                extra_state['webvtt_local'] = block.local or 0
+                                # XXX: block.local = block.mpegts = None ?
+                            else:
+                                if block.mpegts is not None and block.local is not None:
+                                    adjust = (
+                                        (block.mpegts - extra_state.get('webvtt_mpegts', 0))
+                                        - (block.local - extra_state.get('webvtt_local', 0))
+                                    )
+                                continue
+                        elif isinstance(block, webvtt.HeaderBlock):
+                            if frag_index != 1:
+                                # XXX: this should probably be silent as well
+                                # or verify that all segments contain the same data
+                                self.report_warning(bug_reports_message(
+                                    'Discarding a %s block found in the middle of the stream; '
+                                    'if the subtitles display incorrectly,'
+                                    % (type(block).__name__)))
+                                continue
+                        block.write_into(output)
+
+                    return output.getvalue().encode('utf-8')
+
             def append_fragment(frag_content, frag_index):
                 if frag_content:
                     fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
                     try:
                         file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
                         ctx['fragment_filename_sanitized'] = frag_sanitized
                         file.close()
+                        frag_content = pack_fragment(frag_content, frag_index)
                         self._append_fragment(ctx, frag_content)
                         return True
                     except EnvironmentError as ose:

diff --git a/downloader/ism.py b/downloader/ism.py
@@ -48,7 +48,7 @@ def write_piff_header(stream, params):
     language = params.get('language', 'und')
     height = params.get('height', 0)
     width = params.get('width', 0)
-    is_audio = width == 0 and height == 0
+    stream_type = params['stream_type']
     creation_time = modification_time = int(time.time())
 
     ftyp_payload = b'isml'  # major brand
@@ -77,7 +77,7 @@ def write_piff_header(stream, params):
     tkhd_payload += u32.pack(0) * 2  # reserved
     tkhd_payload += s16.pack(0)  # layer
     tkhd_payload += s16.pack(0)  # alternate group
-    tkhd_payload += s88.pack(1 if is_audio else 0)  # volume
+    tkhd_payload += s88.pack(1 if stream_type == 'audio' else 0)  # volume
     tkhd_payload += u16.pack(0)  # reserved
     tkhd_payload += unity_matrix
     tkhd_payload += u1616.pack(width)
@@ -93,19 +93,34 @@ def write_piff_header(stream, params):
     mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload)  # Media Header Box
 
     hdlr_payload = u32.pack(0)  # pre defined
-    hdlr_payload += b'soun' if is_audio else b'vide'  # handler type
-    hdlr_payload += u32.pack(0) * 3  # reserved
-    hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0'  # name
+    if stream_type == 'audio':  # handler type
+        hdlr_payload += b'soun'
+        hdlr_payload += u32.pack(0) * 3  # reserved
+        hdlr_payload += b'SoundHandler\0'  # name
+    elif stream_type == 'video':
+        hdlr_payload += b'vide'
+        hdlr_payload += u32.pack(0) * 3  # reserved
+        hdlr_payload += b'VideoHandler\0'  # name
+    elif stream_type == 'text':
+        hdlr_payload += b'subt'
+        hdlr_payload += u32.pack(0) * 3  # reserved
+        hdlr_payload += b'SubtitleHandler\0'  # name
+    else:
+        assert False
     mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload)  # Handler Reference Box
 
-    if is_audio:
+    if stream_type == 'audio':
         smhd_payload = s88.pack(0)  # balance
         smhd_payload += u16.pack(0)  # reserved
         media_header_box = full_box(b'smhd', 0, 0, smhd_payload)  # Sound Media Header
-    else:
+    elif stream_type == 'video':
         vmhd_payload = u16.pack(0)  # graphics mode
         vmhd_payload += u16.pack(0) * 3  # opcolor
         media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload)  # Video Media Header
+    elif stream_type == 'text':
+        media_header_box = full_box(b'sthd', 0, 0, b'')  # Subtitle Media Header
+    else:
+        assert False
     minf_payload = media_header_box
 
     dref_payload = u32.pack(1)  # entry count
@@ -117,7 +132,7 @@ def write_piff_header(stream, params):
 
     sample_entry_payload = u8.pack(0) * 6  # reserved
     sample_entry_payload += u16.pack(1)  # data reference index
-    if is_audio:
+    if stream_type == 'audio':
         sample_entry_payload += u32.pack(0) * 2  # reserved
         sample_entry_payload += u16.pack(params.get('channels', 2))
         sample_entry_payload += u16.pack(params.get('bits_per_sample', 16))
@@ -127,7 +142,7 @@ def write_piff_header(stream, params):
 
         if fourcc == 'AACL':
             sample_entry_box = box(b'mp4a', sample_entry_payload)
-    else:
+    elif stream_type == 'video':
         sample_entry_payload += u16.pack(0)  # pre defined
         sample_entry_payload += u16.pack(0)  # reserved
         sample_entry_payload += u32.pack(0) * 3  # pre defined
@@ -155,6 +170,18 @@ def write_piff_header(stream, params):
             avcc_payload += pps
             sample_entry_payload += box(b'avcC', avcc_payload)  # AVC Decoder Configuration Record
             sample_entry_box = box(b'avc1', sample_entry_payload)  # AVC Simple Entry
+        else:
+            assert False
+    elif stream_type == 'text':
+        if fourcc == 'TTML':
+            sample_entry_payload += b'http://www.w3.org/ns/ttml\0'  # namespace
+            sample_entry_payload += b'\0'  # schema location
+            sample_entry_payload += b'\0'  # auxiliary mime types(??)
+            sample_entry_box = box(b'stpp', sample_entry_payload)
+        else:
+            assert False
+    else:
+        assert False
     stsd_payload += sample_entry_box
 
     stbl_payload = full_box(b'stsd', 0, 0, stsd_payload)  # Sample Description Box
@@ -221,10 +248,13 @@ def real_download(self, filename, info_dict):
 
         self._prepare_and_start_frag_download(ctx)
 
+        extra_state = ctx.setdefault('extra_state', {
+            'ism_track_written': False,
+        })
+
         fragment_retries = self.params.get('fragment_retries', 0)
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
 
-        track_written = False
         frag_index = 0
         for i, segment in enumerate(segments):
             frag_index += 1
@@ -236,11 +266,11 @@ def real_download(self, filename, info_dict):
                     success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
                     if not success:
                         return False
-                    if not track_written:
+                    if not extra_state['ism_track_written']:
                         tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
                         info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
                         write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
-                        track_written = True
+                        extra_state['ism_track_written'] = True
                     self._append_fragment(ctx, frag_content)
                     break
                 except compat_urllib_error.HTTPError as err:

diff --git a/extractor/atresplayer.py b/extractor/atresplayer.py
@@ -86,18 +86,24 @@ def _real_extract(self, url):
         title = episode['titulo']
 
         formats = []
+        subtitles = {}
+        # Accumulate formats and subtitles over every source instead of overwriting them
         for source in episode.get('sources', []):
             src = source.get('src')
             if not src:
                 continue
             src_type = source.get('type')
             if src_type == 'application/vnd.apple.mpegurl':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                     src, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
+                    m3u8_id='hls', fatal=False)
+                formats.extend(m3u8_fmts)
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
             elif src_type == 'application/dash+xml':
-                formats.extend(self._extract_mpd_formats(
-                    src, video_id, mpd_id='dash', fatal=False))
+                mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
+                    src, video_id, mpd_id='dash', fatal=False)
+                formats.extend(mpd_fmts)
+                subtitles = self._merge_subtitles(subtitles, mpd_subs)
         self._sort_formats(formats)
 
         heartbeat = episode.get('heartbeat') or {}
@@ -115,4 +121,5 @@ def _real_extract(self, url):
             'channel': get_meta('channel'),
             'season': get_meta('season'),
             'episode_number': int_or_none(get_meta('episodeNumber')),
+            'subtitles': subtitles,
         }
diff --git a/extractor/byutv.py b/extractor/byutv.py
@@ -82,6 +82,7 @@ def _real_extract(self, url):
 
         info = {}
         formats = []
+        subtitles = {}
         for format_id, ep in video.items():
             if not isinstance(ep, dict):
                 continue
@@ -90,12 +91,16 @@ def _real_extract(self, url):
                 continue
             ext = determine_ext(video_url)
             if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                     video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
+                    m3u8_id='hls', fatal=False)
+                formats.extend(m3u8_fmts)
+                subtitles = self._merge_subtitles(subtitles, m3u8_subs)
             elif ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    video_url, video_id, mpd_id='dash', fatal=False))
+                mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
+                    video_url, video_id, mpd_id='dash', fatal=False)
+                formats.extend(mpd_fmts)
+                subtitles = self._merge_subtitles(subtitles, mpd_subs)
             else:
                 formats.append({
                     'url': video_url,
@@ -114,4 +119,5 @@ def _real_extract(self, url):
             'display_id': display_id,
             'title': display_id,
             'formats': formats,
+            'subtitles': subtitles,
         })