# Extracting and Processing Thai Subtitles from a YouTube Video

### References:

- [yt-dlp on GitHub](https://github.com/yt-dlp/yt-dlp)

## Setting things up

In [1]:
import yt_dlp
import json
import re

# Prefix that a video id is appended to in order to build the watch URL.
YT_BASE_URL = 'https://www.youtube.com/watch?v='
# Folder passed as the output location for downloaded files.
DST_FOLDER = 'files'

In [15]:
def download_auto_subtitles(video_url, lang='th', output_path='files', skip_download=True):
    """Fetch subtitles for a video with yt-dlp, optionally with the video itself.

    Both uploaded and auto-generated subtitles are requested for ``lang``.

    Args:
        video_url: URL of the video to process.
        lang: Subtitle language code to request.
        output_path: Directory the output files are written to. It is treated
            as a literal path: any ``%`` in it is escaped so that yt-dlp does
            not interpret it as output-template syntax.
        skip_download: Passed to yt-dlp's ``skip_download`` option; set it to
            False to download the video file as well.

    Returns:
        The metadata object returned by ``YoutubeDL.extract_info``.
    """
    # yt-dlp output templates use `%` to introduce fields, so a literal
    # percent sign in the directory name has to be written as `%%`.
    safe_output_path = str(output_path).replace('%', '%%')

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,  # Enable auto-generated subtitles
        'subtitleslangs': [lang],
        'skip_download': skip_download,
        'outtmpl': f'{safe_output_path}/%(title)s_%(id)s.%(ext)s',
        'format': 'best'
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download=True is required even when skip_download is set, otherwise
        # the subtitle files are not written.
        info = ydl.extract_info(video_url, download=True)

    return info


## Extracting subtitles and optionally downloading the video.

In [5]:
# Id of the YouTube video to process (appended to YT_BASE_URL to form the URL).
v_id = 'dVXQHDuTPxM'

In [16]:
# Request the Thai subtitles and, since skip_download is False, the video too.
info = download_auto_subtitles(
    YT_BASE_URL + v_id,
    lang='th',
    output_path=DST_FOLDER,
    skip_download=False,
)

[youtube] Extracting URL: https://www.youtube.com/watch?v=dVXQHDuTPxM
[youtube] dVXQHDuTPxM: Downloading webpage
[youtube] dVXQHDuTPxM: Downloading tv client config
[youtube] dVXQHDuTPxM: Downloading player 5ae7d525
[youtube] dVXQHDuTPxM: Downloading tv player API JSON
[youtube] dVXQHDuTPxM: Downloading ios player API JSON
[youtube] dVXQHDuTPxM: Downloading m3u8 information
[info] dVXQHDuTPxM: Downloading subtitles: th
[info] dVXQHDuTPxM: Downloading 1 format(s): 18
Deleting existing file files\โซเมีย, ไท-ไต ใน ＂นิธิ เอียวศรีวงศ์＂_dVXQHDuTPxM.th.vtt
[info] Writing video subtitles to: files\โซเมีย, ไท-ไต ใน ＂นิธิ เอียวศรีวงศ์＂_dVXQHDuTPxM.th.vtt
[download] Destination: files\โซเมีย, ไท-ไต ใน ＂นิธิ เอียวศรีวงศ์＂_dVXQHDuTPxM.th.vtt
[download] 100% of  503.99KiB in 00:00:00 at 516.49KiB/s
[download] files\โซเมีย, ไท-ไต ใน ＂นิธิ เอียวศรีวงศ์＂_dVXQHDuTPxM.mp4 has already been downloaded
[download] 100% of  146.17MiB


In [17]:
# Path of the Thai ('th') subtitle file, read from the returned metadata.
subtitle_path = info['requested_subtitles']['th']['filepath']
subtitle_path

'files\\โซเมีย, ไท-ไต ใน ＂นิธิ เอียวศรีวงศ์＂_dVXQHDuTPxM.th.vtt'

In [7]:
# Display the full metadata dict for inspection (the output is very long).
info

{'id': 'dVXQHDuTPxM',
 'title': 'โซเมีย, ไท-ไต ใน "นิธิ เอียวศรีวงศ์"',
 'formats': [{'format_id': 'sb2',
   'format_note': 'storyboard',
   'ext': 'mhtml',
   'protocol': 'mhtml',
   'acodec': 'none',
   'vcodec': 'none',
   'url': 'https://i.ytimg.com/sb/dVXQHDuTPxM/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBgiH4Jy9Bg==&sigh=rs$AOn4CLCSRcgRghXFzRlsIc5jsstDChy60A',
   'width': 48,
   'height': 27,
   'fps': 0.03852080123266564,
   'rows': 10,
   'columns': 10,
   'fragments': [{'url': 'https://i.ytimg.com/sb/dVXQHDuTPxM/storyboard3_L0/default.jpg?sqp=-oaymwENSDfyq4qpAwVwAcABBqLzl_8DBgiH4Jy9Bg==&sigh=rs$AOn4CLCSRcgRghXFzRlsIc5jsstDChy60A',
     'duration': 2596.0}],
   'audio_ext': 'none',
   'video_ext': 'none',
   'vbr': 0,
   'abr': 0,
   'tbr': None,
   'resolution': '48x27',
   'aspect_ratio': 1.78,
   'filesize_approx': None,
   'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55