Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,49 @@
]


# Hosts we treat as YouTube. Each entry covers schemes http/https and an
# optional leading "www." / "m." so mobile and short links work too.
_YOUTUBE_HOSTS = {
"www.youtube.com",
"youtube.com",
"m.youtube.com",
"music.youtube.com",
"youtu.be",
}


def _extract_video_id(url: str) -> str | None:
"""Return the 11-character video id for any supported YouTube URL shape.

Accepts the canonical watch URL (``/watch?v=...``), the short form
(``youtu.be/<id>``), Shorts (``/shorts/<id>``), embed (``/embed/<id>``),
and live (``/live/<id>``). Returns ``None`` for anything else so the
caller can fall back to the regular HTML converter.
"""
try:
parsed = urlparse(url)
except ValueError:
return None
host = (parsed.hostname or "").lower()
if host not in _YOUTUBE_HOSTS:
return None
# youtu.be/<id>
if host == "youtu.be":
vid = parsed.path.lstrip("/").split("/", 1)[0]
return vid or None
# /watch?v=<id>
if parsed.path == "/watch":
qs = parse_qs(parsed.query)
v = qs.get("v", [""])[0]
return v or None
# /shorts/<id>, /embed/<id>, /live/<id>
for prefix in ("/shorts/", "/embed/", "/live/"):
if parsed.path.startswith(prefix):
vid = parsed.path[len(prefix):].split("/", 1)[0]
return vid or None
return None


class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""

Expand All @@ -53,8 +96,8 @@ def accepts(
url = unquote(url)
url = url.replace(r"\?", "?").replace(r"\=", "=")

if not url.startswith("https://www.youtube.com/watch?"):
# Not a YouTube URL
if _extract_video_id(url) is None:
# Not a YouTube URL we can handle
return False

if extension in ACCEPTED_FILE_EXTENSIONS:
Expand Down Expand Up @@ -147,10 +190,10 @@ def convert(
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
if "v" in params and params["v"][0]:
video_id = str(params["v"][0])
raw_url = stream_info.url or ""
raw_url = unquote(raw_url).replace(r"\?", "?").replace(r"\=", "=")
video_id = _extract_video_id(raw_url)
if video_id:
transcript_list = ytt_api.list(video_id)
languages = ["en"]
for transcript in transcript_list:
Expand Down
79 changes: 79 additions & 0 deletions packages/markitdown/tests/test_youtube_url_shapes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Unit tests for the YouTube URL-shape helper.

Covers the URL forms that used to fall through to the generic HTML
converter: ``youtu.be/<id>`` short links, ``youtube.com/shorts/<id>``,
``/embed/<id>``, ``/live/<id>``, and the mobile host ``m.youtube.com``.
"""

from markitdown.converters._youtube_converter import _extract_video_id


def test_canonical_watch_url():
    """The canonical ``/watch?v=<id>`` URL yields the value of ``v``."""
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"


def test_watch_url_with_extra_params():
    """Additional query parameters after ``v`` do not affect the id."""
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42s&feature=share"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"


def test_mobile_watch_url():
    """The mobile host ``m.youtube.com`` is handled like the desktop host."""
    url = "https://m.youtube.com/watch?v=dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"


def test_music_youtube_watch_url():
    """Watch URLs on ``music.youtube.com`` yield the ``v`` parameter."""
    url = "https://music.youtube.com/watch?v=dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"


def test_short_url():
    """A ``youtu.be/<id>`` short link yields its first path segment."""
    url = "https://youtu.be/dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"


def test_short_url_with_timestamp():
    """A ``?t=`` query string on a short link is not part of the id."""
    url = "https://youtu.be/dQw4w9WgXcQ?t=30"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"


def test_shorts_url():
    """A ``/shorts/<id>`` URL yields the segment after ``/shorts/``."""
    url = "https://www.youtube.com/shorts/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"


def test_embed_url():
    """An ``/embed/<id>`` URL yields the segment after ``/embed/``."""
    url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"


def test_live_url():
    """A ``/live/<id>`` URL yields the segment after ``/live/``."""
    url = "https://www.youtube.com/live/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"


def test_non_youtube_url_returns_none():
    """A URL on a non-YouTube host produces no id."""
    url = "https://vimeo.com/12345"
    assert _extract_video_id(url) is None


def test_watch_without_v_param_returns_none():
    """A ``/watch`` URL with no ``v`` query parameter produces no id."""
    url = "https://www.youtube.com/watch"
    assert _extract_video_id(url) is None


def test_channel_url_returns_none():
    """A channel page is not a supported video URL shape."""
    url = "https://www.youtube.com/channel/UC123"
    assert _extract_video_id(url) is None


def test_empty_string_returns_none():
    """An empty string produces no id."""
    url = ""
    assert _extract_video_id(url) is None


def test_unknown_host_returns_none():
    """A watch-shaped URL on an unrelated host produces no id."""
    url = "https://example.com/watch?v=dQw4w9WgXcQ"
    assert _extract_video_id(url) is None