diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53c8833509a..1616a4a620d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -337,6 +337,12 @@ Consider all listed sites to potentially be NSFW. Categories, Creators, Posts, Search Results + + Hreads + https://Hreads.net/ + Chapters, Manga + + Idol Complex https://idol.sankakucomplex.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8e7129618af..e00a2757bdd 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -65,6 +65,7 @@ "hiperdex", "hitomi", "hotleak", + "hreads", "idolcomplex", "imagebam", "imagechest", diff --git a/gallery_dl/extractor/hreads.py b/gallery_dl/extractor/hreads.py new file mode 100644 index 00000000000..e08191ebc37 --- /dev/null +++ b/gallery_dl/extractor/hreads.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://Hreads.net/""" + +from .common import MangaExtractor, ChapterExtractor +from .. import text, util +import re + + +BASE_PATTERN = r"(?:https?://)?hreads\.net" + + +class HreadsBase: + """Base class for Hreads extractors""" + + category = "hreads" + root = "https://hreads.net/" + + def get_title(self, page): + title = text.extr(page, "", "") + title = text.unescape(title).strip() + match = re.search( + r"(?:Read )?([\w\d ]+?)(?:(?: - Chapter [\d.]+)? - Hreads)", + title, + ) + if match: + title = match.group(1) + return title + + +class HreadsChapterExtractor(HreadsBase, ChapterExtractor): + """Extractor for manga chapters from Hreads.net""" + + subcategory = "chapter" + directory_fmt = ("{category}", "{manga}", "Chapter-{chapter:03}{chapter_minor}") + archive_fmt = "{chapter:03}{chapter_minor}_{page}" + pattern = BASE_PATTERN + r"/comic/([\w\d-]+)\/chapter-(\d+-[\d+]|\d+)/?" + example = "https://hreads.net/comics/sex-stopwatch/chapter-1/" + + def __init__(self, match): + url = match.group(0) + self.gid, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + chapter, sep, minor = self.chapter.partition("-") + + data = { + "manga": self.get_title(page), + "manga_id": self.gid, + "chapter": text.parse_int(chapter), + "chapter_id": f"{self.gid}-chapter-{self.chapter}", + "chapter_minor": sep + minor, + } + return data + + def images(self, page): + images = [] + first_img = text.extract(page, 'src="https://cdn.hreads.net', '"') + images.append((f"https://cdn.hreads.net{first_img[0].strip()}", None)) + + for url in text.extract_iter(page, 'data-src="', '"'): + images.append((url.strip(), None)) + return images + + +class HreadsMangaExtractor(HreadsBase, MangaExtractor): + """Extractor for manga from Hreads.net""" + + subcategory = "manga" + chapterclass = HreadsChapterExtractor + pattern = BASE_PATTERN + r"/comic/([\w\d-]+)/?$" + example = "https://hreads.net/comics/sex-stopwatch" + + def __init__(self, match): + url, self.gid = match.group(0), match.group(1) + MangaExtractor.__init__(self, match, url) + + def chapters(self, page): + chapters = set() + print("Chapter extraction") + for chapter_id in text.extract_iter( + page, f'