merge #4444: [2ch] add 'thread' and 'board' extractors (#1009, #3540)

mikf · Jan 15, 2024 · 59cf4b3 · 59cf4b3
2 parents 90b3823 + 6819658
commit 59cf4b3
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 0 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
 </tr>
 </thead>
 <tbody valign="top">
+<tr>
+    <td>2ch</td>
+    <td>https://2ch.hk/</td>
+    <td>Boards, Threads</td>
+    <td></td>
+</tr>
 <tr>
     <td>2chen</td>
     <td>https://sturdychan.help/</td>

diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _2chThreadExtractor(Extractor):
+    """Extractor for the files attached to posts of a 2ch thread"""
+    category = "2ch"
+    subcategory = "thread"
+    root = "https://2ch.hk"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{tim}{filename:? //}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.hk/a/res/12345.html"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        # the two pattern groups: board name and thread number
+        board, thread = match.groups()
+        self.board = board
+        self.thread = thread
+
+    def items(self):
+        """Yield a Directory message, then one Url message per file"""
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        data = self.request(url).json()
+        posts = data["threads"][0]["posts"]
+
+        # title: subject of the first post or, when that is missing or
+        # empty, its comment with HTML removed
+        first = posts[0]
+        title = first.get("subject")
+        if not title:
+            title = text.remove_html(first["comment"])
+
+        thread = {
+            "board" : self.board,
+            "thread": self.thread,
+            "title" : text.unescape(title)[:50],
+        }
+        yield Message.Directory, thread
+
+        for post in posts:
+            files = post.get("files")
+            if not files:
+                # nothing attached to this post
+                continue
+
+            # file entries carry their own "name" key, so the post's
+            # "name" is stored as "post_name" before both get merged
+            post["post_name"] = post.pop("name")
+            post["date"] = text.parse_timestamp(post["timestamp"])
+            del post["files"]
+
+            for info in files:
+                # thread metadata first, post metadata on top of it
+                info.update(thread)
+                info.update(post)
+
+                stem, _, _ = info["fullname"].rpartition(".")
+                info["filename"] = stem
+
+                tim, _, ext = info["name"].rpartition(".")
+                info["tim"] = tim
+                info["extension"] = ext
+
+                yield Message.Url, self.root + info["path"], info
+
+
+class _2chBoardExtractor(Extractor):
+    """Extractor for the threads listed on a 2ch board"""
+    category = "2ch"
+    subcategory = "board"
+    root = "https://2ch.hk"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+    example = "https://2ch.hk/a/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def items(self):
+        """Queue the threads of the index page and of pages 1..n"""
+        base = "{}/{}".format(self.root, self.board)
+
+        # index page
+        index = self.request(base + "/index.json").json()
+        yield from self._queue_threads(base, index)
+
+        # pages 1..n
+        for num in util.advance(index["pages"], 1):
+            page_url = "{}/{}.json".format(base, num)
+            yield from self._queue_threads(
+                base, self.request(page_url).json())
+
+    def _queue_threads(self, base, listing):
+        """Yield a Queue message for every thread in a JSON listing"""
+        # the listing dict itself is sent along with each queued URL
+        listing["_extractor"] = _2chThreadExtractor
+        for entry in listing["threads"]:
+            thread_url = "{}/res/{}.html".format(base, entry["thread_num"])
+            yield Message.Queue, thread_url, listing
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@
 import re
 
 modules = [
+    "2ch",
     "2chan",
     "2chen",
     "35photo",

diff --git a/test/results/2ch.py b/test/results/2ch.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+# "2ch" starts with a digit and is therefore not a valid Python identifier:
+# the module can neither be named in an 'import' statement nor reached with
+# attribute syntax. __import__() with a dotted name returns the top-level
+# package, from which the submodule is fetched by its string name.
+gallery_dl = __import__("gallery_dl.extractor.2ch")
+_2ch = getattr(gallery_dl.extractor, "2ch")
+
+
+# Test cases for the 2ch extractors.
+# NOTE(review): keys starting with "#" look like directives for the test
+# runner and the remaining keys like expected metadata values/types;
+# the runner is not part of this file - confirm against it.
+__tests__ = (
+# thread: https://2ch.hk/a/res/6202876.html
+{
+    "#url"     : "https://2ch.hk/a/res/6202876.html",
+    "#category": ("", "2ch", "thread"),
+    "#class"   : _2ch._2chThreadExtractor,
+    "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+",
+    "#count"   : range(450, 1000),
+
+    "banned"   : 0,
+    "board"    : "a",
+    "closed"   : 0,
+    "comment"  : str,
+    "date"     : "type:datetime",
+    "displayname": str,
+    "email"    : "",
+    "endless"  : 1,
+    "extension": str,
+    "filename" : str,
+    "fullname" : str,
+    "height"   : int,
+    # NOTE(review): fixed literal; presumably tied to the thread's state
+    # at the time the test was written - verify it is still current
+    "lasthit"  : 1705273977,
+    "md5"      : r"re:[0-9a-f]{32}",
+    "name"     : r"re:\d+\.\w+",
+    "num"      : int,
+    "number"   : range(1, 1000),
+    "op"       : 0,
+    "parent"   : int,
+    "path"     : r"re:/a/src/6202876/\d+\.\w+",
+    "post_name": "Аноним",
+    "size"     : int,
+    "sticky"   : 0,
+    "subject"  : str,
+    "thread"   : "6202876",
+    "thumbnail": str,
+    "tim"      : r"re:\d+",
+    "timestamp": int,
+    "title"    : "MP4/WEBM",
+    "tn_height": int,
+    "tn_width" : int,
+    "trip"     : "",
+    "type"     : int,
+    "views"    : int,
+    "width"    : int,
+},
+
+# board: https://2ch.hk/a/ - "#pattern" reuses the thread extractor's
+# own URL pattern
+{
+    "#url"     : "https://2ch.hk/a/",
+    "#category": ("", "2ch", "board"),
+    "#class"   : _2ch._2chBoardExtractor,
+    "#pattern" : _2ch._2chThreadExtractor.pattern,
+    "#count"   : range(200, 300),
+},
+
+)