[chevereto] add generic extractors (#4664)

- support jpgfish
- support pixl.li / pixl.is (#3179, #4357)
mikf · Oct 16, 2023 · 2911ed1 · 2911ed1
1 parent ade8347
commit 2911ed1
Show file tree

Hide file tree

Showing 6 changed files with 234 additions and 155 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -427,12 +427,6 @@ Consider all sites to be NSFW unless otherwise known.
     <td>Games</td>
     <td></td>
 </tr>
-<tr>
-    <td>JPG Fish</td>
-    <td>https://jpg1.su/</td>
-    <td>Albums, individual Images, User Profiles</td>
-    <td></td>
-</tr>
 <tr>
     <td>Keenspot</td>
     <td>http://www.keenspot.com/</td>
@@ -998,6 +992,22 @@ Consider all sites to be NSFW unless otherwise known.
     <td></td>
 </tr>
 
+<tr>
+    <td colspan="4"><strong>Chevereto Instances</strong></td>
+</tr>
+<tr>
+    <td>JPG Fish</td>
+    <td>https://jpg2.su/</td>
+    <td>Albums, individual Images, User Profiles</td>
+    <td></td>
+</tr>
+<tr>
+    <td>Pixl</td>
+    <td>https://pixl.li/</td>
+    <td>Albums, individual Images, User Profiles</td>
+    <td></td>
+</tr>
+
 <tr>
     <td colspan="4"><strong>Danbooru Instances</strong></td>
 </tr>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -28,6 +28,7 @@
     "blogger",
     "bunkr",
     "catbox",
+    "chevereto",
     "comicvine",
     "cyberdrop",
     "danbooru",
@@ -73,7 +74,6 @@
     "issuu",
     "itaku",
     "itchio",
-    "jpgfish",
     "jschan",
     "kabeuchi",
     "keenspot",

diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Chevereto galleries"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class CheveretoExtractor(BaseExtractor):
+    """Shared base for all extractors targeting Chevereto-based sites"""
+    basecategory = "chevereto"
+    directory_fmt = ("{category}", "{user}", "{album}")
+    archive_fmt = "{id}"
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        # 'lastindex' is the last capture group that took part in the
+        # match; the subclasses' patterns put the URL path there
+        self.path = match.group(match.lastindex)
+
+    def _pagination(self, url):
+        """Yield the first link of each list item, following 'next' links
+
+        Starts at 'url' and keeps requesting pages until no further
+        'next' link can be extracted from the current page.
+        """
+        next_url = url
+        while next_url:
+            html = self.request(next_url).text
+
+            entries = text.extract_iter(
+                html, '<div class="list-item-image ', 'image-container')
+            for entry in entries:
+                yield text.extr(entry, '<a href="', '"')
+
+            # a falsy result (no 'next' link found) terminates the loop
+            next_url = text.extr(
+                html, '<a data-pagination="next" href="', '" ><')
+
+
+# Known Chevereto instances. Each entry supplies the site's root URL and a
+# regex matching its domain name(s). The value returned by update() is used
+# as the common prefix of every extractor 'pattern' defined in this module.
+# NOTE(review): update() is inherited from BaseExtractor and not visible
+# here -- presumably it registers these instances on the class; confirm.
+BASE_PATTERN = CheveretoExtractor.update({
+    "jpgfish": {
+        "root": "https://jpg2.su",
+        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
+    },
+    "pixl": {
+        "root": "https://pixl.li",
+        "pattern": r"pixl\.(?:li|is)",
+    },
+})
+
+
+class CheveretoImageExtractor(CheveretoExtractor):
+    """Extractor for chevereto Images"""
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
+    example = "https://jpg2.su/img/TITLE.ID"
+
+    def items(self):
+        """Yield one Directory and one Url message for a single image page
+
+        The metadata dict contains 'id', 'url', 'album' and 'user', plus
+        whatever text.nameext_from_url() derives from the image URL.
+        """
+        url = self.root + self.path
+        extr = text.extract_from(self.request(url).text)
+
+        # Derive the ID from the final path segment only. For slugs
+        # without a "." (e.g. "/image/ID"), rpartition(".") on the whole
+        # path would return "/image/ID" and leak slashes into "{id}".
+        slug = self.path.rpartition("/")[2]
+
+        # NOTE(review): the extr() calls are kept in their original order;
+        # presumably each call resumes searching after the previous hit.
+        image = {
+            "id"   : slug.rpartition(".")[2],
+            "url"  : extr('<meta property="og:image" content="', '"'),
+            "album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
+            "user" : extr('username: "', '"'),
+        }
+
+        text.nameext_from_url(image["url"], image)
+        yield Message.Directory, image
+        yield Message.Url, image["url"], image
+
+
+class CheveretoAlbumExtractor(CheveretoExtractor):
+    """Extractor for chevereto Albums"""
+    subcategory = "album"
+    pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
+    example = "https://jpg2.su/album/TITLE.ID"
+
+    def items(self):
+        """Queue every image link found in the album(s) for this URL
+
+        For a path ending in '/sub', the links collected from that page
+        are themselves treated as album URLs and paginated in turn;
+        otherwise only the given album URL is paginated.
+        """
+        start = self.root + self.path
+        queue_data = {"_extractor": CheveretoImageExtractor}
+
+        sources = (
+            self._pagination(start) if self.path.endswith("/sub")
+            else (start,)
+        )
+
+        for source in sources:
+            for image_url in self._pagination(source):
+                yield Message.Queue, image_url, queue_data
+
+
+class CheveretoUserExtractor(CheveretoExtractor):
+    """Extractor for chevereto Users"""
+    subcategory = "user"
+    # Reject only the reserved first path segments ("img", "image", "a",
+    # "album"), i.e. when they are directly followed by "/". Without the
+    # trailing "/" the lookahead also rejects every username that merely
+    # starts with "a" or "img", such as "/alice".
+    pattern = (BASE_PATTERN +
+               r"(/(?!(?:img|image|a(?:lbum)?)/)[^/?#]+(?:/albums)?)")
+    example = "https://jpg2.su/USER"
+
+    def items(self):
+        """Queue all links listed on a user's profile
+
+        Links from a path ending in '/albums' are handed to the album
+        extractor, all others to the image extractor.
+        """
+        url = self.root + self.path
+
+        if self.path.endswith("/albums"):
+            data = {"_extractor": CheveretoAlbumExtractor}
+        else:
+            data = {"_extractor": CheveretoImageExtractor}
+
+        # separate loop name so the start URL is not shadowed
+        for item_url in self._pagination(url):
+            yield Message.Queue, item_url, data
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py