In [None]:
import re
import os
import zipfile
import httpx
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import pillow_avif
import tempfile
# from tqdm import tqdm
from tqdm.notebook import tqdm

# Where downloads are stored and which series list page is the starting point.
dir_output = "downloads"
url_series = "https://www.webtoons.com/en/action/omniscient-reader/list?title_no=2154"
os.makedirs(dir_output, exist_ok=True)

# Header values that mimic desktop Chrome 135 on Linux.
header_user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
header_ua_platform = '"Linux"'
header_ua_mobile = "?0"
header_ua = '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"'

# Assemble the header mapping sent with every request (client hints first,
# then the user agent).
headers = {}
headers["sec-ch-ua"] = header_ua
headers["sec-ch-ua-mobile"] = header_ua_mobile
headers["sec-ch-ua-platform"] = header_ua_platform
headers["user-agent"] = header_user_agent

def httpx_get_soup(url, headers) -> BeautifulSoup:
    """
    Fetch ``url`` with a GET request and return the body parsed as HTML.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP headers sent with the request.

    Returns:
        A BeautifulSoup tree built from the response text with the
        ``html.parser`` backend.

    Raises:
        SystemExit: With status 1 when the response status code is not 200.
            The failing URL and status code are printed first.
    """
    with httpx.Client() as client:
        response = client.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve the page: {url}\nStatus code: {response.status_code}")
            # Raise explicitly instead of calling the interactive ``exit``
            # helper: that helper may be replaced by the hosting shell and
            # then not stop execution, in which case the previous code fell
            # through to ``return None`` and callers crashed later on None.
            raise SystemExit(1)
        return BeautifulSoup(response.text, 'html.parser')

def get_episodes(soup, regex) -> dict:
    """
    Map episode numbers to the episode links found in a parsed page.

    Args:
        soup: Parsed page. It must support ``find_all('a', href=<pattern>)``
            and yield items that can be indexed with ``['href']``.
        regex: Regular expression selecting which ``href`` values to consider.

    Returns:
        A dict mapping the integer taken from ``episode_no=<digits>`` in each
        link to the link itself. When several links share a number, the last
        one found wins. Links that match ``regex`` but contain no
        ``episode_no=<digits>`` are skipped.
    """
    episode_no_pattern = re.compile(r'episode_no=(\d+)')
    episodes = {}
    for anchor in soup.find_all('a', href=re.compile(regex)):
        link = anchor['href']
        match = episode_no_pattern.search(link)
        if match is None:
            # Nothing to index this link by; previously this raised
            # AttributeError on ``None.group``.
            continue
        episodes[int(match.group(1))] = link
    return episodes

def download_cbz(url, path_cbz, img_format="avif"):
    """
    Download the content images of one episode page and pack them into a CBZ.

    The page is fetched through ``httpx_get_soup``; every ``img`` tag with
    class ``_images`` and a matching ``data-url`` is downloaded, re-encoded
    and stored in a zip archive at ``path_cbz``.

    Args:
        url: Episode viewer URL. It is also sent as the ``referer`` header of
            every image request.
        path_cbz: Destination path of the CBZ (zip) archive.
        img_format: ``"avif"`` (default) stores pages as AVIF. Any other
            value stores them as JPEG and is used as the file extension.

    Raises:
        SystemExit: With status 1 when an image request returns a status
            code other than 200. Nothing is written to ``path_cbz`` then.
    """
    # Step 1: Fetch the page content as soup
    soup = httpx_get_soup(url, headers)
    # The episode number only labels the progress bar, so a URL without one
    # is not treated as an error.
    episode_match = re.search(r'episode_no=(\d+)', url)
    episode_no = episode_match.group(1) if episode_match else "?"
    # Step 2: Find all image links of content images
    regex = r'https://webtoon-phinf\.pstatic\.net.*\.(jpg|jpeg|png).*'
    image_tags = soup.find_all('img', attrs={'data-url': re.compile(regex), 'class': '_images'})
    # Step 3: Download images
    # Use a copy of the shared headers: writing the referer into the
    # module-level dict would leak this episode's URL into later requests.
    image_headers = {**headers, 'referer': url}
    # Three digits as before, more only when needed so that sorted() keeps
    # the pages in download order.
    width = max(3, len(str(len(image_tags))))
    with tempfile.TemporaryDirectory() as temp_dir:
        with httpx.Client() as client:
            progress = tqdm(image_tags, desc=f"Downloading Episode: {episode_no}", unit="image", leave=False)
            for page_no, image_tag in enumerate(progress, start=1):
                response = client.get(image_tag['data-url'], headers=image_headers)
                if response.status_code != 200:
                    print(f"Failed to download image: {response.status_code}")
                    # Abort explicitly; the interactive ``exit`` helper may
                    # not stop execution, which would yield an archive with
                    # missing pages.
                    raise SystemExit(1)
                image = Image.open(BytesIO(response.content))
                file_name = f"{str(page_no).zfill(width)}.{img_format}"
                target = os.path.join(temp_dir, file_name)
                if img_format == "avif":
                    image.save(target, format="AVIF")
                else:
                    # JPEG cannot store alpha or palette data, which PNG
                    # sources may carry; drop it before saving.
                    if image.mode in ("RGBA", "LA", "P", "PA"):
                        image = image.convert("RGB")
                    image.save(target, format="JPEG")

        # Step 4: Create new CBZ with downloaded images
        # Build the archive under a temporary name and rename it at the end,
        # so an interrupted run never leaves a truncated file at path_cbz
        # that could be mistaken for a complete archive.
        path_partial = f"{path_cbz}.part"
        with zipfile.ZipFile(path_partial, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for filename in sorted(os.listdir(temp_dir)):
                zipf.write(os.path.join(temp_dir, filename), arcname=filename)
        os.replace(path_partial, path_cbz)

# Step 1: Fetch the series page as soup, then extract the title and the first episode URL
soup = httpx_get_soup(url_series, headers)
series_title = soup.find('title').text.strip()
# The title is used as a folder and file name below, so keep path separators
# out of it (they would silently create nested folders).
for separator in filter(None, (os.sep, os.altsep)):
    series_title = series_title.replace(separator, "_")
episodes = get_episodes(soup, r'.+&episode_no=\d+')
if not episodes:
    # min() on an empty mapping would raise an opaque ValueError; say what happened.
    raise ValueError(f"No episode links found on the series page: {url_series}")
first_episode_no = min(episodes)

# Step 2: Get the list of episode URLs from the first episode's viewer page
url = episodes[first_episode_no]
soup = httpx_get_soup(url, headers)
episodes = get_episodes(soup, r'https://www\.webtoons\.com.+viewer\?title_no=\d+')
# Keys are unique, so sorting the (number, link) pairs orders by episode number.
episodes = dict(sorted(episodes.items()))

# Step 3: Download each episode as a CBZ file
dir_output = os.path.join(dir_output, series_title)
os.makedirs(dir_output, exist_ok=True)
for episode_no, url in tqdm(episodes.items(), desc="Processing Episodes", unit="url"):
    path_cbz = os.path.join(dir_output, f"{series_title}, Episode {episode_no}.cbz")
    # Episodes that already have an archive are not downloaded again.
    if os.path.exists(path_cbz):
        continue
    download_cbz(url, path_cbz, img_format="avif")

Processing Episodes:   0%|          | 0/252 [00:00<?, ?url/s]

Downloading Episode: 8:   0%|          | 0/134 [00:00<?, ?image/s]

Downloading Episode: 9:   0%|          | 0/121 [00:00<?, ?image/s]

Downloading Episode: 10:   0%|          | 0/95 [00:00<?, ?image/s]

Downloading Episode: 11:   0%|          | 0/154 [00:00<?, ?image/s]

Downloading Episode: 12:   0%|          | 0/127 [00:00<?, ?image/s]

Downloading Episode: 13:   0%|          | 0/110 [00:00<?, ?image/s]

Downloading Episode: 14:   0%|          | 0/102 [00:00<?, ?image/s]

Downloading Episode: 15:   0%|          | 0/123 [00:00<?, ?image/s]

Downloading Episode: 16:   0%|          | 0/155 [00:00<?, ?image/s]

Downloading Episode: 17:   0%|          | 0/160 [00:00<?, ?image/s]

Downloading Episode: 18:   0%|          | 0/128 [00:00<?, ?image/s]

Downloading Episode: 19:   0%|          | 0/167 [00:00<?, ?image/s]

Downloading Episode: 20:   0%|          | 0/146 [00:00<?, ?image/s]

Downloading Episode: 21:   0%|          | 0/144 [00:00<?, ?image/s]

Downloading Episode: 22:   0%|          | 0/146 [00:00<?, ?image/s]

Downloading Episode: 23:   0%|          | 0/125 [00:00<?, ?image/s]

Downloading Episode: 24:   0%|          | 0/132 [00:00<?, ?image/s]

Downloading Episode: 25:   0%|          | 0/146 [00:00<?, ?image/s]

Downloading Episode: 26:   0%|          | 0/131 [00:00<?, ?image/s]

Development

https://www.webtoons.com/en/action/omniscient-reader/list?title_no=2154

https://www.webtoons.com/en/action/omniscient-reader/episode-0-prologue/viewer?title_no=2154&episode_no=1

In [None]:
import re
import os
import zipfile
import httpx
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

# Reference URLs for this series:
#   list page: https://www.webtoons.com/en/action/omniscient-reader/list?title_no=2154
#   viewer   : https://www.webtoons.com/en/action/omniscient-reader/episode-0-prologue/viewer?title_no=2154&episode_no=1

# Start from the series list page; episode_no selects the viewer page used later.
url_series = "https://www.webtoons.com/en/action/omniscient-reader/list?title_no=2154"
url = url_series
episode_no = 1

# Header values that mimic desktop Chrome 135 on Linux.
header_user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
header_ua_platform = '"Linux"'
header_ua_mobile = "?0"
header_ua = '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"'

headers = {}
headers["sec-ch-ua"] = header_ua
headers["sec-ch-ua-mobile"] = header_ua_mobile
headers["sec-ch-ua-platform"] = header_ua_platform
headers["user-agent"] = header_user_agent

# Notes on earlier experiments (no longer executed):
# - the headers once also carried a fixed "referer" pointing at the viewer
#   URL listed above;
# - the viewer URL was once built by hand from the list URL:
#     title_no = re.search(r'title_no=(\d+)', url_series).group(1)
#     url_chapter = url_series.split('/list')[0] + '/x/viewer?title_no=' + title_no + '&episode_no=' + str(episode_no)
#   and fetched with requests.get(url_chapter) followed by raise_for_status().

def httpx_get_soup(url, headers) -> BeautifulSoup:
    """
    Fetch ``url`` with a GET request and return the body parsed as HTML.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP headers sent with the request.

    Returns:
        A BeautifulSoup tree built from the response text with the
        ``html.parser`` backend.

    Raises:
        SystemExit: With status 1 when the response status code is not 200.
            The failing URL and status code are printed first.
    """
    with httpx.Client() as client:
        response = client.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve the page: {url}\nStatus code: {response.status_code}")
            # Raise explicitly instead of calling the interactive ``exit``
            # helper: that helper may be replaced by the hosting shell and
            # then not stop execution, in which case the previous code fell
            # through to ``return None`` and callers crashed later on None.
            raise SystemExit(1)
        return BeautifulSoup(response.text, 'html.parser')

def get_episodes(soup, regex) -> dict:
    """
    Map episode numbers to the episode links found in a parsed page.

    Args:
        soup: Parsed page. It must support ``find_all('a', href=<pattern>)``
            and yield items that can be indexed with ``['href']``.
        regex: Regular expression selecting which ``href`` values to consider.

    Returns:
        A dict mapping the integer taken from ``episode_no=<digits>`` in each
        link to the link itself. When several links share a number, the last
        one found wins. Links that match ``regex`` but contain no
        ``episode_no=<digits>`` are skipped.
    """
    episode_no_pattern = re.compile(r'episode_no=(\d+)')
    episodes = {}
    for anchor in soup.find_all('a', href=re.compile(regex)):
        link = anchor['href']
        match = episode_no_pattern.search(link)
        if match is None:
            # Nothing to index this link by; previously this raised
            # AttributeError on ``None.group``.
            continue
        episodes[int(match.group(1))] = link
    return episodes

# Collect the episode links shown on the page currently held in ``url``.
soup = httpx_get_soup(url, headers)
episodes = get_episodes(soup, r'.+&episode_no=\d+')
if not episodes:
    # min() on an empty mapping would raise an opaque ValueError; say what happened.
    raise ValueError(f"No episode links found on the page: {url}")
first_episode_no = min(episodes.keys())

# Open the viewer page of the configured episode; fall back to the lowest
# numbered one when that episode is not among the links found above
# (previously this raised KeyError and first_episode_no went unused).
url = episodes.get(episode_no, episodes[first_episode_no])
soup = httpx_get_soup(url, headers)
episodes = get_episodes(soup, r'https://www\.webtoons\.com.+viewer\?title_no=\d+')
episodes = dict(sorted(episodes.items(), key=lambda item: item[0]))

for episode_no, link in episodes.items():
    print(f"Episode {episode_no}: {link}")

# Sample href fragment (presumably copied from the viewer page source):
# ="https://www.webtoons.com/en/action/omniscient-reader/episode-189/viewer?title_no=2154
# Debug aid: print(soup.prettify())

Episode 1: https://www.webtoons.com/en/action/omniscient-reader/episode-0-prologue/viewer?title_no=2154&episode_no=1
Episode 2: https://www.webtoons.com/en/action/omniscient-reader/episode-1/viewer?title_no=2154&episode_no=2
Episode 3: https://www.webtoons.com/en/action/omniscient-reader/episode-2/viewer?title_no=2154&episode_no=3
Episode 4: https://www.webtoons.com/en/action/omniscient-reader/episode-3/viewer?title_no=2154&episode_no=4
Episode 5: https://www.webtoons.com/en/action/omniscient-reader/episode-4/viewer?title_no=2154&episode_no=5
Episode 6: https://www.webtoons.com/en/action/omniscient-reader/episode-5/viewer?title_no=2154&episode_no=6
Episode 7: https://www.webtoons.com/en/action/omniscient-reader/episode-6/viewer?title_no=2154&episode_no=7
Episode 8: https://www.webtoons.com/en/action/omniscient-reader/episode-7/viewer?title_no=2154&episode_no=8
Episode 9: https://www.webtoons.com/en/action/omniscient-reader/episode-8/viewer?title_no=2154&episode_no=9
Episode 10: https:/

In [None]:
import pillow_avif
import tempfile
# from tqdm import tqdm
from tqdm.notebook import tqdm

# Single-episode trial run: the page to fetch ...
url_chapter = "https://www.webtoons.com/en/action/omniscient-reader/episode-0-prologue/viewer?title_no=2154&episode_no=1"
url = url_chapter

# ... and where its archive is stored.
folder = "data"
path_cbz = os.path.join(folder, "omniscient_reader.cbz")
os.makedirs(folder, exist_ok=True)

def download_cbz(url, path_cbz, img_format="avif"):
    """
    Download the content images of one episode page and pack them into a CBZ.

    The page is fetched through ``httpx_get_soup``; every ``img`` tag with
    class ``_images`` and a matching ``data-url`` is downloaded, re-encoded
    and stored in a zip archive at ``path_cbz``.

    Args:
        url: Episode viewer URL. It is also sent as the ``referer`` header of
            every image request.
        path_cbz: Destination path of the CBZ (zip) archive.
        img_format: ``"avif"`` (default) stores pages as AVIF. Any other
            value stores them as JPEG and is used as the file extension.

    Raises:
        SystemExit: With status 1 when an image request returns a status
            code other than 200. Nothing is written to ``path_cbz`` then.
    """
    # Step 1: Fetch the page content as soup
    soup = httpx_get_soup(url, headers)
    # Step 2: Find all image links of content images
    regex = r'https://webtoon-phinf\.pstatic\.net.*\.(jpg|jpeg|png).*'
    image_tags = soup.find_all('img', attrs={'data-url': re.compile(regex), 'class': '_images'})
    # Step 3: Download images
    # Use a copy of the shared headers: writing the referer into the
    # module-level dict would leak this episode's URL into later requests.
    image_headers = {**headers, 'referer': url}
    # Three digits as before, more only when needed so that sorted() keeps
    # the pages in download order.
    width = max(3, len(str(len(image_tags))))
    with tempfile.TemporaryDirectory() as temp_dir:
        with httpx.Client() as client:
            progress = tqdm(image_tags, desc="Downloading images", unit="image")
            for page_no, image_tag in enumerate(progress, start=1):
                response = client.get(image_tag['data-url'], headers=image_headers)
                if response.status_code != 200:
                    print(f"Failed to download image: {response.status_code}")
                    # Abort explicitly; the interactive ``exit`` helper may
                    # not stop execution, which would yield an archive with
                    # missing pages.
                    raise SystemExit(1)
                image = Image.open(BytesIO(response.content))
                file_name = f"{str(page_no).zfill(width)}.{img_format}"
                target = os.path.join(temp_dir, file_name)
                if img_format == "avif":
                    image.save(target, format="AVIF")
                else:
                    # JPEG cannot store alpha or palette data, which PNG
                    # sources may carry; drop it before saving.
                    if image.mode in ("RGBA", "LA", "P", "PA"):
                        image = image.convert("RGB")
                    image.save(target, format="JPEG")
        # Step 4: Create new CBZ with downloaded images
        # Build the archive under a temporary name and rename it at the end,
        # so an interrupted run never leaves a truncated file at path_cbz
        # that could be mistaken for a complete archive.
        path_partial = f"{path_cbz}.part"
        with zipfile.ZipFile(path_partial, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            for filename in sorted(os.listdir(temp_dir)):
                zipf.write(os.path.join(temp_dir, filename), arcname=filename)
        os.replace(path_partial, path_cbz)

# Trial run: fetch the single episode configured above into path_cbz.
download_cbz(url=url, path_cbz=path_cbz, img_format="avif")

In [None]:
# Episode viewer pages for the batch trial. Each slug is paired with its own
# episode_no, as printed in the episode listing earlier in this notebook
# (episode_no=2 belongs to the "episode-1" slug, not the prologue).
urls = [
    "https://www.webtoons.com/en/action/omniscient-reader/episode-0-prologue/viewer?title_no=2154&episode_no=1",
    "https://www.webtoons.com/en/action/omniscient-reader/episode-1/viewer?title_no=2154&episode_no=2",
]

def download_cbz_all(urls, folder, img_format="avif", name_prefix="omniscient_reader"):
    """
    Download one CBZ archive per episode URL into ``folder``.

    Each archive is named ``<name_prefix>_<episode_no>.cbz``, where the
    episode number is read from the ``episode_no=<digits>`` part of the URL.

    Args:
        urls: Iterable of episode viewer URLs.
        folder: Directory receiving the archives; created if missing.
        img_format: Passed through to ``download_cbz``.
        name_prefix: Leading part of every archive file name. The default
            keeps the file names this function has always produced.

    Raises:
        ValueError: If a URL contains no ``episode_no=<digits>`` part.
    """
    if folder:
        # An empty folder means "current directory"; makedirs("") would fail.
        os.makedirs(folder, exist_ok=True)
    for url in tqdm(urls, desc="Processing URLs", unit="url"):
        match = re.search(r'episode_no=(\d+)', url)
        if match is None:
            # Previously this surfaced as AttributeError on ``None.group``.
            raise ValueError(f"URL has no episode_no parameter: {url}")
        path_cbz = os.path.join(folder, f"{name_prefix}_{match.group(1)}.cbz")
        download_cbz(url, path_cbz, img_format)

# Batch trial: one archive per URL in ``urls``, written into ``folder``.
download_cbz_all(urls=urls, folder=folder, img_format="avif")