In [1]:
import os
import re
import shutil
import requests
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PyPDF2 import PdfMerger

## Functions

In [2]:
def get_image_urls(manga_chapter_link, verbose=0):
    """Fetch a chapter page and return the image URLs embedded in it.

    The page text is searched for a hidden ``<p id=arraydata ...>`` element;
    the text between that tag and the following ``</p>\\r`` is split on commas.

    Args:
        manga_chapter_link: URL of the chapter page to download.
        verbose: When greater than 0, print how many pages were found.

    Returns:
        list[str]: The comma-separated entries in page order, stripped of
        surrounding whitespace, with empty entries dropped.

    Raises:
        ValueError: If the start marker, or the end marker after it, is not
            present in the page. (Previously this path fell through to an
            ``UnboundLocalError`` on ``return result``.)
    """
    data = requests.get(manga_chapter_link).text
    start_marker = "<p id=arraydata style=display:none>"
    end_marker = "</p>\r"

    start_index = data.find(start_marker)
    if start_index == -1:
        raise ValueError(start_marker + " not found in " + str(manga_chapter_link))

    # Search for the end marker only after the start marker, so an earlier
    # "</p>" in the page cannot be picked up by mistake.
    payload_start = start_index + len(start_marker)
    end_index = data.find(end_marker, payload_start)
    if end_index == -1:
        raise ValueError(
            "end_marker not found after " + start_marker + " in " + str(manga_chapter_link)
        )

    # An empty payload would otherwise yield [""], i.e. one bogus empty URL.
    candidates = (entry.strip() for entry in data[payload_start:end_index].split(","))
    result = [entry for entry in candidates if entry]

    if verbose > 0:
        print("found " + str(len(result)) + " pages")

    return result

In [3]:
def get_all_chapter_urls(main_page_url):
    """Collect the chapter links listed on a manga's main page.

    The page is downloaded and the region between the last
    ``<!-- content-->`` comment and the first ``</ul>`` after it is scanned
    for ``href="..."`` attributes.

    Args:
        main_page_url: URL of the manga's main page.

    Returns:
        list[str]: The href values found in that region, in the reverse of
        the order in which they appear on the page.

    Raises:
        ValueError: If the content marker or the closing ``</ul>`` is missing
            (raised by ``str.rindex`` / ``str.index``).
    """
    page_html = requests.get(main_page_url).text

    # The chapter list is taken to start at the *last* content marker.
    list_start = page_html.rindex("<!-- content-->")
    list_end = page_html.index("</ul>", list_start)

    hrefs = re.findall(r"href=\"(.*?)\"", page_html[list_start:list_end])
    hrefs.reverse()
    return hrefs

In [4]:
def download_chapter_images(chapter_image_urls, manga_name, chapter):
    """Download every page image of one chapter into a local folder.

    Images are written to ``./downloaded_images/<manga_name>/<chapter>/`` as
    ``image_<n>.png``, where ``n`` is the 1-based position of the URL in
    ``chapter_image_urls``. The ``.png`` suffix is used regardless of the
    actual format of the downloaded bytes.

    Args:
        chapter_image_urls: Iterable of image URLs, in page order.
        manga_name: Folder name for the manga.
        chapter: Folder name for the chapter.

    Returns:
        None. A page whose response status is not 200 is reported on stdout
        and skipped; no file is written for it.
    """
    chapter_dir = os.path.join('./downloaded_images', manga_name, chapter)
    # One call creates all missing parents and tolerates an existing folder,
    # avoiding the check-then-create race of separate exists()/makedirs().
    os.makedirs(chapter_dir, exist_ok=True)

    for page_number, image_url in enumerate(chapter_image_urls, start=1):
        response = requests.get(image_url)
        if response.status_code == 200:
            image_path = os.path.join(chapter_dir, 'image_' + str(page_number) + '.png')
            with open(image_path, 'wb') as image_file:
                image_file.write(response.content)
        else:
            print("failed to download page", page_number)

In [5]:
def create_PDF_from_images(manga_name, chapter, verbose=0):
    """Build ``./Manga_PDFs/<manga_name>/<chapter>.pdf`` from downloaded pages.

    Every file in ``./downloaded_images/<manga_name>/<chapter>/`` is placed on
    its own letter-sized page, ordered by the integer between the underscore
    and the first dot of its file name (``image_<n>.png``).

    Args:
        manga_name: Folder name of the manga.
        chapter: Folder name of the chapter; also the PDF's base name.
        verbose: When greater than 0, print the path of the saved PDF.

    Returns:
        None.
    """
    pdf_root = './Manga_PDFs'
    manga_pdf_dir = os.path.join(pdf_root, manga_name)
    for directory in (pdf_root, manga_pdf_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    output_pdf_path = os.path.join(manga_pdf_dir, chapter + '.pdf')

    pdf = canvas.Canvas(output_pdf_path, pagesize=letter)
    page_width, page_height = letter

    def page_number(file_name):
        # "image_12.png" -> 12
        return int(file_name.split('.')[0].split("_")[1])

    image_dir = os.path.join("./downloaded_images", manga_name, chapter)
    ordered_paths = [
        os.path.join(image_dir, file_name)
        for file_name in sorted(os.listdir(image_dir), key=page_number)
    ]

    for image_path in ordered_paths:
        # Fit the image inside the full page while keeping its aspect ratio.
        pdf.drawImage(image_path, 0, 0, page_width, page_height, preserveAspectRatio=True)
        pdf.showPage()

    # Extra showPage() after the last image, intended as a blank trailing page.
    pdf.showPage()

    pdf.save()

    if verbose > 0:
        print(f'PDF saved as "{output_pdf_path}"')

In [6]:
def merge_pdf_files(path_to_pdfs, list_of_chapter_names, path_of_merge):
    """Concatenate several PDF files into a single output file.

    Args:
        path_to_pdfs: Directory that contains the input PDFs.
        list_of_chapter_names: File names inside ``path_to_pdfs``, in the
            order they should appear in the merged document.
        path_of_merge: Path of the merged file to write.

    Returns:
        None.

    Raises:
        Whatever ``PdfMerger.append`` / ``PdfMerger.write`` or ``open`` raise;
        the merger is closed in every case.
    """
    pdf_merger = PdfMerger()
    try:
        # Append everything before opening the output, so a bad input does
        # not leave an empty output file behind.
        for pdf_file in list_of_chapter_names:
            pdf_merger.append(os.path.join(path_to_pdfs, pdf_file))

        with open(path_of_merge, 'wb') as output:
            pdf_merger.write(output)
    finally:
        # Release the merger's file handles even when a step above fails.
        pdf_merger.close()

In [10]:
def merge_manga_pdf_folder(manga_pdf_folder_path, merge_step = 10, verbose=0):
    """Merge a folder of per-chapter PDFs into files of ``merge_step`` chapters.

    The ``*.pdf`` files in ``manga_pdf_folder_path`` are ordered by the integer
    before the first dot of their name, merged in consecutive groups, and
    written to a sibling folder named ``<folder>_merged``. After a group has
    been merged, its source PDFs are deleted.

    Args:
        manga_pdf_folder_path: Folder that holds the chapter PDFs.
        merge_step: Number of chapters per merged file; must be at least 1.
        verbose: When greater than 0, print each merged file's title and path.

    Returns:
        None.

    Raises:
        ValueError: If ``merge_step`` is smaller than 1, or if a PDF's name
            does not start with an integer.
    """
    if merge_step < 1:
        # A negative step would make range() empty and silently merge nothing.
        raise ValueError("merge_step must be at least 1, got " + str(merge_step))

    parent_dir, folder_name = os.path.split(manga_pdf_folder_path)
    merge_folder_path = os.path.join(parent_dir, folder_name + "_merged")
    # exist_ok lets the merge be re-run after an interrupted or earlier run.
    os.makedirs(merge_folder_path, exist_ok=True)

    chapter_pdfs = [f for f in os.listdir(manga_pdf_folder_path) if f.endswith('.pdf')]
    chapter_pdfs.sort(key=lambda x: int(x.split('.')[0]))

    for i in range(0, len(chapter_pdfs), merge_step):
        chapters_to_merge = chapter_pdfs[i: i + merge_step]
        first_chapter = chapters_to_merge[0].split(".")[0]
        last_chapter = chapters_to_merge[-1].split(".")[0]
        title = "Chapters " + str(first_chapter) + "-" + str(last_chapter)
        # NOTE(review): the merged file is written without a ".pdf" extension;
        # kept as-is because the output name is observable behaviour.
        merge_path = os.path.join(merge_folder_path, title)
        merge_pdf_files(manga_pdf_folder_path, chapters_to_merge, merge_path)

        # Sources are removed only after their group merged without raising.
        for chapter_pdf in chapters_to_merge:
            os.remove(os.path.join(manga_pdf_folder_path, chapter_pdf))

        if verbose > 0:
            print("merged", title, "into", merge_path)

In [None]:
def double_check_chapters(main_page_url, manga_name):
    """Report which chapters listed on the main page have no PDF yet.

    For each chapter URL, the expected file name is the text after the last
    ``-`` in the URL plus ``.pdf``; these are compared with the entries of
    ``./Manga_PDFs/<manga_name>``.

    Args:
        main_page_url: URL of the manga's main page.
        manga_name: Folder name of the manga under ``./Manga_PDFs``.

    Returns:
        list[str]: Expected PDF file names that are not present in the
        folder, in no particular order.
    """
    # List the folder first so a missing folder fails before any download.
    present = set(os.listdir(os.path.join('./Manga_PDFs', manga_name)))

    expected = {
        chapter_url.split("-")[-1] + ".pdf"
        for chapter_url in get_all_chapter_urls(main_page_url)
    }

    return list(expected - present)

In [None]:
def download_manga_from_main_page_url(main_page_url, manga_name, merge_step=10, verbose=0):
    """Download every chapter of a manga and bundle the chapters into PDFs.

    For each chapter link found on the main page: fetch its image URLs,
    download the images, build a chapter PDF, then delete the images. Once
    all chapters are done and none is reported missing, the chapter PDFs are
    merged in groups of ``merge_step`` and the per-chapter PDF folder is
    removed.

    Args:
        main_page_url: URL of the manga's main page.
        manga_name: Name used for the working folders; spaces are replaced
            with underscores.
        merge_step: Number of chapters per merged PDF.
        verbose: Verbosity level; helpers are called with ``verbose - 1``.

    Returns:
        None. Returns early, leaving any chapter PDFs in place, when no
        chapter link is found or when chapters are reported missing.
    """
    manga_name = manga_name.replace(" ", "_")
    chapter_urls = get_all_chapter_urls(main_page_url)

    if not chapter_urls:
        # No chapter folder gets created in this case, so the clean-up below
        # would fail with FileNotFoundError.
        print("no chapter links found at", main_page_url)
        return

    images_dir = os.path.join('./downloaded_images', manga_name)
    pdf_dir = os.path.join("./Manga_PDFs", manga_name)

    for manga_chapter_link in chapter_urls:
        # The chapter id is whatever follows the last "-" in the link.
        chapter = manga_chapter_link.split("-")[-1]
        if verbose > 0:
            print("processing chapter:", chapter)

        image_urls = get_image_urls(manga_chapter_link, verbose=verbose-1)
        download_chapter_images(image_urls, manga_name, chapter)
        create_PDF_from_images(manga_name, chapter, verbose=verbose-1)

        if verbose > 0:
            print("deleting downloaded images for chapter", chapter)
        shutil.rmtree(os.path.join(images_dir, chapter))
    shutil.rmtree(images_dir)

    missing_chapters = double_check_chapters(main_page_url, manga_name)
    if len(missing_chapters) != 0:
        print(" ".join(missing_chapters), "are missing, abort mission!")
        return

    merge_manga_pdf_folder(pdf_dir, merge_step = merge_step, verbose=verbose-1)

    # The merge step deletes the chapter PDFs it consumed; drop the folder too.
    shutil.rmtree(pdf_dir)

# Auto Download

In [None]:
# Main page of the series; its chapter list drives the whole download.
main_page_url = "https://mangapanda.in/manga/planetes"
download_manga_from_main_page_url(
    main_page_url,
    manga_name="planetes2",
    merge_step=7,
    verbose=1,
)

# Manual download  

- Try downloading the chapters manually if the auto download fails because of broken chapter page URLs on the website.  
- You can detect a broken chapter page URL by this error:  
"fileName=<_io.BufferedReader name=PATH/OF/BROKEN/PNG identity=[ImageReader@0x7f6f650d5b20] cannot identify image file <_io.BytesIO object at 0x7f6f650d2ea0>"

In [None]:
def download_manga_with_list(manga_link, manga_name, chapter_list):
    """Download an explicit list of chapters and build one PDF per chapter.

    Each chapter page URL is formed as ``manga_link + chapter``. Processing
    stops at the first chapter for which any step raises; the error is
    printed rather than propagated.

    Args:
        manga_link: URL prefix to which each chapter id is appended.
        manga_name: Folder name used for the downloaded images and PDFs.
        chapter_list: Chapter ids as strings, in the order to process.

    Returns:
        None.
    """
    for chapter in chapter_list:
        manga_chapter_link = manga_link + chapter

        print("processing chapter:", chapter)

        try:
            image_urls = get_image_urls(manga_chapter_link)
        except Exception as e:
            print("error during geting image urls\n", e)
            break

        try:
            download_chapter_images(image_urls, manga_name, chapter)
        except Exception as e:
            print("error during dowloading images\n", e)
            break

        try:
            # create_PDF_from_images takes (manga_name, chapter, verbose=0);
            # the image URLs are not one of its arguments.
            create_PDF_from_images(manga_name, chapter)
        except Exception as e:
            print("error during creating pdf\n", e)
            break

In [None]:
# Disabled example of the manual workflow: the calls are wrapped in a bare
# string literal, so nothing in this cell is executed.
# NOTE(review): the merge call targets "./Manga_PDFs/fire punch" while
# manga_name is "planetes" - confirm the intended folder before enabling.
"""manga_link = "https://mangapanda.in/planetes-chapter-"
manga_name = "planetes"
chapter_list = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26"]

download_manga_with_list(manga_link, manga_name, chapter_list)
merge_manga_pdf_folder("./Manga_PDFs/fire punch", merge_step = 10)"""