In [1]:
import re
from typing import Dict, List

import fitz

filename = "pdf/sample20230430.pdf"

url_regrex = re.compile(r"https?://(?:[-\w\/\:\?\=\~\&\#.]|(?:%[\da-fA-F]{2}))+")


def read_url_from_pdftext(filename: str) -> Dict[str, int]:
    text = ""
    # PDFを読み込む
    with fitz.open(filename) as doc:
        # １ページずつテキストを抽出して連結
        for page in range(len(doc)):
            text += doc[page].get_text()

    # URLを検索する
    urls = url_regrex.findall(text)
    print(f"Url: {len(urls)}")

    # 重複チェック
    return duplecate_check(urls)

def duplecate_check(urls: List[str]) -> Dict[str, int]:
    urls_pickuped = {}
    counter = 0
    for url in urls:
        if url in urls_pickuped:
            urls_pickuped[url] += 1
        else:
            urls_pickuped[url] = 1
            counter += 1
            # print(url)
    print(f"Unique url: {counter}")
    return urls_pickuped

In [2]:
import pikepdf

def read_url_from_pdflink(filename: str) -> Dict[str, int]:
    urls = []
    with pikepdf.Pdf.open(filename) as doc:
        # 各ページのリンクを抽出する
        for page in doc.pages:
            if not page.get("/Annots"):
                continue
            for annot in page.get("/Annots"):
                if not annot or not annot.get("/A"):
                    continue
                uri = annot.get("/A").get("/URI")
                if uri is not None:
                    urls.append(uri)
    print(f"Url: {len(urls)}")

    # 重複チェック
    return duplecate_check(urls)


In [3]:
urls_pickuped = read_url_from_pdftext(filename)
urls_linked = read_url_from_pdflink(filename)

Url: 84
Unique url: 84
Url: 85
Unique url: 85


In [4]:
from dictdiffer import diff

result = {
    x: _list for (x, y, _list) in (diff(urls_pickuped, urls_linked))
}
print(result)


{'add': [(pikepdf.String("https://ja.wikipedia.org/wiki/%E5%8A%89%E8%A3%95"), 1), (pikepdf.String("http://www.kbs-kyoto.co.jp/tv/kaigai/archives/hokugi/%23062378"), 1)], 'remove': [('http://www.kbs-kyoto.co.jp/tv/kaigai/archives/hokugi/#062378', 1)]}


In [5]:
# マージ

urls = [x for x in urls_pickuped.keys()]
#urls.append(str(result["add"][0][0]))

import urllib.parse

for (url, count) in result["add"]:
    decoded = urllib.parse.unquote(str(url))
    if decoded in urls:
        print(f"{decoded} found")
    else:
        print(f"{decoded} not found")
        urls.append(str(url))

with open("urls.txt", mode="w") as _fp:
    for url in urls:
        _fp.write(url)
        _fp.write("\n")


https://ja.wikipedia.org/wiki/劉裕 not found
http://www.kbs-kyoto.co.jp/tv/kaigai/archives/hokugi/#062378 found


In [6]:
import qrcode

# import qrcode.image.svg

# method = "basic"

# if not method and method == 'basic':
#     # Simple factory, just a set of rects.
#     factory = qrcode.image.svg.SvgImage
# elif method == 'fragment':
#     # Fragment factory (also just a set of rects)
#     factory = qrcode.image.svg.SvgFragmentImage
# else:
#     # Combined path factory, fixes white space that may occur when zooming
#     factory = qrcode.image.svg.SvgPathImage

factory = qrcode.image.pure.PyPNGImage

qr = qrcode.QRCode(
    version=1,
    error_correction=qrcode.constants.ERROR_CORRECT_L,
    box_size=2,
    border=4,
    image_factory=factory
)

def make_qrcodes(urls: List[str]) -> None:
    for url in urls:
        qr.clear()
        qr.add_data(url)
        qr.make(fit=True)
        image = qr.make_image(fill_color="black", back_color="white")
        if image:
            image.save("qrcode/" + re.sub(r"[^a-zA-Z0-9._-]+", "_", url) + ".png")
        else:
            print(f"image empty {image} {url}")


In [7]:
import os
import shutil
import zipfile

def clear_dir(dirname: str) -> None:
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.makedirs(dirname)

def zip_dir(zip_file: str, dirname: str, more_files: List[str]) -> None:
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # ディレクトリ内のファイルをzipに追加する
        for foldername, subfolders, filenames in os.walk(dirname):
            for filename in filenames:
                filepath = os.path.join(foldername, filename)
                zipf.write(filepath, os.path.relpath(filepath, dirname))
        # 追加のファイルをzipに追加する
        for file in more_files:
            zipf.write(file, os.path.basename(file))

In [8]:
qrcode_dir = "qrcode"
more_files = ["urls.txt"]
zip_file = "qrcodes.zip"

clear_dir(qrcode_dir)
make_qrcodes(urls)
zip_dir(zip_file, qrcode_dir, more_files)