In [1]:
import json
import multiprocessing
import os
import re
import tempfile
from collections import Counter
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import whoosh
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from whoosh.analysis import StandardAnalyzer

In [2]:
import bz2


def save_list_bz2(data, filename):
    """Write ``data`` to ``filename`` as bz2-compressed, UTF-8 encoded JSON.

    Args:
        data: Any object accepted by ``json.dumps``.
        filename: Path of the file to create or overwrite.
    """
    # Serialize and compress up front so the file is only opened once the
    # payload is known to be valid.
    payload = bz2.compress(json.dumps(data).encode("utf-8"))

    with open(filename, "wb") as out:
        out.write(payload)


def load_list_bz2(filename):
    """Read a bz2-compressed, UTF-8 encoded JSON file and return its content.

    Args:
        filename: Path of the compressed file to read.

    Returns:
        The object produced by ``json.loads`` on the decompressed text.
    """
    with open(filename, "rb") as src:
        raw = src.read()

    text = bz2.decompress(raw).decode("utf-8")
    return json.loads(text)

In [3]:
def process_file(file_path):
    """Build per-field token counts for one input file and save them as JSON.

    The file is loaded with ``load_list_bz2`` and is expected to be a mapping
    whose values are themselves mappings of field name -> token sequence
    (presumably one entry per patent -- confirm against the producer of the
    input files). For each field, a token is counted at most once per entry,
    so the result is the number of entries containing the token.

    The counts are written to ``token_counts/<basename>`` with ``.bz2``
    replaced by ``.json``. If that output already exists the file is skipped,
    which makes interrupted runs resumable.

    Args:
        file_path: Path of the bz2-compressed input file.

    Raises:
        KeyError: If an entry has a non-empty field other than ``title``,
            ``abstract``, ``claims`` or ``description``.
    """
    # Derive the output file name from the input file name.
    output_dir = "token_counts"
    source_name = os.path.basename(file_path)
    output_file = os.path.join(output_dir, source_name.replace(".bz2", ".json"))

    # Nothing to do when the result was already written by an earlier run.
    if os.path.exists(output_file):
        return

    # Load the tokenized entries.
    data = load_list_bz2(file_path)

    word_counts = {field: {} for field in ("title", "abstract", "claims", "description")}
    for pat_tokens in data.values():
        for key, tokens in pat_tokens.items():
            # Deduplicate so each token counts once per entry and field.
            for token in set(tokens):
                counts = word_counts[key]
                counts[token] = counts.get(token, 0) + 1

    # Write through a temporary file so readers never see a partial result.
    safe_write(word_counts, output_file)


def safe_write(data, path):
    """Atomically write ``data`` as JSON to ``path``.

    The JSON is first written to a temporary file in the destination
    directory and then moved over ``path`` with ``os.replace``, so ``path``
    never contains a partially written document.

    Args:
        data: Object to serialize with ``json.dump``.
        path: Destination file path. Missing parent directories are created;
            a bare filename is written to the current working directory.

    Raises:
        TypeError, ValueError: Propagated from ``json.dump`` when ``data``
            cannot be serialized. The temporary file is removed first.
        OSError: Propagated from directory creation, file creation or the
            final rename.
    """
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so fall back to the current directory.
    directory = os.path.dirname(path) or "."
    os.makedirs(directory, exist_ok=True)

    # delete=False because the file must outlive the handle to be renamed.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=directory, mode="w")
    try:
        with tmp_file:
            json.dump(data, tmp_file)
        os.replace(tmp_file.name, path)
    except BaseException:
        # Do not leave orphaned temp files next to the real outputs when
        # serialization or the rename fails (or the run is interrupted).
        try:
            os.unlink(tmp_file.name)
        except FileNotFoundError:
            pass
        raise


# Input files, sorted so every run visits them in the same order.
files = sorted(glob("/kaggle/input/all-index/patent2data/*.bz2"))

# Number of worker processes and number of files handed to a worker per
# dispatch. The recorded run covered ~1.3M files; with the default chunksize
# of 1 every file costs one dispatch round-trip, which dominates on re-runs
# where process_file only checks that its output already exists.
N_WORKERS = 30
CHUNK_SIZE = 64

with multiprocessing.Pool(processes=N_WORKERS) as pool:
    # process_file returns nothing; the results are consumed only to drive
    # the progress bar and to surface worker exceptions.
    list(tqdm(pool.imap(process_file, files, chunksize=CHUNK_SIZE), total=len(files)))

100%|██████████| 1330765/1330765 [44:11<00:00, 501.92it/s]  


In [4]:
# Merge the per-file count tables into one Counter per field.
files = sorted(glob("token_counts/*.json"))
token_counts = {field: Counter() for field in ("title", "abstract", "description", "claims")}
for file in tqdm(files):
    with open(file) as f:
        data = json.load(f)
    # Counter.update with a mapping adds the counts instead of replacing them.
    for key, merged in token_counts.items():
        merged.update(data[key])

100%|██████████| 1330765/1330765 [1:18:17<00:00, 283.26it/s]


In [5]:
# Sanity check: show the two most frequent tokens of every field.
for field in ("title", "abstract", "description", "claims"):
    print(token_counts[field].most_common(2))

[('and', 4083374), ('method', 2465608)]
[('and', 8575647), ('in', 6491832)]
[('and', 13276113), ('in', 12941306)]
[('and', 9944834), ('claim', 9085779)]


In [6]:
# Replace each Counter with a plain dict holding the same entries. Iterating
# over a snapshot keeps the loop independent of the dict being updated.
for key, counts in list(token_counts.items()):
    token_counts[key] = dict(counts)

In [7]:
# Persist the merged per-field counts as a single JSON document.
with open("token_counts.json", "w") as out_file:
    json.dump(token_counts, fp=out_file)