In [1]:
import os
import sys

# Root directory of the project sources. It and the Penguin-ML-Library directory
# beneath it are appended (in that order) to the module search path.
PACKAGE_DIR = "/kaggle/src"
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [2]:
import bz2
import gc
import json
import multiprocessing
import random
import warnings
from collections import Counter
from copy import deepcopy
from glob import glob
from multiprocessing import Pool
from typing import List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import plyvel
import polars as pl
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base
from penguinml.utils.timer import Timer
from tqdm import tqdm

import whoosh_utils
from const import ALL_KEYS, INF, KEY2QUERY, NUM_CPU
from db import CpcToken2RangeDB, SingleTokenDB, TokinezedDB
from solver import HitBlock, SimulatedAnnealing, State
from utils import compute_ap, evaluate, load_list_bz2, save_list_bz2

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
MODEL_NAME = "baseline"

# Read the YAML configuration through a context manager so the file handle is
# closed as soon as parsing finishes instead of being left open.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

init_logger("log.log")
logger = get_logger("main")
# The seed comes from the section of the config that belongs to MODEL_NAME.
# NOTE(review): seed_base presumably seeds the RNGs in use - confirm in penguinml.
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

Processing /kaggle/input/whoosh-wheel-2-7-4/Whoosh-2.7.4-py2.py3-none-any.whl
Whoosh is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


[0m2024-06-22 04:49:41.393754: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-22 04:49:41.639442: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-22 04:49:42.732985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/li

In [3]:
# Nearest-neighbour table. As indexed below, the first value of every row is the
# "center" publication and the remaining 50 values are its neighbours.
df = pl.read_csv("/kaggle/input/uspto-boolean-search-optimization/nearest_neighbors.csv")

# Publication number <-> integer id (the position in the publication_number column).
publication_numbers = df["publication_number"].to_numpy()
pub2id = {pub: idx for idx, pub in enumerate(publication_numbers)}
id2pub = dict(enumerate(publication_numbers))

# Reverse index: neighbour id -> ids of every center that lists it as a neighbour.
patent2centers = {}
for row in tqdm(df.iter_rows(), total=df.height):
    center, *targets = row
    center_id = pub2id[center]
    assert len(targets) == 50, f"expected 50 neighbours, got {len(targets)}"

    for target in targets:
        # Neighbours that are not themselves centers have no id and are skipped.
        target_id = pub2id.get(target)
        if target_id is None:
            continue
        patent2centers.setdefault(target_id, []).append(center_id)

# The frame and the extracted column are large; release them before later cells.
del df, publication_numbers
gc.collect()

13307647it [23:28, 9451.27it/s] 


0

In [4]:
# Load the CPC mapping. Downstream code treats each item as
# (cpc code, collection of patent identifiers).
# JSON text is UTF-8, so decode it explicitly rather than relying on the locale default.
with open("/kaggle/input/cpc-mapping/cpc2patents.json", "r", encoding="utf-8") as f:
    cpc2patents = json.load(f)

gc.collect()

0

In [5]:
# !rm -r data
# !mkdir data

In [10]:
# Only CPC groups whose patent count lies in this inclusive window are processed.
_MIN_CPC_PATENTS = 1000
_MAX_CPC_PATENTS = 20000
# A token is rejected once it matches this many patents outside the center's set.
_OUTER_PATENT_LIMIT = 5
# A token must match at least this many patents inside the center's set.
_MIN_INNER_PATENTS = 2
# Directories searched, in order, for a patent's "<patent>.json.bz2" index file.
_PATENT_INDEX_DIRS = (
    "/kaggle/input/all-index-per-patent/data",
    "/kaggle/input/all-index-per-patent/data2",
)


def _load_patent_tokens(patent):
    """Return the set of "<field>:<word>" tokens of one patent, or None if it cannot be loaded."""
    try:
        for index_dir in _PATENT_INDEX_DIRS:
            path = os.path.join(index_dir, patent + ".json.bz2")
            if os.path.exists(path):
                data = load_list_bz2(path)
                break
        else:
            # No index file in any of the directories.
            return None
    except Exception:
        # Best effort: an unreadable or corrupt index file just drops the patent.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working inside pool workers.
        return None
    return {KEY2QUERY[key] + ":" + word for key, words in data.items() for word in words}


def _build_token_index(patent_ids, patent2tokens):
    """Invert patent -> tokens into token -> set of patents, restricted to patent_ids."""
    token2patents = {}
    for patent in patent_ids:
        # Patents without loaded tokens contribute nothing.
        for token in patent2tokens.get(patent, ()):
            token2patents.setdefault(token, set()).add(patent)
    return token2patents


def _group_patents_by_center(patents):
    """Map each center publication to the set of given patents that list it in patent2centers."""
    center2patents = {}
    for patent in patents:
        patent_id = pub2id.get(patent)
        if patent_id is None:
            continue
        for center_id in patent2centers.get(patent_id, ()):
            if center_id in id2pub:
                center2patents.setdefault(id2pub[center_id], set()).add(patent)
    return center2patents


def process_cpc_patents(args):
    """Find, for one CPC group, the tokens that isolate each center's patents.

    For every center with patents in the group, a token is kept when it matches
    at least ``_MIN_INNER_PATENTS`` of that center's patents and fewer than
    ``_OUTER_PATENT_LIMIT`` other patents of the group. Tokens matching exactly
    the same patents as one already kept for that center are dropped.

    Relies on the module-level ``pub2id``, ``id2pub`` and ``patent2centers``
    mappings as well as ``KEY2QUERY``, ``load_list_bz2`` and ``save_list_bz2``.

    Args:
        args: ``(cpc, patents)`` pair; ``cpc`` is the CPC code string and
            ``patents`` the patent identifiers filed under it.

    Returns:
        None. A list of ``(center, token, inner_patents, outer_patents)`` tuples
        is written to ``data/<cpc with "/" replaced by "-">.bz2``. Nothing
        happens if the group size is outside the window or the file exists.
    """
    cpc, patents = args
    if not (_MIN_CPC_PATENTS <= len(patents) <= _MAX_CPC_PATENTS):
        return

    cpc_save = cpc.replace("/", "-")
    output_path = f"data/{cpc_save}.bz2"
    if os.path.exists(output_path):
        # Already processed in an earlier run.
        return

    # Load the text tokens of every patent in the group.
    patent2tokens = {}
    for patent in patents:
        tokens = _load_patent_tokens(patent)
        if tokens is not None:
            patent2tokens[patent] = tokens

    # Group-wide index: token -> every patent of the group containing it.
    global_token2patents = _build_token_index(patent2tokens, patent2tokens)

    # Organize the patents per center: center -> set of patents.
    center2patents = _group_patents_by_center(patents)

    # Collect the usable tokens for each center.
    center_token_inner_outers = []
    for center, this_patents in center2patents.items():
        token2patents = _build_token_index(this_patents, patent2tokens)

        seen_patent_sets = set()
        for token, inner_patents in token2patents.items():
            all_patents = global_token2patents[token]

            # inner_patents is always a subset of all_patents, so the outer
            # count is a plain difference of sizes; the set difference itself
            # is only built for tokens that survive every filter.
            outer_count = len(all_patents) - len(inner_patents)

            # Reject tokens that hit too many patents outside the center's set.
            if outer_count >= _OUTER_PATENT_LIMIT:
                continue

            # Reject tokens that hit too few patents inside the center's set.
            if len(inner_patents) < _MIN_INNER_PATENTS:
                continue

            # Reject tokens that match exactly the same patents as an earlier one.
            patent_set_key = frozenset(all_patents)
            if patent_set_key in seen_patent_sets:
                continue
            seen_patent_sets.add(patent_set_key)

            outer_patents = all_patents - inner_patents
            center_token_inner_outers.append(
                (center, token, list(inner_patents), list(outer_patents))
            )

    # The result is saved even when empty, so the existence check above treats
    # the group as done. Make sure the output directory exists first.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    save_list_bz2(center_token_inner_outers, output_path)


# Fan the CPC groups out over 16 worker processes. The return values are not
# needed (each call writes its own output file), so the result iterator is
# drained only to drive the progress bar.
# For single-process debugging, loop over cpc2patents.items() and call
# process_cpc_patents((cpc, patents)) directly.
with Pool(16) as pool:
    progress = tqdm(
        pool.imap_unordered(process_cpc_patents, cpc2patents.items()),
        total=len(cpc2patents),
    )
    for _ in progress:
        pass

100%|██████████| 265889/265889 [06:29<00:00, 682.41it/s] 
