In [1]:
import os
import sys

# Root directory holding the project sources; it and the Penguin-ML-Library
# directory beneath it are appended to the import search path, in that order.
PACKAGE_DIR = "/kaggle/src"
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [2]:
import json
import os
from glob import glob

import numpy as np
import polars as pl
from tqdm import tqdm

In [3]:
import re
from typing import List

import whoosh

import whoosh_utils

# TODO: move get_analyzer() into utils
# Words handed to the analyzer as its stoplist (see custom_analyzer).
BRS_STOPWORDS = [
    "an", "are", "by", "for", "if", "into",
    "is", "no", "not", "of", "on", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will",
]


# Matches a string that is entirely a number: either a plain run of digits or
# comma-grouped thousands (e.g. "1,234,567"), optionally followed by a decimal part.
NUMBER_REGEX = re.compile(r"^(\d+|\d{1,3}(,\d{3})*)(\.\d+)?$")


class NumberFilter(whoosh.analysis.Filter):
    """Analysis filter that removes purely numeric tokens from a token stream.

    A token is dropped when its ``text`` matches ``NUMBER_REGEX``; every other
    token is passed through untouched and in the original order.
    """

    def __call__(self, tokens):
        """Yield each token from ``tokens`` whose text is not a number."""
        for token in tokens:
            if NUMBER_REGEX.match(token.text):
                # Numeric token: skip it so it never reaches the output stream.
                continue
            yield token


def get_token_list(text: str) -> List[str]:
    """Analyze ``text`` with ``custom_analyzer`` and return the token strings.

    Args:
        text: Raw text to tokenize.

    Returns:
        The ``text`` attribute of every token the analyzer emits, in order.
    """
    # The text is read from each token while iterating, so the result holds
    # plain strings rather than the analyzer's token objects.
    return [token.text for token in custom_analyzer(text)]


# Analyzer pipeline: Whoosh's StandardAnalyzer configured with the BRS stoplist,
# followed by NumberFilter so numeric tokens are discarded as well.
# NOTE(review): StandardAnalyzer's remaining defaults (e.g. lowercasing and any
# minimum token length) still apply here; confirm against the Whoosh docs.
custom_analyzer = (
    whoosh.analysis.StandardAnalyzer(stoplist=BRS_STOPWORDS)
    | NumberFilter()
)

Processing /kaggle/input/whoosh-wheel-2-7-4/Whoosh-2.7.4-py2.py3-none-any.whl
Whoosh is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


[0m

In [4]:
# Load a single shard and show its first row to inspect the column layout.
df = pl.read_parquet(
    "/kaggle/input/uspto-boolean-search-optimization/patent_data/1790_7.parquet"
)
df.head(1)

publication_number,title,abstract,claims,description
str,str,str,str,str
"""US-X1-I1""","""The making of …","""""","""""","""XQQQQ Si @s EQ…"


In [5]:
import bz2


def save_list_bz2(data, filename):
    """Write ``data`` to ``filename`` as bz2-compressed, UTF-8 encoded JSON.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.
    """
    # Serialize and compress fully in memory before touching the file.
    payload = bz2.compress(json.dumps(data).encode("utf-8"))
    with open(filename, "wb") as out:
        out.write(payload)


def load_list_bz2(filename):
    """Read a bz2-compressed, UTF-8 encoded JSON file and return its content.

    Args:
        filename: Path of a file in the format produced by ``save_list_bz2``.

    Returns:
        The Python object decoded from the JSON payload.
    """
    with open(filename, "rb") as src:
        raw = src.read()
    # Reverse the save steps: decompress, decode the bytes, then parse JSON.
    return json.loads(bz2.decompress(raw).decode("utf-8"))

In [6]:
import multiprocessing

# Tokenize every patent in the dataset and write the tokens out in bz2-compressed
# JSON shards of N_PATENT_PER_FILE patents each. patent2json records, for every
# publication number, the id of the shard that holds its tokens.
os.makedirs("patent2data", exist_ok=True)

files = sorted(glob("/kaggle/input/uspto-boolean-search-optimization/patent_data/*.parquet"))

N_PATENT_PER_FILE = 10  # number of patents stored per shard file
save_file_id = 0  # id of the shard currently being filled
patent2json = {}  # publication number -> shard id
patent2data = {}  # patents accumulated for the current shard
for parquet_path in tqdm(files):
    df = pl.read_parquet(parquet_path)
    columns = (
        df["publication_number"],
        df["title"],
        df["abstract"],
        df["claims"],
        df["description"],
    )
    for patent, title, abstract, claims, description in zip(*columns):
        patent2json[patent] = save_file_id
        raw_fields = {
            "title": title,
            "abstract": abstract,
            "claims": claims,
            "description": description,
        }
        patent2data[patent] = {name: get_token_list(text) for name, text in raw_fields.items()}

        if len(patent2data) != N_PATENT_PER_FILE:
            continue

        # The shard is full: write it out and start a fresh one. Whatever is
        # still in patent2data when the loop ends is NOT written by this cell.
        save_list_bz2(patent2data, f"patent2data/{save_file_id}.json.bz2")
        patent2data = {}
        save_file_id += 1

100%|██████████| 2251/2251 [28:40:29<00:00, 45.86s/it]    


In [8]:
# Flush the final, partially filled shard left in memory by the main loop.
# When nothing is pending (the patent count is an exact multiple of
# N_PATENT_PER_FILE, or there were no patents at all) the write is skipped, so
# no empty shard file is created under an id that patent2json never refers to.
if patent2data:
    save_list_bz2(patent2data, f"patent2data/{save_file_id}.json.bz2")

In [7]:
# Persist the publication-number -> shard-id lookup table as plain JSON.
with open("patent2json.json", "w") as index_file:
    index_file.write(json.dumps(patent2json))