In [1]:
import os
from tqdm import tqdm

KAGGLE_ENV = not os.path.exists("/kaggle/.vscode")
ENV_NAME = "kaggle" if KAGGLE_ENV else "local"
print(f"{KAGGLE_ENV=}")
print(f"{ENV_NAME=}")

if KAGGLE_ENV:
    !pip install -U -q plyvel --no-index --find-links=file:///kaggle/input/uspto-gen-wheel/plyvel

    move_dirs = (
        [
        "/kaggle/input/preprocess-all-token-single",
        "/kaggle/input/uspto-rare-tokens-dataset",
        ] 
        + [f"/kaggle/input/complete-db-{i}" for i in range(15)]
        + [f"/kaggle/input/complete-db-v2-{i}" for i in range(5)]
        + [f"/kaggle/input/uspto-ratio-db-{i}" for i in range(20)]
    )

    !mkdir /kaggle/tmp
    for move_dir in tqdm(move_dirs):
        !cp -r {move_dir} /kaggle/tmp/{move_dir.split("/")[-1]}
    !ls /kaggle/tmp

KAGGLE_ENV=False
ENV_NAME='local'


In [2]:
import os
import sys

# Location of the project sources: a dataset mount on Kaggle, the repo checkout locally.
PACKAGE_DIR = "/kaggle/input/uspto-src/src" if KAGGLE_ENV else "/kaggle/src"

# Make both the project package and the Penguin-ML-Library directory inside it importable.
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [19]:
import json
import multiprocessing
import random
import warnings
from collections import Counter
from typing import List, Set, Tuple

import matplotlib.pyplot as plt
import numpy as np
import plyvel
import polars as pl
import yaml
from penguinml.utils.logger import get_logger, init_logger
from penguinml.utils.set_seed import seed_base
from penguinml.utils.timer import Timer
from tqdm import tqdm

import whoosh_utils
from const import INF, KEY2QUERY, NUM_CPU, QUERY2KEY
from db import CompleteDB, SingleTokenDB, TokinezedDB
from solver import HitBlock, SimulatedAnnealing, State
from utils import compute_ap, evaluate

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
MODEL_NAME = "baseline"

# Load the experiment configuration; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(os.path.join(PACKAGE_DIR, "config.yaml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)
print(CFG[MODEL_NAME]["execution"]["exp_id"])

# Per-experiment output directory, derived from the experiment id in the config.
CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
# !rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

# Logging goes to log.log; the seed comes from the config.
init_logger("log.log")
logger = get_logger("main")
seed_base(CFG[MODEL_NAME]["execution"]["seed"])

set seed: 46


exp_0001


In [6]:
# Path configuration for every data source used by the notebook. The two branches
# define the same family of constants for the Kaggle and the local environment.
# NOTE(review): TOKENIZED_SPLIT / TOKENIZED_DB_PATHES / TOKENIZED_INDEX_PATHES are
# assigned only in the local branch, yet a later cell passes them to TokinezedDB
# unconditionally, so that cell raises NameError when KAGGLE_ENV is True.

# Allowed values are "all" and "2hop" (checked by the assert below).
# NOTE(review): DANGER_TYPE is not read by any other cell visible in this notebook.
DANGER_TYPE = "all"
# DANGER_TYPE = "2hop"
assert DANGER_TYPE in ["all", "2hop"]

if KAGGLE_ENV:
    TRAIN_PATH = "/kaggle/input/uspto-train-index-2500/train2500_seed0.parquet"
    #     TRAIN_PATH = "/kaggle/input/uspto-explainable-ai/test.csv"
    TRAIN_INDEX_PATH = "/kaggle/input/uspto-train-index-2500/index_2500_200k"
    NN_DF_PATH = "/kaggle/input/uspto-explainable-ai/nearest_neighbors.csv"

    # database
    # The DB directories below live under /kaggle/tmp, where the first cell copies
    # them; the index files are read from /kaggle/input.
    PATENT2RARE_TOKENS_PATH = "/kaggle/tmp/uspto-rare-tokens-dataset/db"

    COMPLETE_DB_PATH = [f"/kaggle/tmp/complete-db-{i}/complete-db-{i}/db" for i in range(15)]
    COMPLETE_INDEX_PATH = [f"/kaggle/input/uspto-complete-index-{i}/index.lz4" for i in range(15)]

    COMPLETE_DB_V2_PATH = [f"/kaggle/tmp/complete-db-v2-{i}/db" for i in range(5)]
    COMPLETE_INDEX_V2_PATH = [f"/kaggle/input/complete-db-index-v2-{i}/index.lz4" for i in range(5)]

    # Number of shards of the ratio DB; drives both path lists below.
    CPC_TOKEN2RANGE_SPLIT = 20
    CPC_TOKEN2RANGE_DB_PATHES = [
        f"/kaggle/tmp/uspto-ratio-db-{i}/ratio-db-{i}/db" for i in range(CPC_TOKEN2RANGE_SPLIT)
    ]
    CPC_TOKEN_INDEX_PATHES = [
        f"/kaggle/input/uspto-ratio-db-index-{i}/index.txt" for i in range(CPC_TOKEN2RANGE_SPLIT)
    ]

    SINGLE_TOKEN_DB_PATH = "/kaggle/tmp/preprocess-all-token-single/db/db"
    SINGLE_TOKEN_INDEX_PATH = "/kaggle/tmp/preprocess-all-token-single/index.lz4"

    # True whenever the path contains "train"; switching TRAIN_PATH to the
    # commented-out test.csv above therefore turns this off.
    TRAIN_MODE = "train" in TRAIN_PATH
    VISUALIZE = False
else:
    TRAIN_PATH = "/kaggle/input/uspto-train-data-2500/train2500_seed0.parquet"
    TRAIN_INDEX_PATH = "/kaggle/input/train-index-2500/index_2500_200k"
    NN_DF_PATH = "/kaggle/input/uspto-boolean-search-optimization/nearest_neighbors.csv"

    # database
    PATENT2RARE_TOKENS_PATH = "/kaggle/input/rare-tokens/db"

    COMPLETE_DB_PATH = [
        f"/kaggle/input/preprocess-complete/split/complete-db-{i}/db" for i in range(15)
    ]
    COMPLETE_INDEX_PATH = [
        f"/kaggle/input/preprocess-complete/split/complete-db-{i}/index.lz4" for i in range(15)
    ]

    COMPLETE_DB_V2_PATH = [
        f"/kaggle/input/preprocess-complete-v2/split/complete-db-{i}/db" for i in range(5)
    ]
    COMPLETE_INDEX_V2_PATH = [
        f"/kaggle/input/preprocess-complete-v2/split/complete-db-{i}/index.lz4" for i in range(5)
    ]

    # Sharded tokenized DB (local branch only, see the note at the top of this cell).
    TOKENIZED_SPLIT = 10
    TOKENIZED_DB_PATHES = [
        f"/kaggle/input/all-index-per-patent/split/tokenized-db-{i}/db"
        for i in range(TOKENIZED_SPLIT)
    ]
    TOKENIZED_INDEX_PATHES = [
        f"/kaggle/input/all-index-per-patent/split/tokenized-db-{i}/index.lz4"
        for i in range(TOKENIZED_SPLIT)
    ]

    # Number of shards of the ratio DB; drives both path lists below.
    CPC_TOKEN2RANGE_SPLIT = 20
    CPC_TOKEN2RANGE_DB_PATHES = [
        f"/kaggle/input/preprocess-cands-ratio/split/ratio-db-{i}/db"
        for i in range(CPC_TOKEN2RANGE_SPLIT)
    ]
    CPC_TOKEN_INDEX_PATHES = [
        f"/kaggle/input/preprocess-cands-ratio/split/ratio-db-{i}/index.txt"
        for i in range(CPC_TOKEN2RANGE_SPLIT)
    ]

    SINGLE_TOKEN_DB_PATH = "/kaggle/input/preprocess-all-token-single/db"
    SINGLE_TOKEN_INDEX_PATH = "/kaggle/input/preprocess-all-token-single/index.lz4"

    # Local runs are always treated as training runs.
    TRAIN_MODE = True
    VISUALIZE = False

In [5]:
# Pick the polars reader from the text after the last "." in TRAIN_PATH:
# "parquet" -> read_parquet, anything else -> read_csv.
train_ext = TRAIN_PATH.rsplit(".", 1)[-1]
train_reader = pl.read_parquet if train_ext == "parquet" else pl.read_csv
train = train_reader(TRAIN_PATH)

In [9]:
# Build the TokinezedDB handle (class from the project's `db` module) from the sharded
# DB and index path lists; the scan loop below queries it per patent id via `.get()`.
# NOTE(review): TOKENIZED_DB_PATHES / TOKENIZED_INDEX_PATHES are assigned only in the
# local (non-Kaggle) branch of the path configuration, so this line raises NameError
# when KAGGLE_ENV is True — confirm this notebook is meant to be local-only.
tokenized_db = TokinezedDB(TOKENIZED_DB_PATHES, TOKENIZED_INDEX_PATHES)

In [17]:
# Read the token-count table from JSON; the scan loop indexes it as
# token_counts[key][token].
with open("/kaggle/input/token-counts/token_counts.json", "r") as counts_file:
    token_counts = json.loads(counts_file.read())

In [35]:
# Exploratory scan: for each training row, print the tokens shared by many of the
# row's target patents whose count outside the row is small.
# A token is reported as "<query prefix>:<token> <inner_count> <outer_count>".
MIN_INNER_COUNT = 10  # token must occur in MORE than this many targets of the row
MAX_OUTER_COUNT = 500  # and have FEWER than this many remaining occurrences

for i, target_ids in tqdm(enumerate(train.iter_rows())):
    # Drop the first column of the row; the remaining values are looked up in tokenized_db.
    # NOTE(review): presumably column 0 is the query patent id — confirm against the parquet schema.
    target_ids = target_ids[1:]

    # (query prefix, token) -> set of targets in this row that contain the token.
    # Keying on a tuple instead of a "prefix:token" string avoids re-parsing the
    # string later, which raised ValueError for any token that itself contains ":".
    token2patents = {}
    for target in target_ids:
        tokens = tokenized_db.get(target)
        for key, this_tokens in tokens.items():
            for token in this_tokens:
                token2patents.setdefault((KEY2QUERY[key], token), set()).add(target)

    for (query, token), patents in token2patents.items():
        inner_count = len(patents)
        # token_counts is indexed by the DB key, so map the query prefix back first.
        # NOTE(review): all_count is presumably the corpus-wide count for the token — confirm.
        all_count = token_counts[QUERY2KEY[query]][token]
        outer_count = all_count - inner_count

        if outer_count < MAX_OUTER_COUNT and inner_count > MIN_INNER_COUNT:
            print(f"{query}:{token}", inner_count, outer_count)
    print("------------")

    # Stop after the row with index 51 has been processed (rows 0..51 are inspected).
    if i > 50:
        break

1it [00:00,  4.15it/s]

detd:transpyloric 21 184
detd:lcpufas 24 280
detd:arasco 16 145
detd:dhasco 18 252
detd:enfamil 12 214
detd:lcpufa 25 355
detd:chito 13 375
detd:aribino 15 84
detd:siallyl 13 100
detd:fuco 13 452
detd:gentio 14 466
detd:reconstituteable 11 49
detd:lactoflavin 11 349
detd:ovoflavin 11 81
detd:nicmn 11 73
detd:folacin 14 468
detd:pteroylglutamic 11 349
detd:deoxyadenosylcobalamin 12 232
detd:hydroxycobalamin 11 345
detd:picolonate 12 153
detd:nonabsorbed 12 355
------------
ab:wales 11 357
------------


3it [00:00,  6.23it/s]

------------


4it [00:01,  3.39it/s]

detd:doranz 12 300
detd:cocchi 14 323
detd:l1.2 17 420
detd:lestr 11 225
------------
------------


6it [00:01,  4.21it/s]

detd:tresiba 19 272
detd:pediatria 15 36
detd:determir 15 92
detd:gerich 12 222
------------


7it [00:01,  4.14it/s]

------------
------------


9it [00:01,  5.30it/s]

------------
------------


12it [00:02,  5.57it/s]

detd:phosphoramadite 16 353
------------
------------


14it [00:02,  6.12it/s]

------------
------------


15it [00:02,  5.32it/s]

ti:platelets 11 411
detd:benzenepentacarboxylic 11 307
detd:benzenehexacarboxylic 11 306
detd:cyclopentanediacetic 13 65
detd:amnoethanesulfonic 11 12
detd:emta 13 381
detd:endca 13 27
detd:cholamine 13 241
detd:triscarboxylic 13 29
detd:methyltricarballylic 13 34
detd:amminoethane 11 14
detd:aggregometry 23 438
detd:plateletpheresis 12 340
detd:polysucrose 13 475
detd:gpvi 13 391
detd:gpiiia 14 362
------------
------------


18it [00:03,  5.18it/s]

------------
------------


20it [00:03,  6.34it/s]

------------
------------


21it [00:03,  6.59it/s]

------------
detd:sentrin 13 334
detd:prka 11 482
detd:chromobox 11 368


22it [00:04,  2.67it/s]

------------


24it [00:05,  3.34it/s]

detd:expensify 15 99
detd:gusto 15 382
detd:generitech 15 90
detd:expensing 15 291
detd:operationalizing 15 397
detd:value_1 12 165
------------
------------


26it [00:05,  4.32it/s]

------------
------------


27it [00:05,  4.45it/s]

detd:bistrifluoromethylbenzidine 11 28
detd:bistrifluoromethoxylbenzidine 11 1
detd:bistrifluoromethyldiphenyl 11 4
detd:trifluoromethylphenyloxyl 11 1
detd:naphthaloyl 12 235
detd:biphenyldicarbonyl 15 144
detd:diaminodiphenic 13 12
detd:bisphenylfluorene 17 38
detd:pfmb 21 57
detd:ffda 11 38
detd:dadp 11 460
clm:ozo 20 28
clm:perfluorobiphenyl 11 135
detd:tfdb 15 259
------------
------------


30it [00:06,  4.87it/s]

clm:propenylene 14 185
clm:leukotrienes 31 289
detd:biosyntheses 32 485
detd:thioethoxide 16 294
detd:diphenylphosphide 16 309
detd:aharony 21 138
detd:forder 34 264
detd:heparinisation 34 58
detd:humes 23 488
detd:aked 22 244
detd:nsaia 34 61
detd:meck 23 180
ab:sulphinyl 31 103
ab:sulphonyl 31 348
clm:sulphinyl 24 422
detd:peroxysulphate 30 59
detd:peroxymonosulphate 30 188
detd:intrapouch 20 18
detd:sp62 32 266
detd:hydroxytetrahydropyran 14 357
detd:ylsulphonyl 11 433
ab:alkynylene 12 299
clm:propynylene 12 36
detd:propylsulphinyl 19 288
detd:butylsulphinyl 18 172
detd:aryldi 13 133
detd:pentaoxacyclopentadecane 13 160
ab:heterocyclene 16 21
detd:ynylene 12 146
detd:cyanopropoxy 16 303
detd:carbamoylmethoxy 15 478
detd:methylaminoethoxy 12 463
detd:methoxycarbonylethoxy 15 183
detd:alkylsulphide 12 70
detd:diarylphosphide 12 61
detd:2.sup.b 15 492
detd:3.sup.c 15 201
detd:4.sup.d 14 98
detd:5.sup.e 14 74
detd:6.sup.f 14 46
detd:7.sup.g 12 29
detd:8.sup.h 11 31
detd:methylallyloxy 1

32it [00:06,  5.74it/s]

------------
------------


33it [00:07,  5.68it/s]

------------


35it [00:07,  5.22it/s]

------------
------------


37it [00:07,  5.13it/s]

------------
------------


39it [00:08,  5.45it/s]

ti:urethanes 11 298
ab:urethanes 14 443
------------
ti:fibrinogen 15 281
------------


40it [00:08,  5.25it/s]

------------
------------


43it [00:08,  5.94it/s]

detd:macapp 14 123
detd:menubar 13 297
detd:autoscroll 11 76
------------
------------


44it [00:09,  5.57it/s]

detd:cloudlet 16 466
detd:cloudran 16 261
------------


46it [00:09,  4.97it/s]

detd:completable 15 226
ti:drones 11 262
ab:atc 11 279
------------
ab:palatability 21 372
clm:palatability 13 401
clm:farinaceous 13 369
------------


48it [00:09,  5.72it/s]

------------
------------


49it [00:10,  6.40it/s]

------------
------------


51it [00:10,  4.93it/s]

detd:orthotist 11 201
------------
------------



