In [1]:
import os
import sys

# Put the project source directory and its Penguin-ML-Library subdirectory on the import path.
PACKAGE_DIR = "/kaggle/src"
sys.path.extend([PACKAGE_DIR, os.path.join(PACKAGE_DIR, "Penguin-ML-Library")])

In [2]:
import whoosh_utils

Processing /kaggle/input/whoosh-wheel-2-7-4/Whoosh-2.7.4-py2.py3-none-any.whl
Whoosh is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


[0m

In [3]:
import gc
import json
import warnings

import numpy as np
import polars as pl
import yaml
from tqdm import tqdm

In [4]:
# Load the Whoosh index and open a searcher. The lookup cells further down call
# whoosh_utils.execute_query(query, qp, searcher), so `searcher` must exist for the
# notebook to survive Restart & Run All.
train_idx = whoosh_utils.load_index("/kaggle/input/rich-index")
searcher = whoosh_utils.get_searcher(train_idx)
# Query parser shared by every probe cell below.
qp = whoosh_utils.get_query_parser()
# Displayed as the cell output: number of documents in the loaded index.
train_idx.doc_count()

In [6]:
# Probe: two adjacent double-quoted terms with a ~2 suffix, all wrapped in single quotes.
# Recorded output: 1 token; parsed as (ti:hoge AND ti:fuga), with no proximity shown.
query = 'ti:\'"hoge""fuga"~2\''
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:fuga)


In [8]:
# Probe: a field-scoped quoted phrase followed by a ~4 slop.
# Recorded output: 2 tokens; the printed parse is ti:"hoge fuga" (the ~4 is not shown).
query = 'ti:"hoge fuga"~4'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

2
ti:"hoge fuga"


In [9]:
# Probe: ADJ2 between two field-prefixed terms.
# Recorded output: 3 tokens. Surprising parse: the second `ti` is absorbed into the phrase
# (ti:"hoge ti") and `fuga` is searched in ti/ab/clm/detd/cpc instead of ti only.
query = 'ti:hoge ADJ2 ti:fuga'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:"hoge ti" AND (ti:fuga OR ab:fuga OR clm:fuga OR detd:fuga OR cpc::fuga))


In [19]:
# Probe: ADJ3 between two terms without a field prefix.
# Recorded output: 3 tokens; parsed as the phrase "hoge fuga" OR-ed over ti/ab/clm/detd/cpc.
query = "hoge ADJ3 fuga"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:"hoge fuga" OR ab:"hoge fuga" OR clm:"hoge fuga" OR detd:"hoge fuga" OR cpc:"hoge fuga")


In [23]:
# Probe: hyphen inside a field-scoped term.
# Recorded output: 1 token; the hyphen splits the term into (ti:hoge AND ti:fuga).
query = 'ti:hoge-fuga'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:fuga)


In [40]:
# Probe: two double-quoted terms glued together after a single field prefix.
# Recorded output: 1 token; the `ti:` prefix only binds the first term, and `fuga` is
# searched in every default field.
query = 'ti:"hoge""fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND (ti:fuga OR ab:fuga OR clm:fuga OR detd:fuga OR cpc:fuga))


In [28]:
# Probe: `&` between two words with no surrounding whitespace.
# Recorded output: 1 token; parsed as (ti:hoge AND ti:fuga).
query = 'ti:hoge&fuga'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:fuga)


In [26]:
# Probe: `|` between two words inside a field group.
# Recorded output: 2 tokens. Note the parse is (ti:hoge AND ti:fuga), i.e. `|` does NOT act as OR.
query = 'ti:(hoge|fuga)'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

2
(ti:hoge AND ti:fuga)


In [34]:
# Probe: NOT joined to the next term with `&` instead of whitespace.
# Recorded output: 1 token; `NOT&` is kept as a literal cpc term (cpc:NOT& AND cpc:hoge),
# so no negation happens.
query = 'NOT&cpc:hoge'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(cpc:NOT& AND cpc:hoge)


In [39]:
# Probe: NOT joined to the next term with `@` instead of whitespace.
# Recorded output: 1 token; `NOT@` is kept as a literal cpc term (cpc:NOT@ AND cpc:hoge),
# so no negation happens.
query = 'NOT@cpc:hoge'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(cpc:NOT@ AND cpc:hoge)


In [26]:
# Probe: OR between two parenthesized field terms.
# Recorded output: 3 tokens; parsed as (ti:hoge OR ti:fuga).
query = '(ti:hoge) OR (ti:fuga)'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:hoge OR ti:fuga)


In [5]:
# Probe: two field-prefixed quoted terms with no whitespace between them.
# Recorded output: 1 token; parsed as (ti:hoge AND ti:fuga).
query = 'ti:"hoge"ti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:fuga)


In [6]:
# Probe: single-quoted words packed inside one double-quoted phrase.
# Recorded output: 1 token; the inner quotes vanish and the parse is ti:"hoge fuga foo".
query = 'ti:"\'hoge\'\'fuga\'\'foo\'"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
ti:"hoge fuga foo"


In [8]:
# Probe: three glued field-prefixed quoted terms, the middle one containing single-quoted words.
# Recorded output: 1 token; parsed as (ti:hoge AND ti:"hoge fuga foo" AND ti:fuga).
query = 'ti:"hoge"ti:"\'hoge\'\'fuga\'\'foo\'"ti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:"hoge fuga foo" AND ti:fuga)


In [55]:
# Probe: OR separated from its operands by newlines instead of spaces.
# Recorded output: 3 tokens; newlines behave as separators and the parse is (ti:hoge OR ti:fuga).
query = 'ti:"hoge"\nOR\nti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:hoge OR ti:fuga)


In [56]:
# Probe: OR separated from its operands by zero-width spaces (U+200B).
# Recorded output: 1 token; U+200B is not treated as whitespace, so `OR` ends up as a
# literal search term AND-ed between ti:hoge and ti:fuga.
query = 'ti:"hoge"\u200BOR\u200Bti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND (ti:or OR ab:or OR clm:or OR detd:or OR cpc:​OR​) AND ti:fuga)


In [57]:
# Probe: a standalone `|` between two field-prefixed terms.
# Recorded output: 3 tokens; `|` is not an OR operator here and becomes a literal term:
# (ti:hoge AND cpc:| AND ti:fuga).
query = 'ti:"hoge" | ti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:hoge AND cpc:| AND ti:fuga)


In [62]:
# Probe: `!` placed directly before a field-prefixed term.
# Recorded output: 1 token; `!` does not negate and becomes a literal term: (cpc:! AND ti:hoge).
query = '!ti:"hoge"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(cpc:! AND ti:hoge)


In [6]:
# Probe: NOT applied to a parenthesized OR group, no field prefixes.
# Recorded output: 5 tokens; parsed as <mobile in any field> AND NOT <mobile or phone in any field>.
query = "mobile NOT (mobile OR phone)"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

5
((ti:mobile OR ab:mobile OR clm:mobile OR detd:mobile OR cpc:mobile) AND NOT (ti:mobile OR ab:mobile OR clm:mobile OR detd:mobile OR cpc:mobile OR ti:phone OR ab:phone OR clm:phone OR detd:phone OR cpc:phone))


In [7]:
# Probe: NOT applied to a quoted string that contains the word OR.
# Recorded output: 5 tokens; inside quotes OR is part of the phrase ("mobile or phone"),
# lower-cased in ti/ab/clm/detd and kept upper-case in cpc.
query = 'mobile NOT "mobile OR phone"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

5
((ti:mobile OR ab:mobile OR clm:mobile OR detd:mobile OR cpc:mobile) AND NOT (ti:"mobile or phone" OR ab:"mobile or phone" OR clm:"mobile or phone" OR detd:"mobile or phone" OR cpc:"mobile OR phone"))


In [8]:
# Probe: unfielded quoted phrase with a ~2 slop.
# Recorded output: 2 tokens; parsed as the phrase "mobile phone" OR-ed over ti/ab/clm/detd/cpc.
query = '"mobile phone"~2'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

2
(ti:"mobile phone" OR ab:"mobile phone" OR clm:"mobile phone" OR detd:"mobile phone" OR cpc:"mobile phone")


In [11]:
# Probe: ADJ2 between two unfielded terms.
# Recorded output: 3 tokens; parsed as the phrase "mobile phone" OR-ed over ti/ab/clm/detd/cpc.
query = "mobile ADJ2 phone"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

3
(ti:"mobile phone" OR ab:"mobile phone" OR clm:"mobile phone" OR detd:"mobile phone" OR cpc:"mobile phone")


In [10]:
# Probe: a fielded term glued directly to a quoted phrase with a ~2 slop.
# Recorded output: 2 tokens; `ti:` binds only `hoge`, and the phrase is searched in every default field.
query = 'ti:hoge"mobile phone"~2'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

2
(ti:hoge AND (ti:"mobile phone" OR ab:"mobile phone" OR clm:"mobile phone" OR detd:"mobile phone" OR cpc:"mobile phone"))


In [11]:
# Probe: a fielded term glued directly to a single quoted term.
# Recorded output: 1 token; `ti:` binds only `hoge`, and `mobile` is searched in every default field.
query = 'ti:hoge"mobile"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND (ti:mobile OR ab:mobile OR clm:mobile OR detd:mobile OR cpc:mobile))


In [14]:
# Probe: three field-prefixed quoted terms with no whitespace between them.
# Recorded output: 1 token; parsed as (ti:hoge AND ti:mobile AND ti:fuga).
query = 'ti:"hoge"ti:"mobile"ti:"fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
(ti:hoge AND ti:mobile AND ti:fuga)


In [17]:
# Probe: three quoted terms glued together without any field prefix.
# Recorded output: 1 token; each term expands over ti/ab/clm/detd/cpc and the three groups are AND-ed.
query = '"hoge""mobile""fuga"'
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

1
((ti:hoge OR ab:hoge OR clm:hoge OR detd:hoge OR cpc:hoge) AND (ti:mobile OR ab:mobile OR clm:mobile OR detd:mobile OR cpc:mobile) AND (ti:fuga OR ab:fuga OR clm:fuga OR detd:fuga OR cpc:fuga))


In [10]:
# Probe: OR of two parenthesized ADJ2 expressions, each starting with a `ti:` prefix.
# Recorded output: 7 tokens; parsed as (ti:"mobile phone" OR ti:"mobile tv").
query = "(ti:mobile ADJ2 phone) OR (ti:mobile ADJ2 tv)"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

7
(ti:"mobile phone" OR ti:"mobile tv")


In [13]:
# Probe: ADJ2 followed by a parenthesized OR group, all inside a `ti:` field group.
# Recorded output: 6 tokens. ADJ2 is NOT recognized as an operator in this position; it
# becomes the literal term ti:adj2: (ti:mobile AND ti:adj2 AND (ti:phone OR ti:tv)).
query = "ti:(mobile ADJ2 (phone OR tv))"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)

6
(ti:mobile AND ti:adj2 AND (ti:phone OR ti:tv))


In [6]:
# Probe: an upper-case free-text term, parsed and then executed against the index.
# Recorded output: 1 token; the term is lower-cased for ti/ab/clm/detd but keeps its case
# for cpc and id. Needs `searcher` in scope.
query = "USA"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)
# Keep only the first five hits for display.
all_hits = whoosh_utils.execute_query(query, qp, searcher)
results = all_hits[:5]
results

1
(ti:usa OR ab:usa OR clm:usa OR detd:usa OR cpc:USA OR id:USA)


[<Hit {'ab': 'The present invention provides polynucleotides encoding human and murine guanylate binding protein polypeptides, fragments and homologues thereof. Also provided are vectors, host cells, antibodies, and recombinant and synthetic methods for producing said polypeptides. The present invention further relates to diagnostic and therapeutic methods for applying the guanylate binding protein polypeptides to the diagnosis, treatment, and/or prevention of various diseases and/or disorders related to these polypeptides, such as rheumatoid arthritis and/or conditions related to aberrant NF-κB activity, guanylate binding activity and GTPase activity. The present invention further relates to screening methods for identifying agonists and antagonists of the polynucleotides and polypeptides of the present invention.', 'clm': '1. An isolated nucleic acid molecule comprising a polynucleotide sequence selected from the group consisting of:\n (a) an isolated polynucleotide encoding a polype

In [7]:
# Probe: an inflected word ("visited"), parsed and then executed against the index.
# Recorded output: 1 token; the printed parse keeps the word as-is in every field
# (ti/ab/clm/detd/cpc/id). Needs `searcher` in scope.
query = "visited"
token_count = whoosh_utils.count_query_tokens(query=query)
print(token_count)
parsed_query = qp.parse(query)
print(parsed_query)
# Keep only the first five hits for display.
all_hits = whoosh_utils.execute_query(query, qp, searcher)
results = all_hits[:5]
results

1
(ti:visited OR ab:visited OR clm:visited OR detd:visited OR cpc:visited OR id:visited)


[<Hit {'ab': 'A wireless router is provided which is configured to support a first subnet and a second subnet. The wireless router comprises a plurality of virtual wireless switches. Each virtual wireless switch comprises a plurality of access ports. The first subnet comprises a group of the access ports belonging to the first virtual wireless switch, and the second subnet comprises a second group of the access ports belonging to the second virtual wireless switch. The wireless router is configured to support layer 3 mobility when a client, having a client IP address from within the first subnet, roams from the first subnet to the second subnet, from the first to the second virtual wireless switch. The wireless router stores registration information associated with the client to allow a client to roam between the first subnet and the second subnet while keeping the client IP address. The wireless router uses the registration information to send packets to the client when the client has

: 

In [10]:
# Fraction of the values found in test.csv that can be retrieved from the index through
# an `id:` lookup. Recorded run: about 2% (0.0196).
test = pl.read_csv("/kaggle/input/uspto-boolean-search-optimization/test.csv")
# Flatten every cell of the frame into one set. The comprehension builds the same set as
# set(sum(rows, [])) without the quadratic list re-copying that sum() on lists performs.
test_patents = {patent for row in test.to_numpy().tolist() for patent in row}

exist = []
for patent in test_patents:
    query = f"id:{patent}"
    results = whoosh_utils.execute_query(query, qp, searcher)
    # True when the lookup returned at least one hit.
    exist.append(len(results) > 0)
# Cell output: share of looked-up values that produced a hit.
sum(exist) / len(exist)

0.0196078431372549

In [11]:
# Sanity check: look up every id listed in train_index_patent_ids.json through an `id:`
# query. Recorded run: 200000 ids, all found (ratio 1.0).
# `json` is imported in the notebook's import cell.
with open("/kaggle/input/uspto-boolean-search-optimization/train_index_patent_ids.json", "r") as f:
    train_index_patent_ids = set(json.load(f))

exist = []
for patent in tqdm(train_index_patent_ids):
    query = f"id:{patent}"
    results = whoosh_utils.execute_query(query, qp, searcher)
    # True when the lookup returned at least one hit.
    exist.append(len(results) > 0)
# Cell output: share of ids that produced a hit.
sum(exist) / len(exist)

100%|██████████| 200000/200000 [00:40<00:00, 4973.34it/s]


1.0

In [12]:
# Estimate how many publication numbers listed in patent2parquet.json are present in the
# index, using every 100th key as the sample. Recorded run: 133077 lookups, about 1.5% found.
with open("/kaggle/input/patent2parquet/patent2parquet.json") as f:
    patent2parquet = json.load(f)

# From here on `patent2parquet` is a one-column frame holding the JSON object's keys.
patent2parquet = pl.DataFrame({"publication_number": list(patent2parquet)})

sampled_numbers = patent2parquet["publication_number"][::100]
exist = []
for patent in tqdm(sampled_numbers):
    query = f"id:{patent}"
    results = whoosh_utils.execute_query(query, qp, searcher)
    hit_count = len(results)
    exist.append(hit_count > 0)
# Cell output: share of sampled publication numbers that produced a hit.
sum(exist) / len(exist)

100%|██████████| 133077/133077 [00:16<00:00, 7860.05it/s]


0.015336985354343725