In [1]:
import json
import multiprocessing
import os
import re
import tempfile
from collections import Counter
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import whoosh
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from whoosh.analysis import StandardAnalyzer

In [2]:
df = pl.read_parquet("/kaggle/input/uspto-boolean-search-optimization/patent_metadata.parquet")
df.head(1)

publication_number,publication_date,filing_date,family_id,cpc_codes
str,datetime[μs],datetime[μs],f64,list[str]
"""US-1-A""",1836-07-13 00:00:00,,2060279.0,"[""B61C11/04""]"


In [3]:
counter = Counter()

for cpc_codes in tqdm(df["cpc_codes"]):
    counter.update(cpc_codes.to_list())

  0%|          | 0/13307751 [00:00<?, ?it/s]

100%|██████████| 13307751/13307751 [00:34<00:00, 388694.30it/s]


In [4]:
counter.most_common(10)

[('Y02E60/10', 90498),
 ('A61P35/00', 78482),
 ('A61P43/00', 64549),
 ('H01L2924/0002', 53016),
 ('Y02P70/50', 46434),
 ('Y02T10/12', 43992),
 ('G06N20/00', 43042),
 ('A61K45/06', 41954),
 ('A61P29/00', 41285),
 ('Y02T10/70', 36955)]

In [5]:
len(counter)

265889

In [6]:
cpc2count = {cpc: count for cpc, count in counter.items()}
with open("cpc2count.json", "w") as f:
    json.dump(cpc2count, f)

In [7]:
patent_id2cpc = {
    pat_id: cpc_codes.to_list()
    for pat_id, cpc_codes in zip(df["publication_number"], df["cpc_codes"])
}

In [8]:
with open("patent2cpc.json", "w") as f:
    json.dump(patent_id2cpc, f)

In [9]:
patent_id2cpc

{'US-1-A': ['B61C11/04'],
 'US-1-P': ['A01H5/02', 'A01H6/749'],
 'US-10-A': ['B27J1/00'],
 'US-1000-A': ['B60G11/04', 'Y10S507/905'],
 'US-10000-A': ['F04D29/283'],
 'US-10000-P': ['A01H5/02', 'A01H6/42'],
 'US-100000-A': ['B68B1/04', 'B68C2005/005'],
 'US-1000000-A': ['B60C2011/0313',
  'B60C7/04',
  'B60C7/101',
  'G06F1/00',
  'G07D1/00',
  'H01L2224/05124',
  'Y10T152/10423'],
 'US-10000001-B2': ['B29C2945/76083',
  'B29C2945/76227',
  'B29C2945/76431',
  'B29C2945/76505',
  'B29C2945/76866',
  'B29C45/1751',
  'B29C45/64',
  'B29C45/66',
  'B29C45/661',
  'B29C45/76',
  'B29C45/7653',
  'B29C45/80',
  'G05B19/182',
  'G05B19/402',
  'G05B2219/45244'],
 'US-10000002-B2': ['B29C47/0004',
  'B29C47/0019',
  'B29C47/0021',
  'B29C47/065',
  'B29C47/707',
  'B29C48/022',
  'B29C48/07',
  'B29C48/08',
  'B29C48/21',
  'B29C48/71',
  'B29D2030/0682',
  'B29D30/0681',
  'B29K2021/00',
  'B29K2071/00',
  'B29K2077/00',
  'B29L2030/008',
  'B32B1/00',
  'B32B2250/00',
  'B32B2250/24',
  'B3