In [1]:
import gc
import warnings

import numpy as np
import polars as pl
import yaml
from tqdm import tqdm

warnings.filterwarnings("ignore")

In [7]:
meta = pl.read_parquet("/kaggle/input/uspto-boolean-search-optimization/patent_metadata.parquet")
meta.head()

publication_number,publication_date,filing_date,family_id,cpc_codes
str,datetime[μs],datetime[μs],f64,list[str]
"""US-1-A""",1836-07-13 00:00:00,,2060279.0,"[""B61C11/04""]"
"""US-1-P""",1931-08-18 00:00:00,1930-08-06 00:00:00,49893162.0,"[""A01H5/02"", ""A01H6/749""]"
"""US-10-A""",1836-08-10 00:00:00,,2060288.0,"[""B27J1/00""]"
"""US-1000-A""",1838-11-03 00:00:00,,2061286.0,"[""B60G11/04"", ""Y10S507/905""]"
"""US-10000-A""",1853-09-06 00:00:00,,2070320.0,"[""F04D29/283""]"


In [36]:
meta.sample(100)

publication_number,publication_date,filing_date,family_id,cpc_codes
str,datetime[μs],datetime[μs],f64,list[str]
"""US-7912907-B1""",2011-03-22 00:00:00,2005-10-07 00:00:00,4.3741868e7,"[""H04L51/212""]"
"""US-D359868-S""",1995-07-04 00:00:00,1994-01-31 00:00:00,6.5241378e7,[]
"""US-4159179-A""",1979-06-26 00:00:00,1977-06-02 00:00:00,2.5184184e7,"[""G01N21/251""]"
"""US-1167537-A""",1916-01-11 00:00:00,1915-02-18 00:00:00,3.235556e6,"[""D05B69/22""]"
"""US-9383549-B2""",2016-07-05 00:00:00,2015-03-06 00:00:00,5.4122738e7,"[""G01S7/4813"", ""G02B13/0015"", … ""H04N5/2254""]"
"""US-1644437-A""",1927-10-04 00:00:00,1925-08-18 00:00:00,2.1968876e7,"[""B23Q1/545"", ""B25B1/22""]"
"""US-5775054-A""",1998-07-07 00:00:00,1997-02-19 00:00:00,1.1341116e7,"[""B65B19/10"", ""B65G2201/0226""]"
"""US-10769213-B2…",2020-09-08 00:00:00,2016-10-24 00:00:00,6.196956e7,"[""G06F16/334"", ""G06F16/353"", ""G06F16/93""]"
"""US-2017156145-…",2017-06-01 00:00:00,2017-02-14 00:00:00,4.0512424e7,"[""H04L1/1812"", ""H04L47/36"", … ""H04W80/02""]"
"""US-3440095-A""",1969-04-22 00:00:00,1966-09-01 00:00:00,2.4304897e7,"[""C23F11/04"", ""C23G1/065""]"


In [19]:
usd_meta = meta.filter(pl.col("publication_number").str.starts_with("US-D"))
non_usd_meta = meta.filter(~pl.col("publication_number").str.starts_with("US-D"))
usd_meta.shape, non_usd_meta.shape

((681926, 5), (12625825, 5))

In [35]:
non_usd_meta.filter(pl.col("cpc_codes").list.lengths() == 0).filter(
    pl.col("publication_date") > pl.date(2000, 1, 1)
).sort("publication_date").sample(20)

publication_number,publication_date,filing_date,family_id,cpc_codes
str,datetime[μs],datetime[μs],f64,list[str]
"""US-PP27400-P3""",2016-11-22 00:00:00,2014-10-08 00:00:00,55656473.0,[]
"""US-2021410347-…",2021-12-30 00:00:00,2021-06-07 00:00:00,79023911.0,[]
"""US-PP34436-P3""",2022-07-26 00:00:00,2020-12-23 00:00:00,82021966.0,[]
"""US-2018352701-…",2018-12-06 00:00:00,2017-05-30 00:00:00,64315362.0,[]
"""US-PP27760-P2""",2017-03-07 00:00:00,2015-10-01 00:00:00,58161674.0,[]
"""US-RE42580-E""",2011-08-02 00:00:00,2009-02-11 00:00:00,44342907.0,[]
"""US-PP29464-P2""",2018-07-03 00:00:00,2016-07-25 00:00:00,62684444.0,[]
"""US-PP33990-P2""",2022-03-01 00:00:00,2021-05-24 00:00:00,80442370.0,[]
"""US-PP29812-P3""",2018-11-06 00:00:00,2017-02-09 00:00:00,63037042.0,[]
"""US-2018077840-…",2018-03-15 00:00:00,2016-09-15 00:00:00,61558550.0,[]


In [20]:
usd_meta

publication_number,publication_date,filing_date,family_id,cpc_codes
str,datetime[μs],datetime[μs],f64,list[str]
"""US-D107228-S""",1937-11-30 00:00:00,1937-09-22 00:00:00,5.7100844e7,"[""A46B9/04""]"
"""US-D114511-S""",1939-04-25 00:00:00,1939-01-19 00:00:00,5.1690155e7,"[""A43B13/125"", ""A43B13/145""]"
"""US-D120187-S""",1940-04-23 00:00:00,1939-11-25 00:00:00,5.5699264e7,"[""B64C3/16"", ""B64C39/04"", ""B64C39/066""]"
"""US-D123595-S""",1940-11-19 00:00:00,1939-09-21 00:00:00,5.5699266e7,"[""B64C3/10""]"
"""US-D126473-S""",1941-04-08 00:00:00,1940-12-26 00:00:00,3.4618917e7,[]
"""US-D133482-S""",1942-08-18 00:00:00,1942-06-03 00:00:00,4.2244745e7,"[""E04H17/004"", ""E04H17/1439""]"
"""US-D138028-S""",1944-06-06 00:00:00,1944-03-20 00:00:00,4.7739649e7,[]
"""US-D138697-S""",1944-09-05 00:00:00,1943-10-26 00:00:00,5.8212853e7,[]
"""US-D148402-S""",1948-01-20 00:00:00,1946-04-26 00:00:00,8.2703234e7,[]
"""US-D155773-S""",1949-11-01 00:00:00,1948-09-30 00:00:00,5.7406004e7,"[""A45D2200/10"", ""A45D34/00"", … ""A45D40/28""]"
