# Preparing CPC Codes for embedding

In [1]:
import pandas as pd
import re
import xml.etree.ElementTree as ET


In [2]:
# Load the smiles / patent_id / cpc table and discard rows with missing values.
df = pd.read_csv('../QSURv3/Curation/PatentCuration/smiles_patent_cpc.csv').dropna()

In [3]:
# Show the column labels of the loaded table.
df.columns

Index(['smiles', 'patent_id', 'cpc'], dtype='object')

In [4]:
# Distinct raw CPC strings found in the `cpc` column.
CPCcodes = df['cpc'].unique()
CPCcodes

array(['AA61A61P/A61P3/04', 'NoneNoneNone/None', 'AA61A61P/A61P29/00',
       ..., 'BB01B01J/B01J29/084', 'CC12C12N/C12N5/0621',
       'BB09B09C/B09C1/002'], dtype=object)

Extract the four-character CPC code (e.g. `A61P`) from each full CPC string, because codes at this level have the same specificity as the use cases.

In [23]:

def ExtractCPC(cpc_code):
    """Return the four characters that follow the first '/' of a CPC string.

    For example ``'AA61A61P/A61P3/04'`` yields ``'A61P'``.

    Parameters
    ----------
    cpc_code : str
        Raw CPC string made of '/'-separated segments.

    Returns
    -------
    str or None
        The first four characters of the second segment, or ``None`` when
        the input is not a string, contains the placeholder text "None",
        or does not have a second segment of at least four characters that
        is followed by another '/'.
    """
    # Missing values (None, NaN, ...) are not strings; report "no code" for
    # them instead of raising TypeError on the substring test below.
    if not isinstance(cpc_code, str):
        return None

    # Placeholder entries such as 'NoneNoneNone/None' carry no usable code.
    if "None" in cpc_code:
        return None

    # Pattern explanation:
    # ^[^/]+      everything from the start up to the first '/'
    # /([^/]{4})  capture the first four characters after that '/'
    # [^/]*/      the rest of that segment, which must end with a second '/'
    pattern = r'^[^/]+/([^/]{4})[^/]*/'

    match = re.search(pattern, cpc_code)
    return match.group(1) if match else None



In [24]:
# Reduce every raw CPC string to its four-character code (None when it has none).
FilteredCPCs = list(map(ExtractCPC, CPCcodes))

# Match each CPC code to its description for embedding

In [25]:
# Function for looking up a CPC code's description
def SearchCPC(filepath, code):
    """Build a description string for ``code`` from a CPC scheme XML file.

    The result is the title of the first item in the file (up to five title
    parts, each followed by ``'/ '``) concatenated with the first title part
    of the ``classification-item`` whose first child's text equals ``code``.

    Parameters
    ----------
    filepath : str or path-like
        XML file to parse with ``xml.etree.ElementTree``.
    code : str or None
        Symbol to look for.

    Returns
    -------
    str
        ``"No code provided"`` when ``code`` is None;
        ``"No match found for code <code>"`` when no item yields a title;
        otherwise the top-level title followed by the matched title (or by
        ``"Text not found"`` when a matching item has no title element).

    Notes
    -----
    Errors raised by ``ET.parse`` (for example a missing file) propagate to
    the caller.
    """
    if code is None:
        return "No code provided"

    root = ET.parse(filepath).getroot()

    # Top-level title: at most five title parts of the first item in the file.
    toptxt = ''
    for i in range(5):
        try:
            part = root[0][1][i][0].text
        except IndexError:
            break
        if part is None:
            # Element has no leading text; skip it rather than fail with
            # TypeError when concatenating None.
            continue
        toptxt += part + '/ '

    lowtxt = ""
    for item in root.iter('classification-item'):
        # A childless item has no symbol to compare; skip it instead of
        # letting the IndexError abort the whole search.
        if len(item) == 0 or item[0].text != code:
            continue
        try:
            lowtxt = item[1][0][0].text
            break  # Exit loop once found
        except IndexError:
            # Matching symbol without a title: remember that, keep scanning.
            lowtxt = "Text not found"

    if not lowtxt:  # No match, or the matched title had no text
        return f"No match found for code {code}"

    return toptxt + lowtxt


In [26]:
# Gather the description of every extracted code.
# FilteredCPCs repeats the same code many times, so each scheme file is parsed
# once and its description reused; the printed log and both lists are the same
# as when every occurrence was parsed afresh.
CPCs = []
Descriptions = []
_description_cache = {}
for i in FilteredCPCs:
    if i is None:
        print("Skipping None value")
        continue
    try:
        if i not in _description_cache:
            file_path = f'CPCSchemas/PatentData/cpc-scheme-{i}.xml'  # Simplified path
            # Only successful lookups are stored, so a failing code is still
            # retried and reported on each occurrence.
            _description_cache[i] = SearchCPC(file_path, i)
        d = _description_cache[i]
        print(f"Code {i}: {d}")
        CPCs.append(i)
        Descriptions.append(d)
    except FileNotFoundError:
        print(f"File not found for code {i}")
    except Exception as e:
        print(f"Error processing code {i}: {e}")

Code A61P: SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS/ SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
Skipping None value
Code A61P: SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS/ SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
Code C09K: MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE/ MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE
Code C07C: ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR CARBOCYCLIC COMPOUNDS
Code C07C: ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR CARBOCYCLIC COMPOUNDS
Code C07D: HETEROCYCLIC COMPOUNDS / HETEROCYCLIC COMPOUNDS 
Code C07F: ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS CONTAINING ELEMENTS OTHER THAN CARBON, HYDROGEN, HALOGEN, OXYGEN, NITROGEN, SULFUR, SELENIUM OR TELLURIUM / ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS CONTAINING ELEMENTS OTHER THAN CARBON, HYDROGEN, HALOGEN, OXYGEN,

In [27]:
# One row per processed code, pairing it with the description found for it.
CPCmap = pd.DataFrame({'CPCcode': CPCs, 'CPCDescription': Descriptions})
CPCmap

Unnamed: 0,CPCcode,CPCDescription
0,A61P,SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMP...
1,A61P,SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMP...
2,C09K,"MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT ..."
3,C07C,ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR C...
4,C07C,ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR C...
...,...,...
8609,H01B,CABLES/ CONDUCTORS/ INSULATORS/ SELECTION OF M...
8610,G02F,OPTICAL DEVICES OR ARRANGEMENTS FOR THE CONTRO...
8611,B01J,"CHEMICAL OR PHYSICAL PROCESSES, e.g. CATALYSIS..."
8612,C12N,MICROORGANISMS OR ENZYMES/ COMPOSITIONS THEREO...


In [28]:
# Keep only the first occurrence of each (code, description) row.
# NOTE(review): this rebinds `df`, which earlier held the table read from CSV.
df = CPCmap[~CPCmap.duplicated()]
df

Unnamed: 0,CPCcode,CPCDescription
0,A61P,SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMP...
2,C09K,"MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT ..."
3,C07C,ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR C...
5,C07D,HETEROCYCLIC COMPOUNDS / HETEROCYCLIC COMPOUNDS
6,C07F,"ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS..."
...,...,...
8270,A22B,SLAUGHTERING/ SLAUGHTERING
8290,G06Q,INFORMATION AND COMMUNICATION TECHNOLOGY [ICT]...
8335,G04F,TIME-INTERVAL MEASURING / TIME-INTERVAL MEASUR...
8554,B60S,"SERVICING, CLEANING, REPAIRING, SUPPORTING, LI..."


In [30]:
# Write the de-duplicated code/description table to disk without the index column.
df.to_csv(path_or_buf='CPCDescriptions.csv', index=False)

# test to make sure the extracted codes are searchable 


In [14]:
# NOTE(review): extract_middle_section4 / extract_middle_section5 are not defined
# in the visible part of this notebook — confirm they exist before this cell runs.
four = list(map(extract_middle_section4, CPCcodes))
five = list(map(extract_middle_section5, CPCcodes))

In [15]:
# Display the values produced by extract_middle_section5 for every CPC string.
five

['A61P3/00',
 None,
 'A61P2/00',
 'C09K1/00',
 'C07C4/00',
 'C07C6/00',
 'C07D4/00',
 'C07F1/00',
 'A01N4/00',
 'B82B3/00',
 'A61K3/00',
 'A61K3/00',
 'A61P2/00',
 'A61K3/00',
 'C07C3/00',
 'A61K3/00',
 'A61K9/00',
 'A61P9/00',
 'A61K3/00',
 'A61P2/00',
 'A61K9/00',
 'B01J3/00',
 'C07D3/00',
 'A01N3/00',
 'C07D2/00',
 'A61P3/00',
 'C07H1/00',
 'C07C5/00',
 'C07F5/00',
 'A61K3/00',
 'A61P1/00',
 'A61K3/00',
 'A61K3/00',
 'A61K9/00',
 'A61K3/00',
 'C07K1/00',
 'C08F8/00',
 'C07C3/00',
 'C07C2/00',
 'C07B5/00',
 'A01N4/00',
 'A61K3/00',
 'A61K3/00',
 'A61K4/00',
 'A61K3/00',
 'C07D4/00',
 'C07C6/00',
 'C08G7/00',
 'H10K8/00',
 'H10K8/00',
 'H01M1/00',
 'A61K9/00',
 'A61K3/00',
 'A61K3/00',
 'A61K3/00',
 'C07C2/00',
 'A61K8/00',
 'A61P1/00',
 'C09K1/00',
 'B01J2/00',
 'A61P1/00',
 'A61K4/00',
 'B41C1/00',
 'A61K3/00',
 'C07D2/00',
 'A61K3/00',
 'A61P2/00',
 'C07C2/00',
 'A61K3/00',
 'A61K3/00',
 'C07D2/00',
 'A61K4/00',
 'A61K3/00',
 'C07F9/00',
 'C08K5/00',
 'G06F3/00',
 'A61K8/00',
 'A61

In [9]:
import xml.etree.ElementTree as ET

def SearchCPC(filepath, code):
    """Build a description string for ``code`` from a CPC scheme XML file.

    The result is the title of the first item in the file (up to five title
    parts, each followed by ``'/ '``) concatenated with the first title part
    of the ``classification-item`` whose first child's text equals ``code``.

    Parameters
    ----------
    filepath : str or path-like
        XML file to parse with ``xml.etree.ElementTree``.
    code : str or None
        Symbol to look for.

    Returns
    -------
    str
        ``"No code provided"`` when ``code`` is None;
        ``"No match found for code <code>"`` when no item yields a title;
        otherwise the top-level title followed by the matched title (or by
        ``"Text not found"`` when a matching item has no title element).

    Notes
    -----
    Errors raised by ``ET.parse`` (for example a missing file) propagate to
    the caller.
    """
    if code is None:
        return "No code provided"

    root = ET.parse(filepath).getroot()

    # Top-level title: at most five title parts of the first item in the file.
    toptxt = ''
    for i in range(5):
        try:
            part = root[0][1][i][0].text
        except IndexError:
            break
        if part is None:
            # Element has no leading text; skip it rather than fail with
            # TypeError when concatenating None.
            continue
        toptxt += part + '/ '

    lowtxt = ""
    for item in root.iter('classification-item'):
        # A childless item has no symbol to compare; skip it instead of
        # letting the IndexError abort the whole search.
        if len(item) == 0 or item[0].text != code:
            continue
        try:
            lowtxt = item[1][0][0].text
            break  # Exit loop once found
        except IndexError:
            # Matching symbol without a title: remember that, keep scanning.
            lowtxt = "Text not found"

    if not lowtxt:  # No match, or the matched title had no text
        return f"No match found for code {code}"

    return toptxt + lowtxt

# For testing the codes: look each extracted code up and print what was found.
# NOTE(review): the scheme directory below is an absolute, machine-specific path.
for i in four:
    if i is None:
        print("Skipping None value")
        continue
    try:
        file_path = f'/home/matt/Proj/QSURFinal/PatentData/cpc-scheme-{i}.xml'  # Simplified path
        d = SearchCPC(file_path, i)
        print(f"Code {i}: {d}")
    except FileNotFoundError:
        print(f"File not found for code {i}")
    except Exception as e:
        print(f"Error processing code {i}: {str(e)}")

Code A61P: SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS/ SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
Skipping None value
Code A61P: SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS/ SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
Code C09K: MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE/ MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT PROVIDED FOR ELSEWHERE
Code C07C: ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR CARBOCYCLIC COMPOUNDS
Code C07C: ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR CARBOCYCLIC COMPOUNDS
Code C07D: HETEROCYCLIC COMPOUNDS / HETEROCYCLIC COMPOUNDS 
Code C07F: ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS CONTAINING ELEMENTS OTHER THAN CARBON, HYDROGEN, HALOGEN, OXYGEN, NITROGEN, SULFUR, SELENIUM OR TELLURIUM / ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS CONTAINING ELEMENTS OTHER THAN CARBON, HYDROGEN, HALOGEN, OXYGEN,