In [1]:
import pandas as pd
import re

# Function to extract first 4 characters between first and second slash
def extract_cpc_code(full_code):
    pattern = r'^[^/]+/([^/]{4})'
    match = re.search(pattern, full_code)
    if match:
        return match.group(1)
    return None

# Read the files
mol_data = pd.read_csv('../Curation/PatentCuration/smiles_patent_cpc.csv', header=None, names=['SMILES', 'Patent', 'CPC_Code'])
func_mappings = pd.read_csv('functional_cpc_mappings.csv')

# Extract the 4-character CPC codes from mol_data
mol_data['CPC_4char'] = mol_data['CPC_Code'].apply(extract_cpc_code)

# Create a dictionary for quick lookup from func_mappings
cpc_to_function = dict(zip(func_mappings['Most_Similar_CPC'], func_mappings['Functional_Use']))

# Map the functions to the molecular data
mol_data['Mapped_Function'] = mol_data['CPC_4char'].map(cpc_to_function)

# Display first few rows to verify
print(mol_data.head())

# Save the result
mol_data.to_csv('molecular_data_with_functions.csv', index=False)

# Print some statistics
print("\nNumber of unique CPC codes:", mol_data['CPC_4char'].nunique())
print("Number of mapped functions:", mol_data['Mapped_Function'].nunique())
print("Number of unmapped CPCs:", mol_data['CPC_4char'].nunique() - mol_data['Mapped_Function'].nunique())

                                              SMILES      Patent  \
0                                             smiles   patent_id   
1  O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...  US10100047   
2  Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...  US10100047   
3  COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...  US10100052   
4  CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...  US10100052   

            CPC_Code CPC_4char Mapped_Function  
0                cpc      None             NaN  
1  AA61A61P/A61P3/04      A61P             NaN  
2  AA61A61P/A61P3/04      A61P             NaN  
3  NoneNoneNone/None      None             NaN  
4  NoneNoneNone/None      None             NaN  

Number of unique CPC codes: 254
Number of mapped functions: 28
Number of unmapped CPCs: 226


In [1]:
import pandas as pd
import re

# Function to extract first 4 characters between first and second slash
def extract_cpc_code(full_code):
    pattern = r'^[^/]+/([^/]{4})'
    match = re.search(pattern, full_code)
    if match:
        return match.group(1)
    return None

# Read the files
mol_data = pd.read_csv('../Curation/PatentCuration/smiles_patent_cpc.csv', header=None, names=['SMILES', 'Patent', 'CPC_Code'])
func_mappings = pd.read_csv('functional_cpc_mappings.csv')

# Extract the 4-character CPC codes from mol_data
mol_data['CPC_4char'] = mol_data['CPC_Code'].apply(extract_cpc_code)

# Create a dictionary for quick lookup from func_mappings
# Now mapping CPC to Functional_Use instead of the other way around
cpc_to_function = dict(zip(func_mappings['Most_Similar_CPC'], func_mappings['Functional_Use']))

# Map the functional uses to the molecular data
mol_data['Functional_Use'] = mol_data['CPC_4char'].map(cpc_to_function)

# Display first few rows to verify
print("\nFirst few rows of mapped data:")
print(mol_data.head())

# Print mapping statistics
print("\nMapping Statistics:")
print(f"Total molecules: {len(mol_data)}")
print(f"Unique CPC codes: {mol_data['CPC_4char'].nunique()}")
print(f"Molecules with mapped functions: {mol_data['Functional_Use'].notna().sum()}")
print(f"Molecules without mapped functions: {mol_data['Functional_Use'].isna().sum()}")

# Save the result
mol_data.to_csv('molecular_data_with_functions.csv', index=False)


First few rows of mapped data:
                                              SMILES      Patent  \
0                                             smiles   patent_id   
1  O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...  US10100047   
2  Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...  US10100047   
3  COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...  US10100052   
4  CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...  US10100052   

            CPC_Code CPC_4char Functional_Use  
0                cpc      None            NaN  
1  AA61A61P/A61P3/04      A61P        Biocide  
2  AA61A61P/A61P3/04      A61P        Biocide  
3  NoneNoneNone/None      None            NaN  
4  NoneNoneNone/None      None            NaN  

Mapping Statistics:
Total molecules: 2879821
Unique CPC codes: 254
Molecules with mapped functions: 2859974
Molecules without mapped functions: 19847


In [2]:
mol_data.head()

Unnamed: 0,SMILES,Patent,CPC_Code,CPC_4char,Functional_Use
0,smiles,patent_id,cpc,,
1,O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...,US10100047,AA61A61P/A61P3/04,A61P,Biocide
2,Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...,US10100047,AA61A61P/A61P3/04,A61P,Biocide
3,COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...,US10100052,NoneNoneNone/None,,
4,CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...,US10100052,NoneNoneNone/None,,


In [3]:
valid =mol_data.dropna()

In [4]:
len(mol_data.SMILES.unique())

1676420

In [5]:
len(valid.SMILES.unique())

1675372

In [6]:
valid

Unnamed: 0,SMILES,Patent,CPC_Code,CPC_4char,Functional_Use
1,O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...,US10100047,AA61A61P/A61P3/04,A61P,Biocide
2,Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...,US10100047,AA61A61P/A61P3/04,A61P,Biocide
5,CC1C(C2(C)CCC(O[Si](C)(C)C(C)(C)C)CC2CCO)CCC2(...,US10100056,AA61A61P/A61P29/00,A61P,Biocide
6,CC1=CCC2C(CN)C(C3(C)CCC(O)CC3CO)CCC12C,US10100056,AA61A61P/A61P29/00,A61P,Biocide
7,CC1(C)CC(C2CCC(OCc3cc(C(C)(C)C)c(O)c(C(C)(C)C)...,US10100250,CC09C09K/C09K19/3003,C09K,Color scavenger (EPA)
...,...,...,...,...,...
2879816,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US10651411,CC07C07C/C07C13/567,C07C,Conductive agent
2879817,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11248009,CC07C07F/C07F5/02,C07F,Refrigerants
2879818,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11370759,CC07C07D/C07D219/02,C07D,Freeze-thaw additive
2879819,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11450815,CC07C07C/C07C209/74,C07C,Conductive agent


# second attempt 

In [7]:
import pandas as pd
import re

def extract_cpc_code(full_code):
    """Extract first 4 characters between first and second slash"""
    pattern = r'^[^/]+/([^/]{4})'
    match = re.search(pattern, full_code)
    if match:
        return match.group(1)
    return None

# Read the files
mol_data = pd.read_csv('../Curation/PatentCuration/smiles_patent_cpc.csv', 
                       header=None, 
                       names=['SMILES', 'Patent', 'CPC_Code'])
func_mappings = pd.read_csv('functional_cpc_mappings.csv')

# Extract the 4-character CPC codes from mol_data
mol_data['CPC_4char'] = mol_data['CPC_Code'].apply(extract_cpc_code)

# Create dictionaries for both function and similarity score lookups
cpc_to_function = dict(zip(func_mappings['Most_Similar_CPC'], func_mappings['Functional_Use']))
cpc_to_similarity = dict(zip(func_mappings['Most_Similar_CPC'], func_mappings['Similarity_Score']))

# Map both functional uses and similarity scores to the molecular data
mol_data['Functional_Use'] = mol_data['CPC_4char'].map(cpc_to_function)
mol_data['Mapping_Similarity'] = mol_data['CPC_4char'].map(cpc_to_similarity)

# Add CPC description from func_mappings
cpc_to_description = dict(zip(func_mappings['Most_Similar_CPC'], func_mappings['CPC_Description']))
mol_data['CPC_Description'] = mol_data['CPC_4char'].map(cpc_to_description)

# Display first few rows to verify
print("\nFirst few rows of mapped data:")
print(mol_data.head())

# Print mapping statistics
print("\nMapping Statistics:")
print(f"Total molecules: {len(mol_data)}")
print(f"Unique CPC codes: {mol_data['CPC_4char'].nunique()}")
print(f"Molecules with mapped functions: {mol_data['Functional_Use'].notna().sum()}")
print(f"Molecules without mapped functions: {mol_data['Functional_Use'].isna().sum()}")

# Calculate and display similarity score statistics
print("\nSimilarity Score Statistics:")
print(mol_data['Mapping_Similarity'].describe())

# Group by functional use and calculate average similarity
func_use_stats = mol_data.groupby('Functional_Use').agg({
    'Mapping_Similarity': ['mean', 'min', 'max', 'count']
}).round(3)

print("\nFunctional Use Statistics:")
print(func_use_stats)

# Save the result with all new columns
mol_data.to_csv('molecular_data_with_functions_and_similarity.csv', index=False)

# Create a summary report for unmapped CPC codes
unmapped_cpcs = mol_data[mol_data['Functional_Use'].isna()]['CPC_4char'].unique()
if len(unmapped_cpcs) > 0:
    print("\nUnmapped CPC codes:")
    print(unmapped_cpcs)
    
    # Save unmapped CPCs to a separate file for review
    pd.DataFrame({'Unmapped_CPC': unmapped_cpcs}).to_csv('unmapped_cpc_codes.csv', index=False)

# Generate quality metrics for the mapping
quality_metrics = {
    'Total_Molecules': len(mol_data),
    'Mapped_Molecules': mol_data['Functional_Use'].notna().sum(),
    'Mapping_Coverage': (mol_data['Functional_Use'].notna().sum() / len(mol_data)) * 100,
    'Average_Similarity': mol_data['Mapping_Similarity'].mean(),
    'Median_Similarity': mol_data['Mapping_Similarity'].median(),
    'Min_Similarity': mol_data['Mapping_Similarity'].min(),
    'Max_Similarity': mol_data['Mapping_Similarity'].max(),
    'Unique_Functions': mol_data['Functional_Use'].nunique(),
    'Unique_CPC_Codes': mol_data['CPC_4char'].nunique()
}

print("\nQuality Metrics:")
for metric, value in quality_metrics.items():
    if isinstance(value, float):
        print(f"{metric}: {value:.2f}")
    else:
        print(f"{metric}: {value}")

# Save quality metrics to a separate file
pd.DataFrame([quality_metrics]).to_csv('mapping_quality_metrics.csv', index=False)


First few rows of mapped data:
                                              SMILES      Patent  \
0                                             smiles   patent_id   
1  O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...  US10100047   
2  Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...  US10100047   
3  COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...  US10100052   
4  CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...  US10100052   

            CPC_Code CPC_4char Functional_Use  Mapping_Similarity  \
0                cpc      None            NaN                 NaN   
1  AA61A61P/A61P3/04      A61P        Biocide            0.507894   
2  AA61A61P/A61P3/04      A61P        Biocide            0.507894   
3  NoneNoneNone/None      None            NaN                 NaN   
4  NoneNoneNone/None      None            NaN                 NaN   

   CPC_Description  
0              NaN  
1              NaN  
2              NaN  
3              NaN  
4              NaN  

Mapping Statistic

In [8]:
mol_data

Unnamed: 0,SMILES,Patent,CPC_Code,CPC_4char,Functional_Use,Mapping_Similarity,CPC_Description
0,smiles,patent_id,cpc,,,,
1,O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...,US10100047,AA61A61P/A61P3/04,A61P,Biocide,0.507894,
2,Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...,US10100047,AA61A61P/A61P3/04,A61P,Biocide,0.507894,
3,COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...,US10100052,NoneNoneNone/None,,,,
4,CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...,US10100052,NoneNoneNone/None,,,,
...,...,...,...,...,...,...,...
2879816,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US10651411,CC07C07C/C07C13/567,C07C,Conductive agent,0.502147,
2879817,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11248009,CC07C07F/C07F5/02,C07F,Refrigerants,0.502259,
2879818,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11370759,CC07C07D/C07D219/02,C07D,Freeze-thaw additive,0.510843,
2879819,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11450815,CC07C07C/C07C209/74,C07C,Conductive agent,0.502147,


In [9]:
valid = mol_data.drop(['CPC_Description'], axis=1)

In [10]:
valid

Unnamed: 0,SMILES,Patent,CPC_Code,CPC_4char,Functional_Use,Mapping_Similarity
0,smiles,patent_id,cpc,,,
1,O=C(c1cc(Cl)ccc1-c1ncccn1)N1C2CCC(COc3ccc(F)cn...,US10100047,AA61A61P/A61P3/04,A61P,Biocide,0.507894
2,Cc1nc(C(=O)N2C3CCC(COc4ccc(F)cn4)C34CCC24)c(-c...,US10100047,AA61A61P/A61P3/04,A61P,Biocide,0.507894
3,COCCCOc1cc2c(cc1Cl)-c1cc(=O)c(C(=O)O)cn1N(C(C)...,US10100052,NoneNoneNone/None,,,
4,CC(C)(C)N1Cc2cc(OCCCOC(O)(O)O)c(Cl)cc2-c2cc(=O...,US10100052,NoneNoneNone/None,,,
...,...,...,...,...,...,...
2879816,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US10651411,CC07C07C/C07C13/567,C07C,Conductive agent,0.502147
2879817,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11248009,CC07C07F/C07F5/02,C07F,Refrigerants,0.502259
2879818,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11370759,CC07C07D/C07D219/02,C07D,Freeze-thaw additive,0.510843
2879819,Cc1ccc(N(C)c2ccc(C)cc2)cc1,US11450815,CC07C07C/C07C209/74,C07C,Conductive agent,0.502147


In [14]:
valid2 = valid.dropna()

In [18]:
valid2.to_csv('Mol_CPC_use_mapping.csv',index = False)