# PubChem から情報を取ってくる

## 概要

* Active ingredients から、不要な部分を除く
* 長い単語から当てていき、PubChemPy で検索

In [14]:
import pubchempy as pcp

def find_all_longest_matches(input_text, max_words=None):
    """Greedily match word sequences of ``input_text`` against PubChem compound names.

    The text is split on whitespace and every contiguous run of words is
    looked up by name through ``pcp.get_compounds`` (one network request per
    distinct query), trying the longest runs first. Once a run matches, its
    words are reserved so that no shorter or overlapping run can reuse them.

    Args:
        input_text: Free text that may contain compound names.
        max_words: Optional upper bound on the number of words per query.
            ``None`` (the default) tries every length up to the full text,
            which costs up to n * (n + 1) / 2 lookups for n words.

    Returns:
        A list of dicts with the keys ``"Name"`` (the matched query text),
        ``"CID"`` and ``"SMILES"`` (both taken from the first compound
        returned). Longer matches come first; within one length, matches are
        ordered left to right. Empty or whitespace-only text yields ``[]``.
    """
    words = input_text.split()
    n = len(words)
    longest = n if max_words is None else min(n, max_words)

    # Out-of-band bookkeeping of words already claimed by a match. A mask
    # (rather than overwriting the words with a placeholder token) means no
    # placeholder text can ever leak into a query.
    consumed = [False] * n
    # query text -> first compound, or None when PubChem has no hit; avoids
    # repeating the same lookup for text that occurs more than once.
    lookup_cache = {}
    found_matches = []

    # Enumerate word windows from the longest to the shortest.
    for length in range(longest, 0, -1):
        for start in range(n - length + 1):
            end = start + length
            # Skip windows overlapping an earlier match instead of sending
            # a query that cannot be a real compound name.
            if any(consumed[start:end]):
                continue

            query = " ".join(words[start:end])
            if query not in lookup_cache:
                results = pcp.get_compounds(query, 'name')
                lookup_cache[query] = results[0] if results else None

            compound = lookup_cache[query]
            if compound is None:
                continue

            # Record the match and reserve its words.
            found_matches.append({
                "Name": query,
                "CID": compound.cid,
                "SMILES": compound.canonical_smiles
            })
            consumed[start:end] = [True] * length

    return found_matches

In [15]:
# Sample label text: headings, ingredient names, percentages and purposes mixed together.
input_text = (
    "Active Ingredients Purpose Camphor 11% Topical Analgesic "
    "Eucalyptus Oil 10% Anti-Inflammatory Menthol 8% Topical Analgesic "
    "Wintergreen Oil (Methyl Salicylate) 10% Topical Analgesic"
)

matches = find_all_longest_matches(input_text)

# Label each field with what it actually holds: the second value is the
# PubChem CID and the third is the SMILES string of the matched compound.
for match in matches:
    print(f"Matched Query: {match['Name']}")
    print(f"CID: {match['CID']}")
    print(f"SMILES: {match['SMILES']}\n")

Matched Query: Camphor
Start Index: 2537
Details: CC1(C2CCC1(C(=O)C2)C)C

Matched Query: Menthol
Start Index: 1254
Details: CC1CCC(C(C1)O)C(C)C

Matched Query: Wintergreen
Start Index: 4133
Details: COC(=O)C1=CC=CC=C1O



In [1]:
import pandas as pd
import pubchempy as pcp
from tqdm import tqdm

# Load the tab-separated input file; .head() keeps only the first rows,
# so the cells below run on a small sample.
df = pd.read_csv(
    "/workspace/ssd4t/yoshikawa/fdalabel_handler/workspace/active_ingredient.tsv",
    sep="\t",
).head()

# One [ID, ingredients] pair per row, as plain Python lists.
lists = df.loc[:, ["ID", "ingredients"]].to_numpy().tolist()

# Look up compound information on PubChem and collect the matches.
def find_all_longest_matches(input_text, max_words=None):
    """Greedily match word sequences of ``input_text`` against PubChem compound names.

    The text is split on whitespace and every contiguous run of words is
    looked up by name through ``pcp.get_compounds`` (one network request per
    distinct query), trying the longest runs first. Once a run matches, its
    words are reserved so that no shorter or overlapping run can reuse them.

    Args:
        input_text: Free text that may contain compound names.
        max_words: Optional upper bound on the number of words per query.
            ``None`` (the default) tries every length up to the full text,
            which costs up to n * (n + 1) / 2 lookups for n words.

    Returns:
        A list of dicts with the keys ``"Name"`` (the matched query text),
        ``"CID"`` and ``"SMILES"`` (both taken from the first compound
        returned). Longer matches come first; within one length, matches are
        ordered left to right. Empty or whitespace-only text yields ``[]``.
    """
    words = input_text.split()
    n = len(words)
    longest = n if max_words is None else min(n, max_words)

    # Out-of-band bookkeeping of words already claimed by a match. A mask
    # (rather than overwriting the words with a placeholder token) means no
    # placeholder text can ever leak into a query.
    consumed = [False] * n
    # query text -> first compound, or None when PubChem has no hit; avoids
    # repeating the same lookup for text that occurs more than once.
    lookup_cache = {}
    found_matches = []

    # Enumerate word windows from the longest to the shortest.
    for length in range(longest, 0, -1):
        for start in range(n - length + 1):
            end = start + length
            # Skip windows overlapping an earlier match instead of sending
            # a query that cannot be a real compound name.
            if any(consumed[start:end]):
                continue

            query = " ".join(words[start:end])
            if query not in lookup_cache:
                results = pcp.get_compounds(query, 'name')
                lookup_cache[query] = results[0] if results else None

            compound = lookup_cache[query]
            if compound is None:
                continue

            # Record the match and reserve its words.
            found_matches.append({
                "Name": query,
                "CID": compound.cid,
                "SMILES": compound.canonical_smiles
            })
            consumed[start:end] = [True] * length

    return found_matches

# Flat list of result rows; converted to a DataFrame once, after the loop.
output_data = []

# Process each record's ID together with its ingredients text.
# NOTE(review): the frame preview elsewhere in this notebook shows the
# ingredients column as a stringified Python list ("['a', 'b']"); if that is
# the current file format, splitting on ", " leaves brackets and quotes in
# the queries - confirm against the input file.
for record in tqdm(lists):
    id_value, ingredients = record

    # An empty cell is read by pandas as NaN (a float), which has no
    # .split(); there is nothing to look up for such a row.
    if not isinstance(ingredients, str):
        continue

    ingredient_list = ingredients.split(", ")

    for ingredient in ingredient_list:
        matches = find_all_longest_matches(ingredient)
        for match in matches:
            output_data.append({
                "ID": id_value,
                "Ingredient": match["Name"],
                "CID": match["CID"],
                "SMILES": match["SMILES"]
            })

# Write the matches as TSV. Naming the columns explicitly keeps the header
# row in the file even when no ingredient matched at all.
output_df = pd.DataFrame(output_data, columns=["ID", "Ingredient", "CID", "SMILES"])
output_df.to_csv("/workspace/ssd4t/yoshikawa/fdalabel_handler/workspace/matched_ingredients.tsv", sep="\t", index=False)

100%|██████████| 5/5 [01:33<00:00, 18.62s/it]


## データの途中の状態の確認など

In [25]:
# Display the loaded frame (already limited by .head() where it is read).
df

Unnamed: 0,ID,ingredients
0,7,"['benzalkonium cl 0.13', 'lidocaine hcl 2.5']"
1,9,"[': adenoviren nosode 12c', 'cytomegalovirus n..."
2,13,"['(in each 1.25 ml) ibuprofen 50', '(nsaid)* *..."
3,16,"['titanium dioxide 9.0', 'zinc oxide 6.3', 'pu..."
4,17,['(in each tablet) fexofenadine hcl usp']


In [40]:
# Dump the raw records that are iterated over when querying PubChem.
# NOTE(review): the recorded output shows one-element records (ingredients
# only), while the loading cell builds [ID, ingredients] pairs - presumably
# this output is from an earlier kernel state; re-run to refresh.
print(lists)

[['Benzalkonium Cl 0.13, Lidocaine HCL 2.5'], [': Adenoviren Nosode 12C, Cytomegalovirus Nosode 18X, DNA 12X, Epstein-Barr Virus Nosode 16C, Hepatitis B Nosode 33X, Herpes Simplex 1 & 2 Nosode 15X, Human Herpesvirus 6A 15X, Human Herpesvirus 6B 15X, Influenzinum 12X, 200X.'], ['(in each 1.25 mL) Ibuprofen 50, (NSAID)* *nonsteroidal anti-inflammatory drug'], ['Titanium Dioxide 9.0, Zinc Oxide 6.3, Purpose: Sunscreen'], ['(in each tablet) Fexofenadine HCl USP']]


In [46]:
# Print every comma-separated ingredient entry on its own line to inspect the raw text.
for record in tqdm(lists):
    # The ingredients text is the last field of a record, whether the record
    # is an [ID, ingredients] pair or a bare [ingredients] list. Indexing
    # with [0] would pick the ID once the ID column is included.
    names = record[-1].split(", ")
    for name in names:
        print(name)

100%|██████████| 5/5 [00:00<00:00, 7931.74it/s]

Benzalkonium Cl 0.13
Lidocaine HCL 2.5
: Adenoviren Nosode 12C
Cytomegalovirus Nosode 18X
DNA 12X
Epstein-Barr Virus Nosode 16C
Hepatitis B Nosode 33X
Herpes Simplex 1 & 2 Nosode 15X
Human Herpesvirus 6A 15X
Human Herpesvirus 6B 15X
Influenzinum 12X
200X.
(in each 1.25 mL) Ibuprofen 50
(NSAID)* *nonsteroidal anti-inflammatory drug
Titanium Dioxide 9.0
Zinc Oxide 6.3
Purpose: Sunscreen
(in each tablet) Fexofenadine HCl USP





In [68]:
import pubchempy as pcp
from tqdm import tqdm
# Sanity check: does the bare token 'cl' resolve to a compound by name?
# If it does, show the CID of the first hit.
results = pcp.get_compounds('cl', 'name')
if results:
    compound = results[0]
    print(f"CID{compound.cid}")

CID24526
