### Get the status of all drugs

In [4]:
import xml.etree.ElementTree as ET

# Load the DrugBank XML export. ET.parse builds the whole tree in memory, so a
# large export needs a correspondingly large amount of RAM.
tree = ET.parse("full_database.xml")  # replace with your own file path
root = tree.getroot()

# XML namespace used by the DrugBank elements (may differ slightly between exports).
ns = {'db': 'http://www.drugbank.ca'}

# Group labels that qualify a drug for inclusion.
desired_statuses = {"approved", "investigational", "experimental", "withdrawn"}


def _primary_drugbank_id(drug):
    """Return the DrugBank ID of a <drug> element, or None if it has none.

    Prefers the <drugbank-id> child carrying primary="true" and falls back to
    the first <drugbank-id> child, which is what this cell originally used.
    NOTE(review): assumes the export flags the primary ID with primary="true";
    confirm against your DrugBank release.
    """
    id_node = drug.find("db:drugbank-id[@primary='true']", ns)
    if id_node is None:
        id_node = drug.find("db:drugbank-id", ns)
    if id_node is None or not (id_node.text or "").strip():
        return None
    return id_node.text.strip()


def _group_labels(drug):
    """Return the lower-cased <group> labels of a <drug> element as a set.

    Empty <group/> elements (text is None or blank) are ignored instead of
    raising AttributeError. A drug without a <groups> child yields an empty set.
    """
    groups = drug.find("db:groups", ns)
    if groups is None:
        return set()
    return {
        g.text.strip().lower()
        for g in groups.findall("db:group", ns)
        if g.text and g.text.strip()
    }


# One record per drug whose groups overlap desired_statuses.
matching_drug_ids = []

for drug in root.findall("db:drug", ns):
    drugbank_id = _primary_drugbank_id(drug)
    if drugbank_id is None:
        # Nothing to key the record on; skip rather than crash on `.text` of None.
        continue

    drug_statuses = _group_labels(drug)
    if drug_statuses & desired_statuses:
        matching_drug_ids.append({
            "drugbank_id": drugbank_id,
            "status": drug_statuses,
            "valid": True
        })

# Report the result.
print(f"Found {len(matching_drug_ids)} matching drugs")
print(matching_drug_ids[:10])  # preview only the first 10


Found 17285 matching drugs
[{'drugbank_id': 'DB00001', 'status': {'approved', 'withdrawn'}, 'valid': True}, {'drugbank_id': 'DB00002', 'status': {'approved'}, 'valid': True}, {'drugbank_id': 'DB00003', 'status': {'approved'}, 'valid': True}, {'drugbank_id': 'DB00004', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00005', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00006', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00007', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00008', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00009', 'status': {'approved', 'investigational'}, 'valid': True}, {'drugbank_id': 'DB00010', 'status': {'approved', 'withdrawn'}, 'valid': True}]


In [5]:
import pandas as pd

def _status_literal(statuses):
    """Render a non-empty status set as `{'a', 'b'}` text with sorted members.

    A set written to CSV is stored as its repr, and the member order of a set
    of strings can differ from one interpreter run to the next, so the exported
    file would not be reproducible. The text produced here keeps the same
    literal form but with a fixed order. Anything that is not a non-empty
    set/frozenset is returned unchanged.
    """
    if not isinstance(statuses, (set, frozenset)) or not statuses:
        return statuses
    return "{" + ", ".join(repr(s) for s in sorted(statuses)) + "}"


df = pd.DataFrame(matching_drug_ids)

# Export a copy with the status column pre-rendered; `df` itself keeps the
# original set objects for any in-kernel use.
df_export = df
if "status" in df.columns:
    df_export = df.assign(status=df["status"].map(_status_literal))
df_export.to_csv("drugbank_status.tsv", index=False, sep="\t")

### Get the drugs approved in 2022 or later

In [None]:
import pandas as pd

# Download URL (the file can be fetched manually beforehand):
# https://www.accessdata.fda.gov/cder/ndc/drug-products.csv
fda_csv_path = "drug-products.csv"  # replace with your local path

# Columns this cell needs; also the columns (and order) of the exported subset.
required_columns = [
    "PRODUCTID",
    "PROPRIETARYNAME",         # brand name
    "NONPROPRIETARYNAME",      # active ingredient
    "APPLICATIONNUMBER",
    "STARTMARKETINGDATE",
]

# Load the FDA drug product data. The date column is read as text: left to
# type inference, an all-digit 'YYYYMMDD' column becomes numeric (float when
# values are missing), and such values may fail the %Y%m%d format below and be
# silently coerced to NaT.
df = pd.read_csv(fda_csv_path, dtype={"STARTMARKETINGDATE": str})

# Fail early with a readable message if the file layout is not the expected one.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{fda_csv_path} is missing expected column(s): {missing_columns}")

# FDA dates are 'YYYYMMDD' strings; convert them to datetime.
raw_start_dates = df["STARTMARKETINGDATE"]
df["STARTMARKETINGDATE"] = pd.to_datetime(raw_start_dates, format="%Y%m%d", errors="coerce")

# errors="coerce" hides bad values, so surface how many were dropped to NaT.
unparsed_count = int((raw_start_dates.notna() & df["STARTMARKETINGDATE"].isna()).sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} STARTMARKETINGDATE value(s) could not be parsed and were set to NaT")

# Keep products whose marketing start date is 2022-01-01 or later.
# NOTE(review): this filters on STARTMARKETINGDATE, which may not be the same
# thing as the approval date; confirm it is the intended criterion.
cutoff_date = pd.Timestamp("2022-01-01")
approved_after_2022 = df[df["STARTMARKETINGDATE"] >= cutoff_date].copy()

# Select the key columns.
approved_subset = approved_after_2022[required_columns]

# Show the first few rows.
print(approved_subset.head())

# Save the result as CSV.
approved_subset.to_csv("approved_drugs_after_2022.csv", index=False)
