In [None]:
# Download the TTD cross-matching dump (P1-03); -O saves it under the given file name in the current directory.
!wget https://db.idrblab.net/ttd/sites/default/files/ttd_database/P1-03-TTD_crossmatching.txt -O P1-03-TTD_crossmatching.txt

### Drug ID Mapping

In [77]:
import pandas as pd


def parse_section(section):
    """Collect the fields of one record block into a dict.

    Every line is split on TAB characters. Only lines that yield exactly
    three parts are used: the second part becomes the key (for example
    TTDDRUID, CHEBI_ID or PUBCHCID) and the third part its value. When a key
    occurs more than once, the last occurrence wins.
    """
    split_lines = (line.split("\t") for line in section.split("\n"))
    # Anything that is not a three-column data line is skipped.
    return {fields[1]: fields[2] for fields in split_lines if len(fields) == 3}


def construct_drug_id(chebi_id, pubchcid):
    """Choose the value stored in the Drug_id column.

    A truthy PubChem CID has priority and is rendered as ``PUBCHEM:<cid>``.
    Otherwise a truthy ChEBI id is returned unchanged. If both are missing or
    empty the result is None.
    """
    if not pubchcid:
        return chebi_id if chebi_id else None
    return f"PUBCHEM:{pubchcid}"


def process_file(file_path, header_lines=28):
    """Parse a TTD cross-matching dump into a TTDDRUID -> Drug_id table.

    Args:
        file_path: Path of the text file to read.
        header_lines: Number of leading lines to discard before the records
            start. Defaults to 28, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame with one row per blank-line-separated section and
        the columns ``TTDDRUID`` (empty string when the section has none) and
        ``Drug_id`` (None when construct_drug_id finds no usable id).
    """
    with open(file_path, "r") as file:
        # readline() returns "" at end of file, so a file shorter than the
        # header simply yields no records instead of raising StopIteration
        # the way next(file) does.
        for _ in range(header_lines):
            if not file.readline():
                break

        # Records are separated from each other by an empty line.
        sections = file.read().strip().split("\n\n")

    data = []
    for section in sections:
        section_data = parse_section(section)
        drug_id = construct_drug_id(
            section_data.get("CHEBI_ID"), section_data.get("PUBCHCID")
        )
        data.append({"TTDDRUID": section_data.get("TTDDRUID", ""), "Drug_id": drug_id})

    return pd.DataFrame(data)

# Input dump and main output location for the drug id mapping.
file_path = "./P1-03-TTD_crossmatching.txt"
output_path = "processed_ttd_drug_id.tsv"

df = process_file(file_path)

# Split the table once: rows with a Drug_id are kept, the rest are reported separately.
has_drug_id = df["Drug_id"].notna()
invalid = df[~has_drug_id]
df = df[has_drug_id]

invalid.to_csv("invalid_ttd_drug_id.tsv", sep="\t", index=False)
df.to_csv(output_path, index=False, sep="\t")

print(f"CSV file has been saved to {output_path}")

CSV file has been saved to ttd_drug_id.tsv


### Biomarker ID Mapping

In [63]:
# Fetch Biomarker data from TTD
import requests, re
from bs4 import BeautifulSoup

def fetch_biomarker(biomarker_id: str, timeout: float = 30.0):
    """Scrape the TTD detail page of one biomarker into a flat dict.

    Args:
        biomarker_id: Identifier appended to the TTD biomarker details URL.
        timeout: Seconds ``requests`` waits for the server. Without a timeout
            a stalled connection would block the whole crawl indefinitely.

    Returns:
        dict pairing, by position, each ``<th>`` text of the first table
        (the two section titles excluded) with the whitespace-normalised text
        of the first ``<td>`` of each row that has cells. An empty dict is
        returned when the response status is not 200 or the page has no table.

    Raises:
        requests.RequestException: on connection errors or when the timeout
            expires.
    """
    url = f"https://db.idrblab.net/ttd/data/biomarker/details/{biomarker_id}"

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    # The details are assumed to be in the first table of the page.
    table = soup.find("table")
    if table is None:
        # find() returns None when nothing matches; treat it like a failed fetch
        # instead of failing with AttributeError on table.find_all.
        print(f"No table found on the page of biomarker {biomarker_id}")
        return {}

    # These header cells are section titles, not field names, so they have no
    # value row of their own. The strings are matched exactly as written.
    section_titles = ("Biomarker General Infomation", "References")
    headers = [
        header.text
        for header in table.find_all("th")
        if header.text not in section_titles
    ]

    values = []
    for row in table.find_all("tr"):
        columns = row.find_all("td")
        if not columns:
            # Rows without <td> cells (e.g. header-only rows) carry no value.
            continue
        # Only the first cell is used; newlines are removed and runs of
        # whitespace are collapsed to a single space.
        values.append(re.sub(r"\s+", " ", columns[0].text.replace("\n", "").strip()))

    return dict(zip(headers, values))


def save(biomarker_data: dict, output_path: str) -> None:
    """Write the collected biomarker records to ``output_path``.

    Args:
        biomarker_data: dict whose values are the per-biomarker record dicts;
            the keys are not written.
        output_path: Destination file. A path ending in ".tsv" is written
            tab-separated, anything else comma-separated.

    Empty records (such as the ``{}`` stored for a failed fetch) are skipped.
    Writing them would add a blank row to the file, and that row would be
    read back as a record without a "Biomarker ID" on the next run.
    """
    records = [record for record in biomarker_data.values() if record]
    separator = "\t" if output_path.endswith(".tsv") else ","
    pd.DataFrame(records).to_csv(output_path, index=False, sep=separator)

In [71]:
import pandas as pd
import tqdm
import os
from time import sleep

input_path = "../biomarker-disease/P1-08-Biomarker_disease_extracted.tsv"
output_path = "./processed_ttd_biomarkers.tsv"

data = pd.read_csv(input_path, sep="\t")
biomarker_ids = data["ttd_biomarker_id"].unique()

# Resume support: records written by an earlier run are loaded so that only
# the biomarkers that are still missing get fetched.
cached_records = (
    pd.read_csv(output_path, sep="\t").to_dict(orient="records")
    if os.path.exists(output_path)
    else []
)
# Rows without a "Biomarker ID" (blank rows left behind by failed fetches)
# are dropped; otherwise they would be carried along under a NaN key.
biomarker_data = {
    record["Biomarker ID"]: record
    for record in cached_records
    if pd.notna(record["Biomarker ID"])
}

idx = 0  # number of records fetched successfully during this run
for biomarker_id in tqdm.tqdm(biomarker_ids):
    if biomarker_id in biomarker_data:
        continue

    record = fetch_biomarker(biomarker_id)
    sleep(2)  # pause between requests (presumably to go easy on the server)

    if not record:
        # A failed fetch returns {}. It is not cached, so the id is retried
        # on the next run instead of being stored as an empty row.
        continue

    biomarker_data[biomarker_id] = record
    idx += 1

    # Checkpoint every 10 fetched records so an interruption loses little work.
    if idx % 10 == 0:
        save(biomarker_data, output_path)

# Final write covers whatever was fetched since the last checkpoint.
save(biomarker_data, output_path)

100%|██████████| 2512/2512 [00:00<00:00, 3367239.26it/s]


In [76]:
# Keep only the columns needed for the biomarker -> UniProt mapping.
biomarkers = pd.read_csv(output_path, sep="\t")
biomarkers = biomarkers[["Biomarker ID", "Biomarker Name", "UniProt ID"]]

# Rows without a UniProt id cannot be mapped and are reported separately.
has_uniprot = biomarkers["UniProt ID"].notna()
invalid = biomarkers[~has_uniprot]

# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice (SettingWithCopyWarning).
biomarkers = biomarkers[has_uniprot].copy()
# A cell may hold several ";"-separated ids; emit one output row per id.
biomarkers["UniProt ID"] = biomarkers["UniProt ID"].apply(
    lambda x: x.replace(" ", "").split(";")
)
biomarkers = biomarkers.explode(column="UniProt ID")

invalid.to_csv("invalid_ttd_biomarker_id.tsv", sep="\t", index=False)
biomarkers.to_csv("processed_ttd_biomarker_id.tsv", sep="\t", index=False)

### Gene ID Mapping

In [48]:
# Download the TTD target dump (P1-01); -O saves it under the given file name in the current directory.
!wget https://db.idrblab.net/ttd/sites/default/files/ttd_database/P1-01-TTD_target_download.txt -O P1-01-TTD_target_download.txt

--2024-04-17 06:01:49--  https://db.idrblab.net/ttd/sites/default/files/ttd_database/P1-01-TTD_target_download.txt
Resolving db.idrblab.net (db.idrblab.net)... 47.88.56.212
Connecting to db.idrblab.net (db.idrblab.net)|47.88.56.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8672051 (8.3M) [text/plain]
Saving to: ‘P1-01-TTD_target_download.txt’


2024-04-17 06:01:51 (6.62 MB/s) - ‘P1-01-TTD_target_download.txt’ saved [8672051/8672051]



In [79]:
import pandas as pd


def parse_section(section):
    """
    Parse one target section into a single dictionary of its fields.

    Each line is split on TAB characters; the second part is the field code
    and the third part its value. Lines with fewer than three parts (blank
    lines, stray text) and unknown field codes are ignored.

    Returns:
        dict with the keys ``ttd_target_id``, ``target_name``, ``target_id``,
        ``ttd_uniprot_id``, ``target_type``, ``synonyms``, ``function``,
        ``bio_class``, ``ec_number`` and ``sequence``. Every value defaults
        to "". ``target_id`` (from GENENAME) and ``ttd_uniprot_id`` (from
        UNIPROID) become lists split on "; " when their line is present; all
        other values are plain strings.
    """
    base_data = {
        "ttd_target_id": "",
        "target_name": "",
        "target_id": "",
        "ttd_uniprot_id": "",
        "target_type": "",
        "synonyms": "",
        "function": "",
        "bio_class": "",
        "ec_number": "",
        "sequence": "",
    }

    # field code -> (output key, whether the value is a "; "-separated list)
    field_map = {
        "TARGETID": ("ttd_target_id", False),
        "TARGETNAME": ("target_name", False),
        "TARGNAME": ("target_name", False),
        "GENENAME": ("target_id", True),
        "TARGETTYPE": ("target_type", False),
        "TARGTYPE": ("target_type", False),
        "SYNONYMS": ("synonyms", False),
        "UNIPROID": ("ttd_uniprot_id", True),
        "FUNCTION": ("function", False),
        "BIOCLASS": ("bio_class", False),
        "ECNUMBER": ("ec_number", False),
        "SEQUENCE": ("sequence", False),
    }

    for line in section.split("\n"):
        parts = line.split("\t")
        if len(parts) < 3:
            # Not a data line; indexing parts[1] / parts[2] would raise IndexError.
            continue

        target = field_map.get(parts[1])
        if target is None:
            continue

        key, is_list = target
        base_data[key] = parts[2].split("; ") if is_list else parts[2]

    return base_data


def _expand_target_rows(section_data):
    """Turn one parsed section into one output row per gene symbol."""
    gene_names = section_data.get("target_id") or ""
    uniprot_ids = section_data.get("ttd_uniprot_id") or []

    if not isinstance(gene_names, list):
        # No gene-name list: target_id is still a plain string (normally ""),
        # so exactly one row is produced.
        row = section_data.copy()
        row["target_id"] = "SYMBOL:" + gene_names
        if isinstance(uniprot_ids, list):
            # Store plain text rather than a Python list repr in the TSV.
            row["ttd_uniprot_id"] = "; ".join(uniprot_ids)
        return [row]

    if not isinstance(uniprot_ids, list):
        uniprot_ids = [uniprot_ids]
    first_uniprot = uniprot_ids[0] if uniprot_ids else ""

    rows = []
    for idx, gene_name in enumerate(gene_names):
        row = section_data.copy()
        row["target_id"] = "SYMBOL:" + gene_name
        if idx < len(uniprot_ids):
            row["ttd_uniprot_id"] = uniprot_ids[idx]
        else:
            # Fewer UniProt ids than gene symbols: report it and reuse the
            # first UniProt id. The symbol itself is kept as-is; re-deriving
            # it from the already prefixed value used to produce "SYMBOL:S".
            print(
                "It's a special case: ",
                section_data["target_id"],
                section_data["ttd_uniprot_id"],
            )
            row["ttd_uniprot_id"] = first_uniprot
        rows.append(row)

    return rows


def process_file(filepath, header_lines=32):
    """
    Process the given TTD target download file and write the extracted rows to disk.

    Args:
        filepath: Path of the TTD target download text file.
        header_lines: Number of leading lines to discard before the records
            start. Defaults to 32, the value that used to be hard-coded.

    Writes two tab-separated files into the current directory:
        ./processed_ttd_gene_id.tsv  all rows, one per (target, gene symbol)
        ./invalid_ttd_gene_id.tsv    rows whose target_id is missing, has an
                                     empty symbol ("SYMBOL:") or contains a space

    Returns:
        None.
    """
    with open(filepath, "r") as file:
        # readline() returns "" at end of file, so a file shorter than the
        # header does not raise StopIteration the way next(file) does.
        for _ in range(header_lines):
            if not file.readline():
                break

        content = file.read().strip()
        # Records are separated from each other by an empty line.
        sections = content.split("\n\n")

    all_data = []
    for section in sections:
        all_data.extend(_expand_target_rows(parse_section(section)))

    df = pd.DataFrame(all_data)

    symbols = df["target_id"]
    # target_id is always a string here, so isna() alone never flags a target
    # without a gene name; the bare "SYMBOL:" prefix is what marks that case.
    invalid_mask = (
        symbols.isna() | symbols.eq("SYMBOL:") | symbols.str.contains(" ", na=False)
    )
    invalid_rows = df[invalid_mask]
    invalid_rows.to_csv("./invalid_ttd_gene_id.tsv", index=False, sep="\t")
    df.to_csv("./processed_ttd_gene_id.tsv", index=False, sep="\t")


# Run process_file on the downloaded TTD target file.
process_file("P1-01-TTD_target_download.txt")

It's a special case:  ['CAMLG', 'CAML'] ['CAMLG_HUMAN']
