In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; a shelled-out `pip3` may belong to a different interpreter.
%pip install pandas



In [2]:
# Fetch P1-05-Drug_disease.txt from the TTD site into the working directory;
# the output-document option overwrites any existing local copy.
!wget --output-document=P1-05-Drug_disease.txt https://db.idrblab.net/ttd/sites/default/files/ttd_database/P1-05-Drug_disease.txt

--2024-04-16 12:12:00--  https://db.idrblab.net/ttd/sites/default/files/ttd_database/P1-05-Drug_disease.txt
Resolving db.idrblab.net (db.idrblab.net)... 47.88.56.212
Connecting to db.idrblab.net (db.idrblab.net)|47.88.56.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2756589 (2.6M) [text/plain]
Saving to: ‘P1-05-Drug_disease.txt’


2024-04-16 12:12:01 (4.27 MB/s) - ‘P1-05-Drug_disease.txt’ saved [2756589/2756589]



In [3]:
import csv

def parse_section(section):
    """Parse one record (a block of consecutive lines) of the drug-disease file.

    Lines are dispatched on their leading keyword:

    * ``TTDDRUID`` - the second whitespace-separated token is taken as the drug id.
    * ``DRUGNAME`` - every token after the keyword, re-joined with single spaces.
    * ``INDICATI`` - the text between the keyword and the first ``ICD-11:``
      marker is the disease name; the first token after the marker is the code.

    Lines starting with anything else are ignored. Malformed lines (a bare
    ``TTDDRUID`` keyword, an ``INDICATI`` line without an ``ICD-11:`` marker,
    or a marker with nothing after it) are skipped instead of raising
    ``IndexError``.

    Args:
        section: Text of a single record, lines separated by ``"\\n"``.

    Returns:
        A tuple ``(ttddruid, drug_name, indications)``. ``ttddruid`` and
        ``drug_name`` are ``""`` when the record does not provide them.
        ``indications`` is a list of ``(disease_name, icd_code)`` tuples where
        ``icd_code`` is ``"ICD-11:"`` followed directly by the code token.
    """
    keyword = "INDICATI"
    marker = "ICD-11:"
    ttddruid, drug_name, indications = "", "", []

    for line in section.strip().split("\n"):
        if line.startswith("TTDDRUID"):
            tokens = line.split()
            # A keyword with no value keeps the default instead of crashing.
            if len(tokens) > 1:
                ttddruid = tokens[1]
        elif line.startswith("DRUGNAME"):
            drug_name = ' '.join(line.split()[1:])
        elif line.startswith(keyword):
            parts = line.split(marker)
            if len(parts) < 2:
                continue  # no ICD-11 code on this indication line
            code_tokens = parts[1].split()
            if not code_tokens:
                continue  # marker present but no code after it
            # Slice off only the leading keyword; str.replace would also
            # delete the same letters if they occurred inside the disease name.
            disease_name = parts[0][len(keyword):].strip()
            indications.append((disease_name, marker + code_tokens[0]))

    return ttddruid, drug_name, indications

def process_file(input_file_path, output_file_path, header_lines=22):
    """Convert the drug-disease text file into a CSV of drug -> disease edges.

    The input is read as UTF-8, its first ``header_lines`` lines are discarded,
    and the remainder is split into records on blank lines (``"\\n\\n"``). Each
    record is handed to ``parse_section``; one CSV row is written per
    ``(disease_name, icd_code)`` pair it returns. Records without any
    indication produce no rows.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the CSV file to create (overwritten if it
            exists).
        header_lines: Number of leading lines to skip before the records
            start. Defaults to 22, the value this function has always used.

    Returns:
        None. The result is the file written to ``output_file_path``.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        # Skip the lines that precede the first record. The previous comment
        # here said "skip the first 4 lines", but the code skipped 22.
        # presumably this is the file's title/legend preamble - TODO confirm
        for _ in range(header_lines):
            # readline() returns "" at end of file, so an input shorter than
            # the preamble yields a header-only CSV rather than StopIteration.
            if not file.readline():
                break

        content = file.read()

    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["source_id", "source_type", "target_id", "target_type", "source_name", "target_name", "relation_type", "resource", "ttd_target_id", "ttd_source_id"])

        for section in content.split("\n\n"):
            ttddruid, drug_name, indications = parse_section(section)
            # The drug id and ICD code are written twice on purpose: once as
            # source_id/target_id and once as ttd_source_id/ttd_target_id.
            writer.writerows(
                [ttddruid, "Compound", icd_code, "Disease", drug_name, disease_name, "DRUGBANK::treats::Compound:Disease", "TTD", icd_code, ttddruid]
                for disease_name, icd_code in indications
            )

# Input: the text file saved by the wget cell. Output: the intermediate CSV
# that the pandas cell reads back in.
input_file_path, output_file_path = (
    'P1-05-Drug_disease.txt',
    'P1-05-Drug_disease-extracted.csv',
)

process_file(input_file_path=input_file_path, output_file_path=output_file_path)


In [1]:
import pandas as pd

# Inputs: the CSV produced by process_file and a tab-separated id-mapping
# table that provides the 'TTDDRUID' and 'Drug_id' columns used below.
file_path1 = 'P1-05-Drug_disease-extracted.csv'
file_path2 = '../idmapping/ttd_drug_id.tsv'

df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2, sep='\t')

# Left-join so every extracted row is kept, then overwrite source_id with the
# mapped Drug_id (NaN where the mapping has no entry) and drop the join
# columns. Built as one chain so no frame is mutated in place.
# NOTE(review): if the mapping lists a TTDDRUID more than once, the left join
# will duplicate the matching rows - confirm the mapping is one row per id.
merged_df = (
    pd.merge(df1, df2, left_on='source_id', right_on='TTDDRUID', how='left')
    .assign(source_id=lambda frame: frame['Drug_id'])
    .drop(columns=['TTDDRUID', 'Drug_id'])
)

output_file_path = 'formatted_ttd_drug_disease.tsv'

# Rows lacking either endpoint id go to the "invalid" file; compute the mask
# once and use it for both halves of the split.
missing_id = merged_df['source_id'].isnull() | merged_df['target_id'].isnull()
invalid = merged_df[missing_id]
merged_df = merged_df[~missing_id]

# to_csv returns None when given a path, so its result must not be assigned
# back to `invalid` (that would discard the rejected-rows frame).
invalid.to_csv('invalid_ttd_drug_disease.tsv', index=False, sep='\t')
merged_df.to_csv(output_file_path, index=False, sep='\t')

print("Data processing completed and the new table has been saved.")


Data processing completed and the new table has been saved.
