# TTD Data Extraction

The purpose of this jupyter notebook is to create a csv from the TTD Dataset P1-01 from [TTD Website](https://ttd.idrblab.cn/full-data-download).

This P1-01 file comes in a txt format with details of the data source.


In [1]:
import re
import pandas as pd
import numpy as np
import pathlib
import os
from tabulate import tabulate
import yaml

In [2]:
## load configuration paths from yaml

def load_config(yaml_path="P1-config.yaml"):
    with open(yaml_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

config = load_config()

## Preview the txt file

In [3]:
def read_ttd(file_path, lines=50):
    """
    Read and preview the first few lines of a TTD .txt file to inspect its format.
    
    Args:
        file_path (str): Path to the file.
        lines (int): Number of lines to preview.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    
    print(f"--- Previewing: {file_path} ({min(lines, len(data))} of {len(data)} lines) ---\n")
    for line in data[:lines]:
        print(line.rstrip())


p1_01_ttd = config["raw_paths"]["p1_01_ttd"]
read_ttd(p1_01_ttd)

--- Previewing: C:/Users/Admin/OneDrive - Nanyang Technological University/FYP_DATA/RAW_DATA/P1-01-TTD_target_download.txt (50 of 91045 lines) ---


TTD - Therapeutic Targets Database Full Data Download File
Title - TTD targets information in raw format
Version 10.1.01 (2024.01.10)
Provided by IDRB   Lab of Innovative Drug Reasearch and Bioinformatics
                   College of Pharmaceutical Sciences
                   Zhejiang University
                   https://idrblab.org/
Provided by BIDD   BioInformatic and Drug Design Group
                   Department of Pharmacy
                   National Unviersity of Singapore
                   https://bidd.group/
Any question about data provided here, please contact with:
Dr. Zhou (zhou_ying@zju.edu.cn) and Dr. Zhang (zhangyintao@zju.edu.cn)

--------------------------------------------------------------------------------------------------------
Abbreviations:
TARGETID	TTD Target ID
FORMERID	TTD Former Target ID
UNIPROID	Uniprot ID


## Extract table from the txt file

In [4]:



def parse_p1_01(file_path):
    """Parses P1-01-TTD_target_download.txt into structured rows."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    targets = []
    current = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith('---') or line.startswith('TTD -'):
            continue
        if '\t' in line:
            parts = line.split('\t')
            if len(parts) >= 3:
                target_id, key, value = parts[0], parts[1], '\t'.join(parts[2:])
                if key == 'TARGETID' and current:
                    targets.append(current)
                    current = {}
                if key.startswith('DRUGINFO'):
                    if 'DRUGINFO' not in current:
                        current['DRUGINFO'] = []
                    current['DRUGINFO'].append(value)
                else:
                    current[key] = value
    if current:
        targets.append(current)

    # Expand DRUGINFO entries
    expanded_rows = []
    for entry in targets:
        base = entry.copy()
        drug_infos = base.pop('DRUGINFO', [])
        for drug_info in drug_infos:
            dparts = drug_info.split('\t')
            if len(dparts) >= 3:
                base_copy = base.copy()
                base_copy['TTDDRUID'] = dparts[0]
                base_copy['DRUGNAME'] = dparts[1]
                base_copy['CLINICAL_STATUS'] = dparts[2]
                expanded_rows.append(base_copy)
            else:
                expanded_rows.append(base)
    return pd.DataFrame(expanded_rows)

df_target = parse_p1_01(p1_01_ttd)
parsed_target = config["processed_paths"]["parsed_target"]
df_target.to_csv(parsed_target, index=True)
df_target.head()


Unnamed: 0,TARGETID,FORMERID,UNIPROID,TARGNAME,GENENAME,TARGTYPE,SYNONYMS,FUNCTION,PDBSTRUC,BIOCLASS,ECNUMBER,SEQUENCE,TTDDRUID,DRUGNAME,CLINICAL_STATUS
0,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D0O6UY,Pemigatinib,Approved
1,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D09HNV,Intedanib,Approved
2,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D01PZD,Romiplostim,Approved
3,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D07PQJ,ARQ-087,Phase 3
4,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D05PWX,Sulfatinib,Phase 3


In [5]:
print(df_target.shape)

(45236, 15)


## Filter by clinical status

### Read file

In [10]:
# read parsed_target data
drug_target = pd.read_csv(parsed_target)
drug_target.head()


Unnamed: 0.1,Unnamed: 0,TARGETID,FORMERID,UNIPROID,TARGNAME,GENENAME,TARGTYPE,SYNONYMS,FUNCTION,PDBSTRUC,BIOCLASS,ECNUMBER,SEQUENCE,TTDDRUID,DRUGNAME,CLINICAL_STATUS
0,0,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D0O6UY,Pemigatinib,Approved
1,1,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D09HNV,Intedanib,Approved
2,2,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D01PZD,Romiplostim,Approved
3,3,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D07PQJ,ARQ-087,Phase 3
4,4,T47101,TTDC00024,FGFR1_HUMAN,Fibroblast growth factor receptor 1 (FGFR1),FGFR1,Successful,c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...,Required for normal mesoderm patterning and co...,6MZW; 6MZQ; 6C1O; 6C1C; 6C1B,Kinase,EC 2.7.10.1,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,D05PWX,Sulfatinib,Phase 3


### Filter by relevant clinical status

In [11]:
clinical_statuses = ['Approved', 'Approved in China', 'Approved in EU', 'Approved (orphan drug)']
drug_target_filtered = drug_target[drug_target['CLINICAL_STATUS'].isin(clinical_statuses)]
print(np.shape(drug_target_filtered))


(2742, 16)


### Keep only necessary columns

In [12]:
drug_target_final = drug_target_filtered[['DRUGNAME', 'TARGNAME']].drop_duplicates().reset_index(drop=True)
print(np.shape(drug_target_final))

# save to csv
drug_target_final_path = config["processed_paths"]["drug_target_final"]
drug_target_final.to_csv(drug_target_final_path, index=False)

(2740, 2)
