# TTD Data Extraction

This Jupyter notebook converts the TTD P1-01 dataset, downloaded from the [TTD website](https://ttd.idrblab.cn/full-data-download), into a CSV file.

The P1-01 file is distributed as a plain-text (`.txt`) file that also contains details of the data source.


In [None]:
import re
import pandas as pd
import numpy as np
import pathlib
import os
from tabulate import tabulate

## Preview the txt file

In [None]:
def read_ttd(file_path, lines=50):
    """
    Print the leading lines of a TTD .txt file so its layout can be inspected.

    A banner with the file path and the shown/total line counts is printed
    first, followed by each previewed line with trailing whitespace removed.

    Args:
        file_path (str): Location of the text file to preview.
        lines (int): Maximum number of lines to print.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.readlines()

    # The banner reports the total line count, so the whole file is read.
    total = len(contents)
    shown = min(lines, total)
    print(f"--- Previewing: {file_path} ({shown} of {total} lines) ---\n")

    for row in contents[:lines]:
        print(row.rstrip())



# Preview the raw file; "path/to/dataset" is a placeholder for the path to the TTD .txt file.
read_ttd("path/to/dataset")

## Extract table from the txt file

In [None]:


# File paths (update as needed).
# NOTE: this is a machine-specific absolute path; point it at your own copy of
# P1-01-TTD_target_download.txt before running the cells below.
file_target = "C:\\Users\\alici\\OneDrive - Nanyang Technological University\\FYP\\fyp-database\\Databases\\TTD\\P1-01-TTD_target_download.txt"


def parse_p1_01(file_path):
    """Parse P1-01-TTD_target_download.txt into one row per target-drug pair.

    Only tab-separated lines with at least three fields
    (``<id>\\t<key>\\t<value...>``) are treated as data. Blank lines, lines
    starting with ``---`` or ``TTD -``, and lines with fewer than three
    fields are skipped. A ``TARGETID`` key starts a new target record.

    Keys starting with ``DRUGINFO`` are collected per target and expanded so
    that each drug yields its own row carrying the target's other fields plus
    ``TTDDRUID``, ``DRUGNAME`` and ``CLINICAL_STATUS``. A DRUGINFO value with
    fewer than three tab-separated parts yields a row without those columns.
    A target with no DRUGINFO lines yields a single row without the drug
    columns instead of being dropped.

    Args:
        file_path (str): Path to the UTF-8 encoded P1-01 text file.

    Returns:
        pandas.DataFrame: One row per (target, drug) pair; drug columns are
        NaN for rows that have no usable drug information.
    """
    targets = []
    current = {}
    # Stream the file line by line instead of loading it all into memory.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith(('---', 'TTD -')):
                continue
            # maxsplit=2 keeps any further tabs inside the value field.
            fields = line.split('\t', 2)
            if len(fields) < 3:
                continue
            _, key, value = fields
            # A new TARGETID closes the record that was being accumulated.
            if key == 'TARGETID' and current:
                targets.append(current)
                current = {}
            if key.startswith('DRUGINFO'):
                current.setdefault('DRUGINFO', []).append(value)
            else:
                current[key] = value
    if current:
        targets.append(current)

    # Expand DRUGINFO entries into one row per drug.
    expanded_rows = []
    for entry in targets:
        base = {k: v for k, v in entry.items() if k != 'DRUGINFO'}
        drug_infos = entry.get('DRUGINFO', [])
        if not drug_infos:
            # Keep targets that have no drugs rather than silently losing them.
            expanded_rows.append(base)
            continue
        for drug_info in drug_infos:
            # Each row gets its own dict so rows never share state.
            row = dict(base)
            dparts = drug_info.split('\t')
            if len(dparts) >= 3:
                row['TTDDRUID'] = dparts[0]
                row['DRUGNAME'] = dparts[1]
                row['CLINICAL_STATUS'] = dparts[2]
            expanded_rows.append(row)
    return pd.DataFrame(expanded_rows)

# Parse the raw P1-01 file into a DataFrame.
df_target = parse_p1_01(file_target)
# Write the table to CSV; index=True also writes the row index as the first column.
df_target.to_csv('path/to/save/csv', index=True)
# Display the first rows as a quick sanity check.
df_target.head()


## Filter by clinical status

### Read file

In [None]:
# Read the parsed target table back in. The CSV was written with the row
# index as its first column, so index_col=0 restores it as the index instead
# of surfacing it as a spurious "Unnamed: 0" data column.
drug_target = pd.read_csv('path/to/save/csv', index_col=0)
drug_target.head()


### Filter by relevant clinical status

In [None]:
# Clinical status labels that count as an approved drug.
clinical_statuses = [
    'Approved',
    'Approved in China',
    'Approved in EU',
    'Approved (orphan drug)',
]
# Keep only the rows whose status is one of the labels above.
drug_target_filtered = drug_target.loc[
    drug_target['CLINICAL_STATUS'].isin(clinical_statuses)
]
print(drug_target_filtered.shape)


### Keep only necessary columns

In [None]:
# Reduce to unique drug/target name pairs and renumber the rows from zero.
drug_target_final = (
    drug_target_filtered.loc[:, ['DRUGNAME', 'TARGNAME']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(drug_target_final.shape)

# Save the de-duplicated pairs to CSV without the row index.
drug_target_final.to_csv('path/to/drugtargetfinal', index=False)