In [None]:
!pip3 install pandas

In [1]:
# Download the TTD target-to-pathway tables (KEGG and WikiPathways); `-O` writes each
# one into the current working directory under the given file name.
# NOTE(review): the recorded output of this cell shows "Permission denied" for the
# KEGG file -- confirm that file actually exists before running the extraction below.
!wget https://db.idrblab.net/ttd/sites/default/files/ttd_database/P4-01-Target-KEGGpathway_all.txt -O P4-01-Target-KEGGpathway_all.txt
!wget https://db.idrblab.net/ttd/sites/default/files/ttd_database/P4-06-Target-wikipathway_all.txt -O P4-06-Target-wikipathway_all.txt

P4-01-Target-KEGGpathway_all.txt: Permission denied
--2024-04-16 11:53:14--  https://db.idrblab.net/ttd/sites/default/files/ttd_database/P4-06-Target-wikipathway_all.txt
Resolving db.idrblab.net (db.idrblab.net)... 47.88.56.212
Connecting to db.idrblab.net (db.idrblab.net)|47.88.56.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 372677 (364K) [text/plain]
Saving to: ‘P4-06-Target-wikipathway_all.txt’


2024-04-16 11:53:14 (1.13 MB/s) - ‘P4-06-Target-wikipathway_all.txt’ saved [372677/372677]



In [2]:
import pandas as pd

def extract(filepath, id_prefix="KEGG", skip_lines=18):
    """Extract target-pathway rows from a TTD file and save them as a CSV file.

    Every line after the skipped preamble that splits into exactly three
    tab-separated fields (TTD target ID, pathway ID, pathway name) becomes one
    output row; any other line is ignored. The CSV is written next to the input:
    a trailing ".txt" is replaced by "_extracted.csv", otherwise
    "_extracted.csv" is appended so the input file is never overwritten.

    Args:
        filepath (str): The path to the TTD file.
        id_prefix (str, optional): The prefix prepended (as "<prefix>:<id>") to
            the pathway ID stored in ``target_id``. Defaults to "KEGG". It can
            be "KEGG" or "WikiPathways".
        skip_lines (int, optional): Number of leading lines to skip before
            parsing (presumably the TTD file preamble -- verify against the
            downloaded file). Defaults to 18.

    Returns:
        None. The result is written to disk.
    """
    # Fixed column order; also guarantees a header row when no data line matches.
    columns = [
        'source_id',
        'source_type',
        'target_id',
        'target_type',
        'source_name',
        'target_name',
        'relation_type',
        'ttd_gene_id',
        'ttd_pathway_id',
        'resource',
    ]
    data = []

    with open(filepath, "r") as file:
        # next(file, None) does not raise StopIteration when the file is
        # shorter than the preamble; the parsing loop then simply sees no lines.
        for _ in range(skip_lines):
            next(file, None)

        for line in file:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            ttdid, pathway_id, pathway_name = parts
            data.append({
                'source_id': ttdid,
                'source_type': 'Gene',
                'target_id': f"{id_prefix}:{pathway_id}",
                'target_type': 'Pathway',
                # Left empty here; the gene symbol is not available in this file.
                'source_name': '',
                'target_name': pathway_name,
                'relation_type': 'Hetionet::GpPW::Gene:Pathway',
                'ttd_gene_id': ttdid,
                'ttd_pathway_id': pathway_id,
                'resource': 'TTD'
            })

    df = pd.DataFrame(data, columns=columns)

    # Only rewrite a trailing ".txt"; str.replace would touch every occurrence
    # in the path and would return the input path itself when none is present.
    suffix = ".txt"
    if filepath.endswith(suffix):
        output_filepath = filepath[:-len(suffix)] + "_extracted.csv"
    else:
        output_filepath = filepath + "_extracted.csv"
    df.to_csv(output_filepath, index=False)

In [3]:
# Convert both downloaded TTD tables, tagging pathway IDs with their source database.
for ttd_file, prefix in (
    ("P4-01-Target-KEGGpathway_all.txt", "KEGG"),
    ("P4-06-Target-wikipathway_all.txt", "WikiPathways"),
):
    extract(ttd_file, prefix)

In [3]:
import pandas as pd

def format_df(target_pathway_file, gene_symbol_file, output_file):
    """Replace TTD target IDs with mapped gene symbols and write a TSV file.

    Args:
        target_pathway_file (str): Comma-separated file with a ``source_id``
            column holding TTD target IDs.
        gene_symbol_file (str): Mapping file with ``ttd_target_id`` and
            ``target_id`` columns; read as tab-separated when the name ends
            with "tsv", otherwise as comma-separated.
        output_file (str): Path of the tab-separated file to write.

    The first rows of the mapping table are printed. In the output,
    ``source_id`` holds the mapped value and ``source_name`` the part after
    the first ":" of that value; rows without a mapping keep both empty.
    """
    delimiter = "\t" if gene_symbol_file.endswith("tsv") else ","

    pathways = pd.read_csv(target_pathway_file, sep=',')
    id_map = pd.read_csv(gene_symbol_file, sep=delimiter).rename(
        columns={'target_id': 'gene_symbol'}
    )
    print(id_map.head())

    # Left join keeps every pathway row, including targets without a mapping.
    joined = pathways.merge(
        id_map[["ttd_target_id", "gene_symbol"]],
        left_on="source_id",
        right_on="ttd_target_id",
        how="left",
    )

    symbols = joined["gene_symbol"]
    result = joined.drop(columns=["ttd_target_id", "gene_symbol"])
    result["source_name"] = symbols.str.split(":").str[1]
    result["source_id"] = symbols

    result.to_csv(output_file, index=False, sep="\t")

In [7]:
# Map TTD target IDs to gene symbols for both extracted tables, using one shared mapping file.
for extracted_csv, formatted_tsv in (
    ("P4-01-Target-KEGGpathway_all_extracted.csv", "formatted_ttd_target_keggpathway.tsv"),
    ("P4-06-Target-wikipathway_all_extracted.csv", "formatted_ttd_target_wikipathway.tsv"),
):
    format_df(extracted_csv, "../idmapping/ttd_gene_id.tsv", formatted_tsv)

  ttd_target_id                                  target_name   gene_symbol  \
0        T47101  Fibroblast growth factor receptor 1 (FGFR1)  SYMBOL:FGFR1   
1        T59328      Epidermal growth factor receptor (EGFR)   SYMBOL:EGFR   
2        T89515                Polypeptide deformylase (PDF)    SYMBOL:PDF   
3        T08391                       Janus kinase 2 (JAK-2)   SYMBOL:JAK2   
4        T07663                 Phosphodiesterase 5A (PDE5A)  SYMBOL:PDE5A   

  ttd_uniprot_id target_type  \
0    FGFR1_HUMAN  Successful   
1     EGFR_HUMAN  Successful   
2     DEFM_HUMAN  Successful   
3     JAK2_HUMAN  Successful   
4    PDE5A_HUMAN  Successful   

                                            synonyms  \
0  c-fgr; bFGF-R-1; bFGF-R; N-sam; HBGFR; Fms-lik...   
1  Receptor tyrosine-protein kinase erbB-1; Proto...   
2                                                PDF   
3                       Tyrosine-protein kinase JAK2   
4  cGMP-specific 3',5'-cyclic phosphodiesterase; ...   

 