In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used by
# the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# **DEPOD 2019**

**Relation: Substrate-Dephosphosite**

*   Replace semicolon with comma in the 'Dephosphosites' column
*   Split the 'Dephosphosites' values and generate separate rows
*   Remove leading and trailing whitespace and any characters after a space
*   Keep only rows that start with 'Ser-', 'Thr-', or 'Tyr-' followed by a digit
*   Replace 'Ser-' with 'S', 'Thr-' with 'T', and 'Tyr-' with 'Y'
*   Output columns: Phosphatase, Substrate, Dephosphosite
*   Output rows: 931

In [None]:
# Build the Phosphatase / Substrate / Dephosphosite relation from the DEPOD 2019 sheet.
d1 = pd.read_excel("/content/drive/MyDrive/MS Thesis/Final/DEPOD_2019/PPase_protSubtrates_201903.xls")

d1 = d1[["Phosphatase entry names", "Substrate entry names", "Dephosphosites"]].drop_duplicates()

# One row per site: treat ';' as ',' and explode the comma-separated list.
d1['Dephosphosites'] = d1['Dephosphosites'].str.replace(';', ',', regex=False).str.split(',')
d1 = d1.explode('Dephosphosites', ignore_index=True)

# Trim each entry and drop everything from the first whitespace onwards.
d1['Dephosphosites'] = d1['Dephosphosites'].str.strip().str.replace(r'\s.*$', '', regex=True)

# Keep only Ser-/Thr-/Tyr- entries followed by a digit.
# The pattern is a raw string so '\d' is not an invalid string escape, and
# .copy() makes the filtered frame independent so the column assignments below
# do not trigger SettingWithCopyWarning.
d1 = d1[d1['Dephosphosites'].str.match(r'^(Ser|Thr|Tyr)-\d+', na=False)].copy()

# Three-letter residue prefix -> one-letter code.
for residue_prefix, residue_letter in (('Ser-', 'S'), ('Thr-', 'T'), ('Tyr-', 'Y')):
    d1['Dephosphosites'] = d1['Dephosphosites'].str.replace(residue_prefix, residue_letter, regex=False)

d1 = d1.rename(columns={"Phosphatase entry names": "Phosphatase", "Substrate entry names": "Substrate", "Dephosphosites": "Dephosphosite"})
d1 = d1.drop_duplicates()

# Qualify each site with its substrate: "<Substrate>_<site>".
d1['Dephosphosite'] = d1['Substrate'].astype(str) + '_' + d1['Dephosphosite'].astype(str)

d1.to_csv("phosphatase_substrate_dephosphosite.csv", index=False)

# **PhosphoSitePlus**

**Relation: Gene-Phosphosite**

*   Use tab-delimited values and generate CSV file
*   Choose rows for "human"
*   Remove "-p" from "MOD_RSD" column
*   Output columns: Gene, Phosphosite
*   Output rows: 238414

In [None]:
# Read the tab-separated PhosphoSitePlus phosphorylation-site table and keep a CSV copy.
phosphorylation_sites_path = "/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/Phosphorylation_site_dataset"
d2 = pd.read_csv(phosphorylation_sites_path, sep="\t")

d2.to_csv("phosphorylation_site_dataset.csv", index=False)

In [None]:
# Gene -> Phosphosite relation for human rows.
# Written as a single chain so no column is assigned on a filtered slice
# (the original pattern triggers pandas' SettingWithCopyWarning).
d2 = (
    d2.loc[d2["ORGANISM"] == "human", ["GENE", "MOD_RSD"]]
    # Strip the "-p" modification marker from the residue label.
    .assign(MOD_RSD=lambda frame: frame["MOD_RSD"].str.replace("-p", "", regex=False))
    .rename(columns={"GENE": "Gene", "MOD_RSD": "Phosphosite"})
    .drop_duplicates()
    .dropna()
)

d2.to_csv("gene_phosphosite.csv", index=False)

**Relation: Kinase-Substrate**

*   Use tab-delimited values and generate CSV file
*   Choose rows for "human" in SUB_ORGANISM and KIN_ORGANISM
*   Output columns: Kinase, Substrate, Phosphosite
*   Output rows: 14646

In [None]:
# Read the tab-separated PhosphoSitePlus kinase-substrate table and keep a CSV copy.
kinase_substrate_path = '/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/Kinase_Substrate_Dataset'
d3 = pd.read_csv(kinase_substrate_path, sep="\t")

d3.to_csv("/content/kinase_Substrate_Dataset.csv", index=False)

In [None]:
# Kinase / Substrate / Phosphosite relation where both kinase and substrate are human.
both_human = (d3['SUB_ORGANISM'] == 'human') & (d3['KIN_ORGANISM'] == 'human')
d3 = (
    d3.loc[both_human, ['GENE', 'SUBSTRATE', 'SUB_MOD_RSD']]
    .rename(columns={"GENE": "Kinase", "SUBSTRATE":"Substrate", "SUB_MOD_RSD": "Phosphosite"})
    .drop_duplicates()
    .dropna()
)

# Qualify each site with its substrate: "<Substrate>_<site>".
d3 = d3.assign(Phosphosite=d3['Substrate'].astype(str) + '_' + d3['Phosphosite'].astype(str))

d3.to_csv("/content/kinase_substrate_phosphosite.csv", index=False)

In [None]:
# Unique substrate residues (SUB_MOD_RSD) for human kinase / human substrate pairs.
#
# This works from a fresh read of the raw table instead of `d3`: when the
# notebook runs top to bottom, the previous cell has already reduced `d3` to
# Kinase/Substrate/Phosphosite, so filtering `d3` on SUB_ORGANISM raises KeyError.
# The result is bound to its own name so `d3` keeps the 'Kinase' column that the
# later merge relies on, and it is written to its own file so the three-column
# kinase_substrate_phosphosite.csv from the previous cell is not overwritten.
d3_raw = pd.read_csv(
    '/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/Kinase_Substrate_Dataset',
    delimiter="\t",
)
d3_sites = (
    d3_raw.loc[
        (d3_raw['SUB_ORGANISM'] == 'human') & (d3_raw['KIN_ORGANISM'] == 'human'),
        ['SUB_MOD_RSD'],
    ]
    .rename(columns={"SUB_MOD_RSD": "Phosphosite"})
    .drop_duplicates()
    .dropna()
)

d3_sites.to_csv("/content/kinase_substrate_phosphosite_unique.csv", index=False)

In [None]:
# Count the distinct values in each column of d3 and display the result.
count=d3.nunique()
count

Unnamed: 0,0
Phosphosite,3129


In [None]:
# Count the distinct values in each column of d3 and display the result.
count=d3.nunique()
count

Unnamed: 0,0
Kinase,421
Substrate,3099
Phosphosite,10203


**Relation: Gene-Phosphosite-disease**

*   Use tab-delimited values and generate CSV file
*   Choose rows for "human"
*   Remove "-p" from "MOD_RSD" column
*   Output columns: Disease, Gene, Phosphosite
*   Output rows: 2351

In [None]:
# Read the tab-separated PhosphoSitePlus disease-associated sites table and keep a CSV copy.
disease_sites_path = "/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/Disease-associated_sites"
d4 = pd.read_csv(disease_sites_path, sep="\t")

d4.to_csv("disease-associated_sites.csv", index=False)

In [None]:
# Disease / Gene / Phosphosite relation for human rows.
# Written as a single chain so no column is assigned on a filtered slice
# (the original pattern triggers pandas' SettingWithCopyWarning).
d4 = (
    d4.loc[d4["ORGANISM"] == "human", ["DISEASE", "GENE", "MOD_RSD"]]
    # Strip the "-p" modification marker from the residue label.
    .assign(MOD_RSD=lambda frame: frame["MOD_RSD"].str.replace("-p", "", regex=False))
    .rename(columns={"DISEASE": "Disease", "GENE": "Gene", "MOD_RSD": "Phosphosite"})
    .drop_duplicates()
    .dropna()
)

d4.to_csv("disease_gene_phosphosite.csv", index=False)

**Relation: Gene-Phosphosite**

*   Use tab-delimited values and generate CSV file
*   Choose rows for "human"
*   Choose only the values containing "-p" from the "MOD_RSD" column
*   Remove "-p" from "MOD_RSD" column
*   Output columns: Gene, Phosphosite
*   Output rows: 10525

In [None]:
# Read the tab-separated PhosphoSitePlus regulatory sites table and keep a CSV copy.
regulatory_sites_path = "/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/Regulatory_sites"
d5 = pd.read_csv(regulatory_sites_path, sep="\t")

d5.to_csv("regulatory_sites.csv", index=False)

In [None]:
# Gene -> Phosphosite relation for human regulatory sites carrying the "-p" marker.
d5 = d5.loc[d5["ORGANISM"] == "human", ["GENE", "MOD_RSD"]]

# na=False: a missing MOD_RSD would otherwise put NaN into the mask and make the
# boolean selection raise. regex=False: "-p" is meant as a literal substring.
has_phospho_marker = d5["MOD_RSD"].str.contains("-p", regex=False, na=False)

# Single chain so no column is assigned on a filtered slice (SettingWithCopyWarning).
d5 = (
    d5.loc[has_phospho_marker]
    .assign(MOD_RSD=lambda frame: frame["MOD_RSD"].str.replace("-p", "", regex=False))
    .rename(columns={"GENE": "Gene", "MOD_RSD": "Phosphosite"})
    .drop_duplicates()
    .dropna()
)

d5.to_csv("regulatory_gene_phosphosite.csv", index=False)

**Relation: Phosphosite_Gene (ALL)**

*   Output columns: Gene, Phosphosite
*   Output rows: 238723

In [None]:
# Union of the three Gene/Phosphosite tables (all sites, disease sites, regulatory sites).
# NOTE(review): these CSVs are read from the Drive folder, while the cells above
# write same-named files to the working directory — confirm the Drive copies are
# the ones produced by this notebook.
gene_site_tables = [
    pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/gene_phosphosite.csv'),
    # The disease table has an extra Disease column; keep only the shared pair.
    pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/disease_gene_phosphosite.csv')[['Gene', 'Phosphosite']],
    pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/PhosphoSitePlus_2025/regulatory_gene_phosphosite.csv'),
]

phosphosite = (
    pd.concat(gene_site_tables, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
phosphosite.to_csv('all_gene_phosphosite.csv', index=False)

In [None]:
# Attach each kinase gene's own phosphosites to its kinase-substrate rows.
# The right join keeps every row of d3; a kinase with several known sites
# yields one output row per (site, d3 row) combination.
phosphosite = phosphosite.rename(columns={'Gene': 'Kinase'})
m1 = phosphosite.merge(d3, how='right', on='Kinase')

In [None]:
# Display the merged kinase / phosphosite / substrate frame.
m1

Unnamed: 0,Kinase,Phosphosite_x,Substrate,Phosphosite_y
0,EIF2AK1,S6,eIF2-alpha,S52
1,EIF2AK1,S144,eIF2-alpha,S52
2,EIF2AK1,Y193,eIF2-alpha,S52
3,EIF2AK1,S253,eIF2-alpha,S52
4,EIF2AK1,S258,eIF2-alpha,S52
...,...,...,...,...
473664,ULK2,S771,DENND3,S472
473665,ULK2,T826,DENND3,S472
473666,ULK2,T844,DENND3,S472
473667,ULK2,Y1018,DENND3,S472


# **STRING PPI**

**Relation: Protein-Protein**

*   Read text file into a DataFrame, use space as a separator and assign column names
*   Remove the first row
*   Convert "confidence_score" column to numeric and set invalid values to NaN
*   Keep rows for "confidence_score" > 700
*   Protein values are in Ensembl ID format, which cannot be merged directly with any of the other datasets
*   Output Columns: protein1, protein2, confidence_score
*   Output rows: 472000



In [None]:
# Load the STRING protein-protein interaction list and keep high-confidence pairs.
d6 = pd.read_csv(
    '/content/drive/MyDrive/MS Thesis/Final/String PPI/PPI.txt',
    sep=r'\s+',      # raw string: '\s' in a plain literal is an invalid escape sequence
    header=None,
    skiprows=1,      # skip the file's first line here instead of loading it as a data row and dropping it
    names=["protein1", "protein2", "confidence_score"],
)

# Any value that is not a number becomes NaN and is removed by the filter below.
d6["confidence_score"] = pd.to_numeric(d6["confidence_score"], errors='coerce')
d6 = d6[d6["confidence_score"] > 700]

d6.to_csv('/content/high_confidence_score.csv', index=False)

  d6 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/String PPI/PPI.txt', sep='\s+', header=None, names=["protein1", "protein2", "confidence_score"])


# **Uniprot**

**Relation: Gene-Gene**

*   Read file using tab as a separator into a Pandas DataFrame
*   Convert 'Gene Names' to strings and turn the whitespace-separated names into a comma-separated list
*   Convert 'STRING' to strings, keep only values that start with '9606.', and strip leading and trailing whitespace
*   Remove trailing semicolons
*   Split 'Gene Names' by commas and explode into separate rows
*   Output Columns: Gene Names, STRING
*   Output rows: 43009

In [None]:
# Gene name -> STRING protein ID lookup built from the UniProt export.
uniprot_path = '/content/drive/MyDrive/MS Thesis/Final/Uniprot/uniprot.tsv'

d7 = pd.read_csv(uniprot_path, delimiter="\t")

# Drop rows missing either field *before* casting to str. Casting first turns
# NaN into the literal text 'nan', which then has to be filtered out by value.
d7 = d7[['Gene Names', 'STRING']].dropna(how='any').astype(str)

# Keep entries whose STRING value starts with '9606.', then tidy the ID:
# trim surrounding whitespace and remove trailing semicolons.
d7 = d7[d7['STRING'].str.startswith('9606.')]
d7 = d7.assign(
    **{
        'STRING': d7['STRING'].str.strip().str.rstrip(';'),
        # Gene names are whitespace-separated; split them straight into a list.
        'Gene Names': d7['Gene Names'].str.split(),
    }
)

# One row per gene name. A value with no names explodes to NaN and is dropped.
d7 = (
    d7.explode('Gene Names')
    .dropna(subset=['Gene Names'])
    .drop_duplicates()
)

d7.to_csv("/content/uniprot_gene.csv", index=False)

# **Merge Uniprot and STRING PPI**

*   Merged uniprot and string ppi to get the gene names for protein1 and protein2 separately
*   Output Columns: protein1, protein2, gene1, gene2
*   Output rows: 2832966

In [None]:
# Map both STRING protein IDs of every high-confidence pair to gene names.
d6 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/String PPI/high_confidence_score.csv')
d7 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/Uniprot/uniprot_gene.csv')

d8 = (
    d6
    # Left joins keep every interaction even when a protein has no gene name.
    .merge(d7.rename(columns={'Gene Names': 'gene1'}), how='left', left_on='protein1', right_on='STRING')
    .merge(d7.rename(columns={'Gene Names': 'gene2'}), how='left', left_on='protein2', right_on='STRING')
    # The second join suffixes the two STRING key columns as _x / _y.
    .drop(columns=['confidence_score', 'STRING_x', 'STRING_y'])
)

d8.to_csv("/content/gene_protein.csv", index=False)

# **DrugMap**

*   Create dictionary d9
*   Split lines using tab separator
*   Generate 3 fields for 3 columns
*   For each line, if 'field_name' is 'ID', 'DN', or 'DE', store 'field_value' in the matching d9 column; the other dictionaries (d10–d15) are filled the same way from their own field names
*   Output Columns:

   *   d9: Drug ID, Drug Name, Disease Entry
   *   d10: DTT ID, Gene Name
   *   d11: DTP ID, Gene Name
   *   d12: DME ID, Gene Name
   *   d13: DTT ID, Drug ID
   *   d14: DTP ID, Drug ID
   *   d15: DME ID, Drug ID
*   Output rows:
   *   d9: 32835
   *   d10: 4352
   *   d11: 423
   *   d12: 889
   *   d13: 32148
   *   d14: 270
   *   d15: 848

In [None]:
# Parse the DrugMap drug file into one row per record (Drug ID, Drug Name, Disease Entry).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
drug_tag_to_column = {'ID': 'Drug ID', 'DN': 'Drug Name', 'DE': 'Disease Entry'}
drug_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/01 General Information of Drug.txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            drug_rows.append(dict.fromkeys(drug_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in drug_tag_to_column:
            drug_rows[-1][drug_tag_to_column[tag]] = value

d9 = pd.DataFrame(drug_rows, columns=list(drug_tag_to_column.values()))

d9.to_csv('dm1.csv', index=False)

In [None]:
# Parse the DrugMap therapeutic-target (DTT) file into one row per record (DTT ID, Gene Name).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
dtt_tag_to_column = {'ID': 'DTT ID', 'GN': 'Gene Name'}
dtt_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/02 General Information of Drug Therapeutic Target (DTT).txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            dtt_rows.append(dict.fromkeys(dtt_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in dtt_tag_to_column:
            dtt_rows[-1][dtt_tag_to_column[tag]] = value

d10 = pd.DataFrame(dtt_rows, columns=list(dtt_tag_to_column.values()))

d10.to_csv('dm2.csv', index=False)

In [None]:
# Parse the DrugMap transporter (DTP) file into one row per record (DTP ID, Gene Name).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
dtp_tag_to_column = {'ID': 'DTP ID', 'GN': 'Gene Name'}
dtp_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/03 General Information of Drug Transporter (DTP).txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            dtp_rows.append(dict.fromkeys(dtp_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in dtp_tag_to_column:
            dtp_rows[-1][dtp_tag_to_column[tag]] = value

d11 = pd.DataFrame(dtp_rows, columns=list(dtp_tag_to_column.values()))

d11.to_csv('dm3.csv', index=False)

In [None]:
# Parse the DrugMap metabolizing-enzyme (DME) file into one row per record (DME ID, Gene Name).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
dme_tag_to_column = {'ID': 'DME ID', 'GN': 'Gene Name'}
dme_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/04 General Information of Drug Metabolizing Enzyme (DME) (1).txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            dme_rows.append(dict.fromkeys(dme_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in dme_tag_to_column:
            dme_rows[-1][dme_tag_to_column[tag]] = value

d12 = pd.DataFrame(dme_rows, columns=list(dme_tag_to_column.values()))

d12.to_csv('dm4.csv', index=False)

In [None]:
# Parse the Drug-to-DTT mapping file into one row per record (Drug ID, DTT ID).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
drug_dtt_tag_to_column = {'DI': 'Drug ID', 'TI': 'DTT ID'}
drug_dtt_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/06 Drug to DTT Mapping Information.txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            drug_dtt_rows.append(dict.fromkeys(drug_dtt_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in drug_dtt_tag_to_column:
            drug_dtt_rows[-1][drug_dtt_tag_to_column[tag]] = value

d13 = pd.DataFrame(drug_dtt_rows, columns=list(drug_dtt_tag_to_column.values()))

d13.to_csv('dm6.csv', index=False)

In [None]:
# Parse the Drug-to-DTP mapping file into one row per record (DTP ID, Drug ID).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
drug_dtp_tag_to_column = {'TI': 'DTP ID', 'DI': 'Drug ID'}
drug_dtp_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/07 Drug to DTP Mapping Information.txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            drug_dtp_rows.append(dict.fromkeys(drug_dtp_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in drug_dtp_tag_to_column:
            drug_dtp_rows[-1][drug_dtp_tag_to_column[tag]] = value

d14 = pd.DataFrame(drug_dtp_rows, columns=list(drug_dtp_tag_to_column.values()))

d14.to_csv('dm7.csv', index=False)

In [None]:
# Parse the Drug-to-DME mapping file into one row per record (DME ID, Drug ID).
# Each usable line is "<record id>\t<field tag>\t<value>"; a new row starts
# whenever the record id in the first column changes.
drug_dme_tag_to_column = {'EI': 'DME ID', 'DI': 'Drug ID'}
drug_dme_rows = []
previous_id = None

with open('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/08 Drug to DME Mapping Information.txt', 'r') as handle:
    for raw_line in handle:
        parts = raw_line.strip().split('\t')
        if len(parts) < 3:
            continue  # preamble, blank, or incomplete line

        record_id, tag, value = (part.strip() for part in parts[:3])

        if record_id != previous_id:
            previous_id = record_id
            drug_dme_rows.append(dict.fromkeys(drug_dme_tag_to_column.values(), ''))

        # Tags outside the map are ignored; a repeated tag overwrites the earlier value.
        if tag in drug_dme_tag_to_column:
            drug_dme_rows[-1][drug_dme_tag_to_column[tag]] = value

d15 = pd.DataFrame(drug_dme_rows, columns=list(drug_dme_tag_to_column.values()))

d15.to_csv('dm8.csv', index=False)

*   Output Columns: Drug Name, Disease Entry, gene
*   Output rows: 29407782

In [None]:
# Combine the DrugMap tables into a long Drug Name / Disease Entry / gene table.
dm1 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm1.csv')
dm2 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm2.csv')
dm3 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm3.csv')
dm4 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm4.csv')
dm6 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm6.csv')
dm7 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm7.csv')
dm8 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/dm8.csv')

# Pair each target / transporter / enzyme gene with the drugs mapped to it.
merge1 = dm2.merge(dm6, how='outer', on='DTT ID')
merge2 = dm3.merge(dm7, how='outer', on='DTP ID')
merge3 = dm4.merge(dm8, how='outer', on='DME ID')

# Attach the three gene sources to the drug table, naming each gene column
# before the join so the three "Gene Name" columns never collide.
merge = (
    dm1
    .merge(merge1.rename(columns={"Gene Name": "gene1"}), how='outer', on='Drug ID')
    .merge(merge2.rename(columns={"Gene Name": "gene2"}), how='outer', on='Drug ID')
    .merge(merge3.rename(columns={"Gene Name": "gene3"}), how='outer', on='Drug ID')
    .drop(columns=['Drug ID', 'DTT ID', 'DTP ID', 'DME ID'])
)

# Join the three gene columns into one ", "-separated string (missing -> ''),
# then split that string again so every gene lands on its own row.
joined_genes = merge[['gene1', 'gene2', 'gene3']].fillna('').agg(', '.join, axis=1)
merge = merge.drop(columns=['gene1', 'gene2', 'gene3']).assign(gene=joined_genes)
d16 = merge.assign(gene=merge['gene'].str.split(', ')).explode('gene')

d16.to_csv("/content/drug_disease_gene.csv", index=False)

**Relation: Gene-Disease**

*   Output Columns: Disease, gene
*   Output rows: 7213

In [None]:
# Disease -> gene relation derived from the combined DrugMap table.
# Only the two needed columns are read, and they are read as text: the file is
# very large, and default chunked type inference can give one column mixed types.
# NOTE(review): the saved cell output echoes the read_csv line, which looks like
# a pandas DtypeWarning — confirm.
d17 = pd.read_csv(
    '/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/drug_disease_gene.csv',
    usecols=['Disease Entry', 'gene'],
    dtype=str,
)

d17 = d17[['Disease Entry', 'gene']]
# Drop the two placeholder values that carry no disease / gene information.
d17 = d17[(d17['Disease Entry'] != 'Discovery agent') & (d17['gene'] != 'NO-GeName')]
d17 = d17.drop_duplicates().dropna()
d17 = d17.rename(columns={"Disease Entry": "Disease"})

d17.to_csv("/content/disease_gene.csv", index=False)

  d17 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/drug_disease_gene.csv')


**Relation: Gene-Drug**

*   Output Columns: Drug, gene
*   Output rows: 30104

In [None]:
# Drug -> gene relation derived from the combined DrugMap table.
# Only the two needed columns are read, and they are read as text: the file is
# very large, and default chunked type inference can give one column mixed types
# (e.g. numeric-looking drug names parsed as numbers in some chunks only).
# NOTE(review): the saved cell output echoes the read_csv line, which looks like
# a pandas DtypeWarning — confirm.
d18 = pd.read_csv(
    '/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/drug_disease_gene.csv',
    usecols=['Drug Name', 'gene'],
    dtype=str,
)

d18 = d18[['Drug Name', 'gene']]
# Drop the placeholder gene value.
d18 = d18[(d18['gene'] != 'NO-GeName')]
d18 = d18.drop_duplicates().dropna()
d18 = d18.rename(columns={"Drug Name": "Drug"})

d18.to_csv("/content/drug_gene.csv", index=False)

  d18 = pd.read_csv('/content/drive/MyDrive/MS Thesis/Final/DrugMap_2023/drug_disease_gene.csv')


# **MSigDB**

*   Output Columns: Pathway, gene
*   Output rows: 447025

In [None]:
# Pathway -> gene relation from the MSigDB export.
# Each line is "<pathway>\t<gene>, <gene>, ...".
msigdb_path = "/content/drive/MyDrive/MS Thesis/Final/MSigDB/MsigDB.txt"

data = []
# Stream the file line by line instead of loading it all with readlines().
with open(msigdb_path, "r") as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            continue  # a blank line (e.g. at end of file) is not a record

        parts = line.split("\t")
        if len(parts) != 2:
            # Same exception type as the original tuple-unpack failure, but it
            # says which line is malformed.
            raise ValueError(
                f"{msigdb_path}: line {line_number} has {len(parts)} tab-separated "
                "field(s); expected 'pathway<TAB>genes'"
            )

        pathway, genes = parts
        data.extend([pathway, gene] for gene in genes.split(", "))

d19 = pd.DataFrame(data, columns=["Pathway", "gene"])
d19 = d19.drop_duplicates().dropna()

d19.to_csv("MSigDB.csv", index=False)

# **PTMSigDB**

In [None]:
import pandas as pd

# Pathway -> phosphosite relation from the PTMsigDB export.
# Each usable line is "<pathway>: <site> <site> ..."; lines without ": " are skipped.
ptmsigdb_path = "/content/drive/MyDrive/MS Thesis/Final/MSigDB/PTMsigDB.txt"

with open(ptmsigdb_path, "r") as handle:
    ptm_lines = handle.readlines()

data = []
for ptm_line in ptm_lines:
    if ": " not in ptm_line:
        continue
    pathway, members = ptm_line.strip().split(": ", 1)
    # Hyphens in each site label become underscores.
    data.extend([pathway, member.replace("-", "_")] for member in members.split(" "))

d20 = pd.DataFrame(data, columns=["Pathway", "Phosphosite"]).drop_duplicates().dropna()

d20.to_csv("PTMSigDB.csv", index=False)