In [1]:
import pandas as pd

In [2]:
# Column order used for the mapped output table.
col_order = [
    "Metadata_JCP2022",
    "Metadata_broad_sample",
    "Metadata_InChIKey",
    "Metadata_pert_iname",
    "Metadata_pubchem_cid",
]

In [3]:
# Read only the two columns needed for the mapping (compound ID and InChIKey)
# from the gzipped compound metadata table.
datasets_metadata_df = pd.read_csv(
    'datasets/metadata/compound.csv.gz',
    usecols=["Metadata_JCP2022", "Metadata_InChIKey"],
)
datasets_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N
1,JCP2022_000002,AAAJHRMBUHXWLD-UHFFFAOYSA-N
2,JCP2022_000003,AAALVYBICLMAMA-UHFFFAOYSA-N
3,JCP2022_000004,AAANUZMCJQUYNX-UHFFFAOYSA-N
4,JCP2022_000005,AAAQFGUYHFJNHI-UHFFFAOYSA-N


In [4]:
# Load the Target-2 compound metadata (tab-separated) and rename its columns
# to the "Metadata_"-prefixed names used by the other tables in this notebook.
# dtype=str keeps every column, including pubchem_cid, as text.
target2_metadata_df = pd.read_csv(
    "JUMP-Target/JUMP-Target-2_compound_metadata.tsv",
    usecols=["broad_sample", "InChIKey", "pert_iname", "pubchem_cid"],
    dtype=str,
    sep="\t",
).rename(
    {
        "broad_sample": "Metadata_broad_sample",
        "InChIKey": "Metadata_InChIKey",
        "pert_iname": "Metadata_pert_iname",
        "pubchem_cid": "Metadata_pubchem_cid",
    },
    axis="columns",
)
target2_metadata_df.head()

Unnamed: 0,Metadata_broad_sample,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-UHFFFAOYSA-N,quinine,94175
2,BRD-A85242401-001-12-3,LPYXWGMUVRGUOY-UHFFFAOYSA-N,ascorbic-acid,9888239
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070
4,BRD-K57313110-001-06-8,GUUGZPSUOTWOMD-UHFFFAOYSA-N,pidolic-acid,7405


In [5]:
# Attach Metadata_JCP2022 to each Target-2 row by matching on InChIKey.
# The left join keeps every Target-2 row; rows whose InChIKey has no match in
# datasets_metadata_df get a missing Metadata_JCP2022.
merged_df = pd.merge(
    target2_metadata_df,
    datasets_metadata_df,
    how='left',
    on="Metadata_InChIKey",
)[col_order]
merged_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid
0,JCP2022_043547,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957
1,JCP2022_050797,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-UHFFFAOYSA-N,quinine,94175
2,JCP2022_050997,BRD-A85242401-001-12-3,LPYXWGMUVRGUOY-UHFFFAOYSA-N,ascorbic-acid,9888239
3,JCP2022_108326,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070
4,JCP2022_027911,BRD-K57313110-001-06-8,GUUGZPSUOTWOMD-UHFFFAOYSA-N,pidolic-acid,7405


In [6]:
# Save the mapped table as a tab-separated file without the row index.
merged_df.to_csv(
    "output/target-2-JUMP-mapped.tsv",
    index=False,
    sep="\t",
)

### Positive control (poscon) mapping

In [7]:
# Load the poscon list that is matched against Metadata_pert_iname below.
poscon_df = pd.read_csv(
    'input/poscons.csv',
)
poscon_df.head()

Unnamed: 0,Metadata_pert_iname
0,AMG900
1,NVS-PAK1-1
2,dexamethasone
3,LY2109761
4,FK-866


In [8]:
# Keep only the mapped rows whose Metadata_pert_iname also appears in
# poscon_df (inner join on the compound name).
poscon_merged_df = pd.merge(
    merged_df,
    poscon_df,
    how='inner',
    on="Metadata_pert_iname",
)
poscon_merged_df

Unnamed: 0,Metadata_JCP2022,Metadata_broad_sample,Metadata_InChIKey,Metadata_pert_iname,Metadata_pubchem_cid
0,JCP2022_046054,BRD-K58550667-001-12-9,KPBNHDGDUADAGP-UHFFFAOYSA-N,FK-866,6914657
1,JCP2022_085227,BRD-K91188791-001-17-5,SRVFFFJZQVENJC-UHFFFAOYSA-N,aloxistatin,65663
2,JCP2022_050797,BRD-K59632282-052-03-1,LOUPRKONTZGTKE-UHFFFAOYSA-N,quinidine,441074
3,JCP2022_025848,BRD-A10188456-001-04-9,GJFCONYVAUNLKB-UHFFFAOYSA-N,dexamethasone,5702035
4,JCP2022_025848,BRD-K38775274-001-22-1,GJFCONYVAUNLKB-UHFFFAOYSA-N,dexamethasone,5743
5,JCP2022_064022,BRD-K28132190-001-02-0,OINGHOPGNMYCAB-UHFFFAOYSA-N,NVS-PAK1-1,122187564
6,JCP2022_012818,BRD-K89517477-001-03-9,CQKBSRPVZZLCJE-UHFFFAOYSA-N,TC-S-7004,57523919
7,JCP2022_037716,BRD-K21728777-001-02-3,IVUGFMLRJOCGAS-UHFFFAOYSA-N,AMG900,24856041
8,JCP2022_035095,BRD-K47557313-001-03-9,IHLVSLOZUHKNMQ-UHFFFAOYSA-N,LY2109761,11655119


In [9]:
# Save the poscon subset as a tab-separated file without the row index.
poscon_merged_df.to_csv(
    'output/target-2-JUMP-poscon-mapped.tsv',
    index=False,
    sep='\t',
)

In [10]:
# Export both tables to a single workbook, one sheet per table.
# Both sheets are written through one ExcelWriter so the file is created once
# and never reopened in append mode (mode="a"), which the xlsxwriter engine
# does not support.
with pd.ExcelWriter("output/target-2-JUMP-mapping.xlsx") as writer:
    merged_df.to_excel(
        writer, index=False, sheet_name="Target-2-JUMP"
    )
    poscon_merged_df.to_excel(
        writer, index=False, sheet_name="Target-2-JUMP-poscon"
    )