In [1]:
import pandas as pd

### Check the number of unique compounds in each Target-2 plate

Extract Target-2 plates and compounds from all sources (excluding `source_9`, which uses 1536-well plates)

In [2]:
# Load the well-level and plate-level metadata tables.
well_df = pd.read_csv("datasets/metadata/well.csv.gz")
# Metadata_Source is dropped from the plate table; presumably the well table
# carries the same column and keeping both would yield suffixed duplicates
# after the merge on Metadata_Plate -- TODO confirm.
# `columns=` already selects the column axis, so no `axis` argument is needed.
plate_df = pd.read_csv("datasets/metadata/plate.csv.gz").drop(
    columns="Metadata_Source"
)

In [3]:
# Keep only TARGET2 plates, attach every well of those plates, and then
# discard the wells that belong to source_9.
merged_df = (
    plate_df[plate_df["Metadata_PlateType"] == "TARGET2"]
    .merge(well_df, how="inner", on="Metadata_Plate")
    .loc[lambda wells: wells["Metadata_Source"] != "source_9"]  # Filter out 1536 well plates
)

How many Target-2 plates have n wells?

In [4]:
# Count the wells (non-null Metadata_JCP2022 entries) on each plate, then
# tabulate how many plates share each well count.
#
# The result columns are named through rename_axis / reset_index(name=...)
# rather than by renaming "index" and "Metadata_JCP2022" afterwards: the
# labels produced by value_counts().reset_index() differ between pandas 1.x
# and pandas >= 2.0, so the rename-based approach mislabels the table on
# newer versions.
(
    merged_df.groupby(["Metadata_Source", "Metadata_Plate"])
    .Metadata_JCP2022.count()
    .value_counts()
    .rename_axis("n_wells")
    .reset_index(name="n_plates")
)

Unnamed: 0,n_wells,n_plates
0,384,113
1,383,15
2,382,3
3,378,1


How many Target-2 plates have n unique compounds?

In [5]:
# Count the distinct Metadata_JCP2022 IDs on each plate, then tabulate how
# many plates share each distinct-compound count.
#
# The result columns are named through rename_axis / reset_index(name=...)
# rather than by renaming "index" and "Metadata_JCP2022" afterwards: the
# labels produced by value_counts().reset_index() differ between pandas 1.x
# and pandas >= 2.0, so the rename-based approach mislabels the table on
# newer versions.
(
    merged_df.groupby(["Metadata_Source", "Metadata_Plate"])
    .Metadata_JCP2022.nunique()
    .value_counts()
    .rename_axis("n_compounds")
    .reset_index(name="n_plates")
)

Unnamed: 0,n_compounds,n_plates
0,302,96
1,301,19
2,300,11
3,299,3
4,287,2
5,298,1


How many `source_4` Target-2 plates have n unique compounds?

In [6]:
# Restrict to source_4, count the distinct Metadata_JCP2022 IDs on each
# plate, then tabulate how many plates share each distinct-compound count.
#
# The result columns are named through rename_axis / reset_index(name=...)
# rather than by renaming "index" and "Metadata_JCP2022" afterwards: the
# labels produced by value_counts().reset_index() differ between pandas 1.x
# and pandas >= 2.0, so the rename-based approach mislabels the table on
# newer versions.
(
    merged_df.query("Metadata_Source=='source_4'")
    .groupby(["Metadata_Source", "Metadata_Plate"])
    .Metadata_JCP2022.nunique()
    .value_counts()
    .rename_axis("n_compounds")
    .reset_index(name="n_plates")
)

Unnamed: 0,n_compounds,n_plates
0,302,17
1,301,5


List Target-2 plates in `source_4`

In [7]:
# Distinct plate IDs among the source_4 rows, in order of first appearance.
merged_df.loc[merged_df["Metadata_Source"] == "source_4", "Metadata_Plate"].unique()

array(['BR00121438', 'BR00121439', 'BR00121436', 'BR00121425',
       'BR00121437', 'BR00121430', 'BR00121429', 'BR00121424',
       'BR00121428', 'BR00121427', 'BR00121423', 'BR00121426',
       'BR00126113', 'BR00126114', 'BR00126115', 'BR00126116',
       'BR00126117', 'BR00127145', 'BR00127146', 'BR00127147',
       'BR00127148', 'BR00127149'], dtype=object)

### Map to `broad_sample` names

Retrieve the Target-2 platemap

In [8]:
# Read only the well position and broad_sample columns of the Target-2
# platemap, then rename both so they carry the Metadata_ prefix.
target_2_platemap = pd.read_csv(
    "JUMP-Target/JUMP-Target-2_compound_platemap.tsv",
    usecols=["well_position", "broad_sample"],
    sep="\t",
)
target_2_platemap = target_2_platemap.rename(
    columns=dict(
        well_position="Metadata_Well",
        broad_sample="Metadata_broad_sample",
    )
)

Retrieve a single Target-2 plate from `source_4` (`BR00121438`)

In [9]:
# All wells of plate BR00121438, used as the representative Target-2 layout.
jump_source_4_target2_platemap = merged_df[
    merged_df["Metadata_Plate"] == "BR00121438"
]

In [10]:
# Attach the broad_sample ID to each well of the selected plate by matching
# on well position; wells missing from either table are dropped (inner join).
merged_platemap_df = pd.merge(
    jump_source_4_target2_platemap,
    target_2_platemap,
    how="inner",
    on="Metadata_Well",
)

In [11]:
### Add other metadata from JUMP and Target-2

Read compound metadata

In [12]:
# Load the compound metadata table and prefix its InChIKey / InChI columns
# with JUMP_ so they stay distinguishable from the Target-2 identifiers.
compound_metadata_df = pd.read_csv("datasets/metadata/compound.csv.gz")
compound_metadata_df = compound_metadata_df.rename(
    columns=dict(
        Metadata_InChIKey="JUMP_InChIKey",
        Metadata_InChI="JUMP_InChI",
    )
)

In [13]:
# Load the selected Target-2 compound annotation columns. broad_sample gets
# the Metadata_ prefix (it is the join key); the remaining columns get a
# Target2_ prefix.
target_2_metadata = pd.read_csv(
    "JUMP-Target/JUMP-Target-2_compound_metadata.tsv",
    usecols=["broad_sample", "InChIKey", "pert_iname", "pubchem_cid", "smiles"],
    sep="\t",
)
target_2_metadata = target_2_metadata.rename(
    columns=dict(
        broad_sample="Metadata_broad_sample",
        InChIKey="Target2_InChIKey",
        pert_iname="Target2_pert_iname",
        pubchem_cid="Target2_pubchem_cid",
        smiles="Target2_smiles",
    )
)

Merge data frames

In [14]:
# Step 1: add the JUMP compound metadata, matched on the JCP2022 ID.
df = pd.merge(
    merged_platemap_df,
    compound_metadata_df,
    how="inner",
    on="Metadata_JCP2022",
)
# Step 2: add the Target-2 compound metadata, matched on the broad_sample ID.
df = pd.merge(
    df,
    target_2_metadata,
    how="inner",
    on="Metadata_broad_sample",
)

In [15]:
# Preview the first five rows of the merged table.
df.head(n=5)

Unnamed: 0,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Source,Metadata_Well,Metadata_JCP2022,Metadata_broad_sample,JUMP_InChIKey,JUMP_InChI,Target2_InChIKey,Target2_pert_iname,Target2_pubchem_cid,Target2_smiles
0,2021_04_26_Batch1,BR00121438,TARGET2,source_4,A01,JCP2022_043547,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,CCCCCCCCO
1,2021_04_26_Batch1,BR00121438,TARGET2,source_4,A02,JCP2022_050797,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...
2,2021_04_26_Batch1,BR00121438,TARGET2,source_4,H12,JCP2022_050797,BRD-K59632282-052-03-1,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...,LOUPRKONTZGTKE-LHHVKLHASA-N,quinidine,441074.0,COc1ccc2nccc([C@H](O)[C@H]3C[C@@H]4CC[N@]3C[C@...
3,2021_04_26_Batch1,BR00121438,TARGET2,source_4,A03,JCP2022_050997,BRD-A85242401-001-12-3,LPYXWGMUVRGUOY-UHFFFAOYSA-N,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O
4,2021_04_26_Batch1,BR00121438,TARGET2,source_4,A04,JCP2022_108326,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,OC(=O)c1ccccc1O


Remove DMSO wells

In [16]:
# Drop every row whose compound ID is JCP2022_033924 (the DMSO wells), then
# order the remaining rows by well position.
df = df.loc[df["Metadata_JCP2022"] != "JCP2022_033924"].sort_values(
    by="Metadata_Well"
)

Write to file

In [17]:
# Save the mapped table as a tab-separated file, without the row index.
df.to_csv(
    "output/target-2-mapped.tsv",
    index=False,
    sep="\t",
)