In [None]:
!pip install anndata

In [None]:
!pip install s3fs

In [2]:
import io
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import anndata as ad

In [3]:
%load_ext autoreload
%autoreload 2
from google.colab import drive
import os
gdrive_path='/content/gdrive/MyDrive/Masterpraktikum'
drive.mount('/content/gdrive', force_remount=True)
os.chdir(gdrive_path)

Mounted at /content/gdrive


## Loading profiles from the JUMP Cell Painting Datasets  

*adapted from: https://github.com/jump-cellpainting/datasets/blob/main/sample_notebook.ipynb*

### Which data will we be using?

- All Target2 plates
- 8 positive controls and DMSO of the COMPOUND plates

### Load metadata

The following files contain the metadata information for the entire dataset.
The schema is [here](metadata/README.md).

In [5]:
GIT_CLONE_DIR = "/content/gdrive/MyDrive/Masterpraktikum/Blockteil/datasets/"

In [6]:
plates = pd.read_csv(os.path.join(GIT_CLONE_DIR, "metadata/plate.csv.gz"))
wells = pd.read_csv(os.path.join(GIT_CLONE_DIR, "metadata/well.csv.gz"))
compound = pd.read_csv(os.path.join(GIT_CLONE_DIR, "metadata/compound.csv.gz"))

In [21]:
#tsv file obtained from https://github.com/jump-cellpainting/JUMP-Target/blob/master/JUMP-Target-2_compound_metadata.tsv
compound_metadata = pd.read_csv("/content/gdrive/MyDrive/Masterpraktikum/Blockteil/JUMP-Target-2_compound_metadata.tsv", delimiter = "\t")
compound_metadata.head()

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,KCNN4,trt,,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...
2,BRD-A85242401-001-12-3,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,P3H1,trt,,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O
4,BRD-K57313110-001-06-8,ODHCTXKNWHHXJC-VKHMYHEASA-N,pidolic-acid,7405.0,VEGFA,trt,,OC(=O)[C@@H]1CCC(=O)N1


### Subset on plates and wells of interest

**1. Subset on the target2 plates:**

In [None]:
sample_target2 = plates.query("Metadata_PlateType == 'TARGET2'")
sample_target2.head()

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType
161,source_10,2021_08_03_U2OS_48_hr_run12,Dest210726-160150,TARGET2
180,source_10,2021_08_09_U2OS_48_hr_run13,Dest210727-153003,TARGET2
195,source_10,2021_08_12_U2OS_48_hr_run15,Dest210803-153958,TARGET2
213,source_10,2021_08_17_U2OS_48_hr_run16,Dest210809-134534,TARGET2
231,source_10,2021_08_20_U2OS_48_hr_run17,Dest210810-173723,TARGET2


**2. Subset on the 8 poscons and the DMSO of the COMPOUND plates.** <br>
To do so, we need the JCP2022 IDs of the poscons and the DMSO. In the following, we obtain the JCP2022 IDs of the 8 positive controls from the POSCON8 plates. THE JCP2022 ID of the DMSO from a metadata file: https://github.com/jump-cellpainting/JUMP-Target/blob/master/JUMP-Target-2_compound_metadata.tsv

In [None]:
poscon8_plates = plates.query("Metadata_PlateType == 'POSCON8'")
plates_wells = pd.merge(poscon8_plates, wells, on = ["Metadata_Plate", "Metadata_Source"])
# get IDs of the 8 positive controls:
poscon8_ids = plates_wells.Metadata_JCP2022.unique()
poscon8_ids

array(['JCP2022_085227', 'JCP2022_037716', 'JCP2022_025848',
       'JCP2022_046054', 'JCP2022_035095', 'JCP2022_064022',
       'JCP2022_050797', 'JCP2022_012818'], dtype=object)

In [None]:
#DMSO:
lnchikey_dmso = compound_metadata.query("pert_iname == 'DMSO'")["InChIKey"]
lnchikey_dmso
# get JCP_2022 ID from metadata/compound.tsv
dmso_key = compound[compound['Metadata_InChIKey'].isin(lnchikey_dmso)]["Metadata_JCP2022"].unique()

In [None]:
poscon_negcon_ids = np.concatenate([dmso_key,poscon8_ids])
poscon_negcon_ids

array(['JCP2022_033924', 'JCP2022_085227', 'JCP2022_037716',
       'JCP2022_025848', 'JCP2022_046054', 'JCP2022_035095',
       'JCP2022_064022', 'JCP2022_050797', 'JCP2022_012818'], dtype=object)

In [None]:
# subset compound plates on those wells:
compound_plates = plates.query("Metadata_PlateType == 'COMPOUND'")
compound_wells = pd.merge(compound_plates, wells, on = ["Metadata_Plate", "Metadata_Source"])
sample_pc_nc = compound_wells[compound_wells['Metadata_JCP2022'].isin(poscon_negcon_ids)]

In [None]:
sample_pc_nc.head()

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_PlateType,Metadata_Well,Metadata_JCP2022
0,source_1,Batch1_20221004,UL001641,COMPOUND,A02,JCP2022_033924
1,source_1,Batch1_20221004,UL001641,COMPOUND,A03,JCP2022_085227
2,source_1,Batch1_20221004,UL001641,COMPOUND,A04,JCP2022_033924
43,source_1,Batch1_20221004,UL001641,COMPOUND,A45,JCP2022_033924
44,source_1,Batch1_20221004,UL001641,COMPOUND,A46,JCP2022_085227


### Loading profiles
Now let's load the profiles from these plates and wells.

Setting `columns = None` below will load all of the features, n_features = 4762

In [None]:
profile_formatter = (
    "s3://cellpainting-gallery/cpg0016-jump/"
    "{Metadata_Source}/workspace/profiles/"
    "{Metadata_Batch}/{Metadata_Plate}/{Metadata_Plate}.parquet"
)

**1. Load Target2 plates**

In [None]:
sample = sample_target2
sample

In [None]:
dframes_target2 = []
columns = None
for _, row in tqdm(sample.iterrows(), total=len(sample)):
    s3_path = profile_formatter.format(**row.to_dict())
    dframes_target2.append(
        pd.read_parquet(s3_path, storage_options={"anon": True}, columns=columns)
    )
dframes_target2 = pd.concat(dframes_target2)
dframes_target2

  0%|          | 0/141 [00:00<?, ?it/s]

Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,source_10,Dest210726-160150,A01,3846.4,8135.6,594.88,544.30,506.29,454.05,550.31,...,52.879,49.352,50.861,51.543,50.538,51.884,52.263,52.625,52.144,53.302
1,source_10,Dest210726-160150,A02,3484.2,7088.1,541.40,539.61,458.50,455.57,499.72,...,62.107,55.949,59.508,60.461,59.221,60.675,61.428,62.315,61.100,62.480
2,source_10,Dest210726-160150,A03,3281.9,6740.3,540.91,564.09,458.88,483.44,498.92,...,54.887,50.217,52.887,53.876,52.707,53.984,54.283,55.613,54.511,55.356
3,source_10,Dest210726-160150,A04,3476.5,7233.5,554.90,491.65,472.11,406.53,512.92,...,56.545,52.062,56.238,57.078,55.913,57.400,57.872,58.614,57.834,58.898
4,source_10,Dest210726-160150,A05,3592.1,7581.8,565.46,555.42,480.46,468.08,522.43,...,64.136,58.298,62.002,63.093,61.794,63.153,63.637,64.359,63.523,64.394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,source_9,GR00004409,Z44,2501.1,4571.1,560.30,560.26,493.83,493.32,526.64,...,54.051,53.101,51.344,51.742,51.299,51.507,51.875,53.164,51.923,52.834
1532,source_9,GR00004409,Z45,2497.0,4529.4,555.90,563.43,491.44,495.61,523.31,...,52.398,50.418,49.035,49.288,49.087,49.531,49.877,51.324,49.960,51.313
1533,source_9,GR00004409,Z46,2470.0,4479.2,548.36,578.95,482.96,512.79,515.18,...,60.019,58.125,56.818,57.189,56.923,57.286,57.917,59.310,57.726,59.307
1534,source_9,GR00004409,Z47,2725.4,5025.1,553.86,565.84,485.54,495.00,519.22,...,52.872,51.857,49.610,49.757,49.552,50.035,50.196,51.243,50.180,51.654


**2. Load positive and negative controls**

In [None]:
sample = sample_pc_nc#.iloc[[3]]
sample

In [None]:
sample_plates = sample.groupby(['Metadata_Source','Metadata_Batch', "Metadata_Plate"]).count().reset_index()

In [None]:
columns = None
dfs_pcs_ncs = []
for _, row in tqdm(sample_plates.iterrows(), total=len(sample_plates)):
    #pull plate
    s3_path = profile_formatter.format(**row.to_dict())
    plate = pd.read_parquet(s3_path, storage_options={"anon": True}, columns=columns)
    #subset on poscon and negcon wells
    wells_interest = sample_pc_nc[sample_pc_nc["Metadata_Plate"] == row["Metadata_Plate"]]["Metadata_Well"]
    #append to dataframe
    dfs_pcs_ncs.append(
        plate[plate["Metadata_Well"].isin(wells_interest)]
    )
dfs_pcs_ncs = pd.concat(dfs_pcs_ncs)

  0%|          | 0/1729 [00:00<?, ?it/s]

In [None]:
dfs_pcs_ncs.to_csv("/content/gdrive/MyDrive/Masterpraktikum/Blockteil/PCs_DMSO.csv")

### Create AnnData

**Target2**

In [None]:
# get .X
X = np.array(dframes_target2.iloc[:,3:])
# get obs
obs = dframes_target2.iloc[:,:3]
# create anndata
adata_target2 = ad.AnnData(X=X, obs=obs)
adata_target2

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 64464 × 4762
    obs: 'Metadata_Source', 'Metadata_Plate', 'Metadata_Well'

Save anndata:

In [None]:
adata_target2.write("/content/gdrive/MyDrive/Masterpraktikum/Blockteil/target2.h5ad")

**Poscon8 and DMSO**

### Add additional metadata

**Target2**

In [45]:
adata_target2 = ad.read_h5ad("/content/gdrive/MyDrive/Masterpraktikum/Blockteil/target2.h5ad")
adata_target2

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 64464 × 4762
    obs: 'Metadata_Source', 'Metadata_Plate', 'Metadata_Well'

In [46]:
metadata = compound.merge(wells, on="Metadata_JCP2022")
adata_target2.obs = pd.merge(adata_target2.obs, metadata, on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"])
adata_target2.obs.head()

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI
0,source_10,Dest210726-160150,A01,JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3"
1,source_10,Dest210726-160150,A02,JCP2022_050797,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...
2,source_10,Dest210726-160150,A03,JCP2022_050997,LPYXWGMUVRGUOY-UHFFFAOYSA-N,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...
3,source_10,Dest210726-160150,A04,JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8..."
4,source_10,Dest210726-160150,A05,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3


Let's add metadata of the microscopes:

In [47]:
microscope_config = pd.read_csv(os.path.join(GIT_CLONE_DIR, "metadata/microscope_config.csv"))
microscope_config['Metadata_Source'] = 'source_' + microscope_config['Metadata_Source'].astype(str)
microscope_config.head()

Unnamed: 0,Metadata_Source,Metadata_Microscope_Name,Metadata_Widefield_vs_Confocal,Metadata_Excitation_Type,Metadata_Objective_NA,Metadata_N_Brightfield_Planes_Min,Metadata_N_Brightfield_Planes_Max,Metadata_Distance_Between_Z_Microns,Metadata_Sites_Per_Well,Metadata_Filter_Configuration
0,source_1,Opera Phenix,Widefield,Laser,1.0,1,1,,4,H
1,source_2,CV8000,Confocal,Laser,1.0,3,3,8.0,6,A
2,source_3,Opera Phenix,Widefield,Laser,1.0,0,3,5.0,9,B
3,source_4,Opera Phenix,Widefield,Laser,1.0,3,3,5.0,9,B
4,source_5,CV8000,Confocal,Laser,0.75,3,3,5.0,9,C


In [48]:
adata_target2.obs = pd.merge(adata_target2.obs, microscope_config[["Metadata_Source", "Metadata_Microscope_Name"
, "Metadata_Widefield_vs_Confocal", "Metadata_Excitation_Type", "Metadata_Sites_Per_Well"]],
         on = "Metadata_Source")
adata_target2.obs.head()

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_Microscope_Name,Metadata_Widefield_vs_Confocal,Metadata_Excitation_Type,Metadata_Sites_Per_Well
0,source_10,Dest210726-160150,A01,JCP2022_043547,KBPLFHHGFOOTCA-UHFFFAOYSA-N,"InChI=1S/C8H18O/c1-2-3-4-5-6-7-8-9/h9H,2-8H2,1H3",CV8000,Confocal,Laser,6
1,source_10,Dest210726-160150,A02,JCP2022_050797,LOUPRKONTZGTKE-UHFFFAOYSA-N,InChI=1S/C20H24N2O2/c1-3-13-12-22-9-7-14(13)10...,CV8000,Confocal,Laser,6
2,source_10,Dest210726-160150,A03,JCP2022_050997,LPYXWGMUVRGUOY-UHFFFAOYSA-N,InChI=1S/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/...,CV8000,Confocal,Laser,6
3,source_10,Dest210726-160150,A04,JCP2022_108326,YGSDEFSMJLZEOE-UHFFFAOYSA-N,"InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8...",CV8000,Confocal,Laser,6
4,source_10,Dest210726-160150,A05,JCP2022_033924,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3,CV8000,Confocal,Laser,6


Let's try to add some information about the pertubations and target genes:

In [49]:
compound_metadata.head()

Unnamed: 0,broad_sample,InChIKey,pert_iname,pubchem_cid,target,pert_type,control_type,smiles
0,BRD-K09338665-001-07-1,KBPLFHHGFOOTCA-UHFFFAOYSA-N,1-octanol,957.0,GJB4,trt,,CCCCCCCCO
1,BRD-K48278478-001-01-2,LOUPRKONTZGTKE-AFHBHXEDSA-N,quinine,94175.0,KCNN4,trt,,COc1ccc2nccc([C@@H](O)[C@H]3C[C@@H]4CC[N@]3C[C...
2,BRD-A85242401-001-12-3,KRGQEOSDQHTZMX-IGCYCDGOSA-N,ascorbic-acid,9888239.0,P3H1,trt,,OC[C@H](O)[C@H]1OC(=O)C(=O)C1O
3,BRD-K93632104-001-17-2,YGSDEFSMJLZEOE-UHFFFAOYSA-N,salicylic-acid,118212070.0,AKR1C1,trt,,OC(=O)c1ccccc1O
4,BRD-K57313110-001-06-8,ODHCTXKNWHHXJC-VKHMYHEASA-N,pidolic-acid,7405.0,VEGFA,trt,,OC(=O)[C@@H]1CCC(=O)N1


In [50]:
adata_target2.obs = pd.merge(adata_target2.obs, compound_metadata, left_on = "Metadata_InChIKey", right_on = "InChIKey", how = "left")

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [51]:
adata_target2

AnnData object with n_obs × n_vars = 64464 × 4762
    obs: 'Metadata_Source', 'Metadata_Plate', 'Metadata_Well', 'Metadata_JCP2022', 'Metadata_InChIKey', 'Metadata_InChI', 'Metadata_Microscope_Name', 'Metadata_Widefield_vs_Confocal', 'Metadata_Excitation_Type', 'Metadata_Sites_Per_Well', 'broad_sample', 'InChIKey', 'pert_iname', 'pubchem_cid', 'target', 'pert_type', 'control_type', 'smiles'

For how many pertubations is a pert_type not available?

In [52]:
len(adata_target2.obs[adata_target2.obs["pert_type"].isna()]["Metadata_InChIKey"].unique())

120

For how many is a pert_type available?

In [53]:
len(adata_target2.obs[~(adata_target2.obs["pert_type"].isna())]["Metadata_InChIKey"].unique())

182

Let's save the anndata:

In [54]:
adata_target2.write("/content/gdrive/MyDrive/Masterpraktikum/Blockteil/target2_md.h5ad")