In [1245]:
import pandas as pd
import re
import scanpy as sc

from numpy import where
from pathlib import Path

# Purpose
Adamson 2016 dataset as provided by the GEARS authors appears to have some inconsistencies compared to the source data provided by the Adamson et. al. authors via GEO (https://ftp.ncbi.nlm.nih.gov/geo/series/GSE90nnn/GSE90546/suppl/GSE90546_RAW.tar). This notebook matches the dataset provided by GEARS with the original GEO submission and updates any metadata as necessary.

In [1246]:
DATA_DIR = Path('/work/shared/perturbseq_testing/data/adamson_2016')

## Load GEARS data

In [1247]:
gears = sc.read_h5ad(DATA_DIR.joinpath('perturb_processed.h5ad'))
gears

AnnData object with n_obs × n_vars = 68603 × 5060
    obs: 'condition', 'cell_type', 'dose_val', 'control', 'condition_name'
    var: 'gene_name'
    uns: 'non_dropout_gene_idx', 'non_zeros_gene_idx', 'rank_genes_groups_cov_all', 'top_non_dropout_de_20', 'top_non_zero_de_20'

### save the ordering of barcodes in .obs so that the fixed version can be sorted in the same order

In [1248]:
gears_original_index = gears.obs.index
print(gears_original_index)

Index(['AAACATACACCGAT-1', 'AAACATACAGAGAT-1', 'AAACATACCAGAAA-1',
       'AAACATACGTTGAC-1', 'AAACATACTGTTCT-1', 'AAACCGTGCAGCTA-1',
       'AAACCGTGCCTGAA-1', 'AAACCGTGCGGAGA-1', 'AAACCGTGGAACTC-1',
       'AAACGCACCATGGT-1',
       ...
       'TTTGACTGTACGAC-10', 'TTTGCATGAATCGC-10', 'TTTGCATGCACACA-10',
       'TTTGCATGCCCGTT-10', 'TTTGCATGCCTATT-10', 'TTTGCATGCTTTAC-10',
       'TTTGCATGGAGGAC-10', 'TTTGCATGTAGAGA-10', 'TTTGCATGTCAAGC-10',
       'TTTGCATGTGGAGG-10'],
      dtype='object', name='cell_barcode', length=68603)


In [1249]:
gears.obs.index.is_unique

True

## Load Adamson data (generate from NCBI GEO source for pilot, UPR, and epistasis experiments)

In [1250]:
pilot = sc.read(DATA_DIR.joinpath('adamson_2016_pilot_from_source.h5ad'))
pilot



AnnData object with n_obs × n_vars = 5768 × 35635
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'good_coverage', 'number_of_cells', 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'ensembl_id', 'gene_symbol', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [1251]:
upr = sc.read_h5ad(DATA_DIR.joinpath('adamson_2016_upr_from_source.h5ad'))
upr



AnnData object with n_obs × n_vars = 65337 × 32738
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'good_coverage', 'number_of_cells', 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'ensembl_id', 'gene_symbol', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [1252]:
epistasis = sc.read(DATA_DIR.joinpath('adamson_2016_epistasis_from_source.h5ad'))
epistasis



AnnData object with n_obs × n_vars = 15006 × 32738
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'good_coverage', 'number_of_cells', 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'ensembl_id', 'gene_symbol', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

## Mark barcode collisions that results from GEARS authors combining all 3 sub-experiments in Adamson 2016 into a single data object
It appears from the GEARS AnnData object that they included all and then resolved duplicate indices resulting from the barcode collisions by using AnnData's make_unique functionality.

### pilot/UPR collisions

In [1253]:
pilot_upr_collisions = set(pilot.obs_names).intersection(set(upr.obs_names))
len(pilot_upr_collisions)

46

### pilot/epistasis collisions

In [1254]:
pilot_epistasis_collisions = set(pilot.obs_names).intersection(set(epistasis.obs_names))
len(pilot_epistasis_collisions)

53

### UPR/epistasis collisions

In [1255]:
upr_epistasis_collisions = set(upr.obs_names).intersection(set(epistasis.obs_names))
len(upr_epistasis_collisions)

136

### check for 3-way collisions within Adamson 2016

In [1256]:
len(pilot_upr_collisions.intersection(upr_epistasis_collisions))

0

### total collisions within Adamson 2016 data

In [1257]:
all_adamson_collisions = (pilot_upr_collisions.union(pilot_epistasis_collisions)).union(upr_epistasis_collisions)
len(all_adamson_collisions)

235

### how many of these are in GEARS?

In [1258]:
adamson_collisions_in_gears = all_adamson_collisions.intersection(set(gears.obs.index))
len(adamson_collisions_in_gears)

232

### of the 232 in GEARS, are they all in there twice which would lead to 464 rows?

In [1259]:
gears.obs[gears.obs.index.str.startswith(tuple(adamson_collisions_in_gears))].shape

(404, 5)

### answer: no
To move forward in understanding this, need to flag which rows in GEARS are actually coming from barcode collisions in Adamson. Define two additional columns: `potential_collision` meaning that the barcode is a collider in Adamson, but does not necessarily mean that both were taken by the GEARS authors, and `barcode_collision` defined as those that are actually still colliding within the GEARS version of Adamson.

In [1260]:
gears.obs['potential_collision'] = gears.obs.index.str.startswith(tuple(adamson_collisions_in_gears))

## Combine Adamson datasets into single object that includes a new column identifying the source experiment

In [1261]:
pilot.obs['experiment'] = 'pilot'
upr.obs['experiment'] = 'upr'
epistasis.obs['experiment'] = 'epistasis'

adamson_all = sc.concat([pilot, upr, epistasis])
adamson_all

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 86111 × 32738
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'good_coverage', 'number_of_cells', 'condition', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'experiment'

## As expected there is a warning about non-unique obs_names resulting from the collisions
Leave this alone for now because they will be resolved. Instead, use a regular expression to recreate the original barcode names before the collisions were made unique in the GEARS AnnData. Then merge the .obs dataframe into the GEARS .obs dataframe as a starting point for figuring out which Adamson experiment was the source for each cell in the duplicate pairs.

In [1262]:
duplicated_barcodes = [barcode for barcode in gears.obs_names if re.match(r'[ACTG]{14}-\d-\d', barcode)]
print(f'Number of collisions: {len(duplicated_barcodes)}')
print(f'Examples of duplicated barcodes: {duplicated_barcodes[:10]}')

Number of collisions: 172
Examples of duplicated barcodes: ['AAAGAGACTGCCAA-1-1', 'AATGTAACTTCTGT-1-1', 'ACAAGCACTGACTG-1-1', 'ACCCAAGAGACACT-1-1', 'ACGTGATGAGGCGA-1-1', 'AGAATACTCCACCT-1-1', 'AGACACACTATGCG-1-1', 'AGAGAAACTAGAAG-1-1', 'AGTCGCCTAACCGT-1-1', 'ATCACTTGGCCCTT-1-1']


The extra `-1` in these barcodes is the expected behavior of AnnData's `make_unique` functions

In [1263]:
gears.obs['original_barcode'] = [barcode.rsplit('-', 1)[0] if re.match(r'[ACTG]{14}-\d-\d', barcode) else barcode for barcode in gears.obs_names]

In [1264]:
gears.obs['barcode_collision'] = gears.obs['original_barcode'].duplicated(keep=False)
gears.obs['barcode_collision'].value_counts()

barcode_collision
False    68259
True       344
Name: count, dtype: int64

Reminder: These 344 rows represent 344 cells arising from 172 collisions

In [1265]:
gears.obs['potential_collision'].value_counts()

potential_collision
False    68199
True       404
Name: count, dtype: int64

In [1266]:
gears.obs.loc[gears.obs['potential_collision'], 'barcode_collision'].value_counts()

barcode_collision
True     344
False     60
Name: count, dtype: int64

### out of 404 potential collisions, 344 are coming from barcodes where both versions from Adamson made it into GEARS while 60 are coming from ones where only one of the pair from Adamson showed up in the GEARS version of their data
A little arithmetic shows this all checks out:
235 barcode collisions in Adamson
232 of these barcodes appear in GEARS
344 are cases where both Adamson cells are there so divide by 2 and get 172
60 are cases where only one Adamson cell showed up
172 + 60 = 232 and this is equal to the number of colliding Adamson barcodes that appear in GEARS

In [1267]:
combined_obs = gears.obs.merge(adamson_all.obs,
                               left_on='original_barcode',
                               right_index=True,
                               how='left',
                               suffixes=['_gears', '_adamson'])
combined_obs

Unnamed: 0_level_0,condition_gears,cell_type,dose_val,control,condition_name,potential_collision,original_barcode,barcode_collision,guide_identity,read_count,...,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT-1,CREB1+ctrl,K562(?),1+1,0,K562(?)_CREB1+ctrl_1+1,False,AAACATACACCGAT-1,False,CREB1_pDS269,1286.0,...,0.0,0.000000,0.000000,2770.0,7.926963,34.037846,53.0,3.988984,0.651266,pilot
AAACATACAGAGAT-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,False,AAACATACAGAGAT-1,False,SNAI1_pDS266,296.0,...,0.0,0.000000,0.000000,3593.0,8.187021,40.011135,121.0,4.804021,1.347439,pilot
AAACATACCAGAAA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,False,AAACATACCAGAAA-1,False,62(mod)_pBA581,1829.0,...,0.0,0.000000,0.000000,11445.0,9.345396,40.003498,470.0,6.154858,1.642782,pilot
AAACATACGTTGAC-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,False,AAACATACGTTGAC-1,False,EP300_pDS268,1580.0,...,0.0,0.000000,0.000000,3992.0,8.292298,35.184204,53.0,3.988984,0.467125,pilot
AAACATACTGTTCT-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,False,AAACATACTGTTCT-1,False,62(mod)_pBA581,748.0,...,0.0,0.000000,0.000000,3533.0,8.170186,35.817112,141.0,4.955827,1.429440,pilot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC-10,STT3A+ctrl,K562(?),1+1,0,K562(?)_STT3A+ctrl_1+1,False,TTTGCATGCTTTAC-10,False,STT3A_pDS011,476.0,...,871.0,6.770790,5.996971,3294.0,8.100162,22.679703,145.0,4.983607,0.998348,upr
TTTGCATGGAGGAC-10,ARHGAP22+ctrl,K562(?),1+1,0,K562(?)_ARHGAP22+ctrl_1+1,False,TTTGCATGGAGGAC-10,False,ARHGAP22_pDS458,539.0,...,539.0,6.291569,4.612751,3153.0,8.056427,26.983313,135.0,4.912655,1.155327,upr
TTTGCATGTAGAGA-10,ctrl,K562(?),1,1,K562(?)_ctrl_1,False,TTTGCATGTAGAGA-10,False,63(mod)_pBA580,647.0,...,1203.0,7.093405,7.242625,4353.0,8.378850,26.207104,495.0,6.206576,2.980132,upr
TTTGCATGTCAAGC-10,KCTD16+ctrl,K562(?),1+1,0,K562(?)_KCTD16+ctrl_1+1,False,TTTGCATGTCAAGC-10,False,KCTD16_pDS096,98.0,...,1056.0,6.963190,7.296345,3004.0,8.008033,20.755890,319.0,5.768321,2.204104,upr


## GEARS calls control `ctrl` while `control` was used in Adamson so harmonize this to facilitate comparisons
GEARS also added `+ctrl` to everything that was a single perturbation so that they could handle double perturbations. Use a lambda function to resolve both at the same time.

In [1268]:
combined_obs['condition_gears_harmonized'] = combined_obs['condition_gears'].apply(lambda s: s.split('+ctrl')[0]).replace('ctrl', 'control')
combined_obs.loc[:, ['condition_gears_harmonized', 'condition_adamson']]

  combined_obs['condition_gears_harmonized'] = combined_obs['condition_gears'].apply(lambda s: s.split('+ctrl')[0]).replace('ctrl', 'control')


Unnamed: 0_level_0,condition_gears_harmonized,condition_adamson
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACATACACCGAT-1,CREB1,CREB1
AAACATACAGAGAT-1,control,SNAI1
AAACATACCAGAAA-1,control,control
AAACATACGTTGAC-1,control,EP300
AAACATACTGTTCT-1,control,control
...,...,...
TTTGCATGCTTTAC-10,STT3A,STT3A
TTTGCATGGAGGAC-10,ARHGAP22,ARHGAP22
TTTGCATGTAGAGA-10,control,control
TTTGCATGTCAAGC-10,KCTD16,KCTD16


## Mark rows where GEARS condition agrees with the condition in the original source Adamson data

In [1269]:
combined_obs['condition_agrees'] = (combined_obs['condition_gears_harmonized'].astype(str) == combined_obs['condition_adamson'].astype(str))
combined_obs['condition_agrees'].value_counts()

condition_agrees
True     50610
False    18397
Name: count, dtype: int64

### count potential_collisions with rows where condition agrees

In [1270]:
combined_obs[combined_obs['potential_collision']].value_counts('condition_agrees')

condition_agrees
False    594
True     214
Name: count, dtype: int64

### count actual barcode_collisions with rows where condition agrees

In [1271]:
combined_obs[combined_obs['barcode_collision']].value_counts('condition_agrees')

condition_agrees
False    500
True     188
Name: count, dtype: int64

### of the potential collisions where only one of the Adamson cells was included, how many agree on condition with the original Adamson annotation?

In [1272]:
combined_obs[(combined_obs['potential_collision']) & (~combined_obs['barcode_collision'])].value_counts('condition_agrees')

condition_agrees
False    94
True     26
Name: count, dtype: int64

## It appears that GEARS authors also changed the condition of the epistasis data to `ctrl` even though most of these were actually doubly-perturbed cells
Count how many of the disagreeing rows are from the epistasis data

In [1273]:
combined_obs.loc[combined_obs['condition_agrees'] == False, 'experiment'].value_counts()

experiment
epistasis    13647
upr           2946
pilot         1804
Name: count, dtype: int64

In [1274]:
18397 - 13647

4750

In [1275]:
combined_obs.loc[(combined_obs['potential_collision']) & (~combined_obs['barcode_collision']) & (combined_obs['condition_agrees'] == False), 'experiment'].value_counts()

experiment
epistasis    48
upr          38
pilot         8
Name: count, dtype: int64

### For convenience and to make output more readable, create a shorter list of columns to display

In [1276]:
columns_of_interest = ['original_barcode', 'condition_gears', 'condition_gears_harmonized', 'condition_adamson', 'condition_agrees', 'experiment']

## Split combined_obs into rows with barcode collisions, potential collisions only, and fully unique barcodes (in Adamson) and initialize a columns of False values to mark which rows to keep in the ones with collisions (`keep`) and which rows have had their source cell from Adamson 2016 verified (`source_verified`)

In [1277]:
combined_obs_unique = combined_obs.loc[(~combined_obs['barcode_collision']) & (~combined_obs['potential_collision'])].copy()
combined_obs_unique['keep'] = True
combined_obs_unique['source_verified'] = True
print(f'Unique dataframe shape: {combined_obs_unique.shape}')

Unique dataframe shape: (68199, 37)


In [1278]:
combined_obs_barcode_collisions = combined_obs[combined_obs['barcode_collision']].copy()
print(f'Barcode collisions dataframe shape: {combined_obs_barcode_collisions.shape}')
combined_obs_barcode_collisions['keep'] = False
combined_obs_barcode_collisions['source_verified'] = False
combined_obs_barcode_collisions['keep'].value_counts()

Barcode collisions dataframe shape: (688, 35)


keep
False    688
Name: count, dtype: int64

In [1279]:
combined_obs_potential_collisions = combined_obs[(combined_obs['potential_collision']) & (~combined_obs['barcode_collision'])].copy()
print(f'Potential collisions dataframe shape: {combined_obs_potential_collisions.shape}')
combined_obs_potential_collisions['keep'] = False
combined_obs_potential_collisions['source_verified'] = False
combined_obs_potential_collisions['keep'].value_counts()

Potential collisions dataframe shape: (120, 35)


keep
False    120
Name: count, dtype: int64

### make sure row numbers of these new dataframes add up to the row number of the original dataframe

In [1280]:
assert(combined_obs.shape[0] == (combined_obs_unique.shape[0] + combined_obs_barcode_collisions.shape[0] + combined_obs_potential_collisions.shape[0]))

## Deal with actual barcode collisions first
### Grouping by the original barcode, count how many rows have conditions that agree between GEARS and Adamson
There should be 4 rows per barcode because each barcode will match one cell each from two different Adamson data subsets (see above that there are no 3-way collisions within Adamson). Therefore, the possible values here should be 2 agree (i.e., it is fully resolvable), 1 agrees (i.e., it might be resolvable if it turns out the disagreeing one comes from epistasis and was incorrectly relabeled as control), and 0 agree (i.e., both have incorrect condition labels in GEARS and we cannot resolve which experiment each cell of the pair originally came from).

In [1281]:
agreement_counts = (combined_obs_barcode_collisions
                        .groupby('original_barcode').aggregate(
                            count_agree=pd.NamedAgg(column="condition_agrees", aggfunc="sum"))
                   )
agreement_counts.value_counts()

count_agree
1              108
2               40
0               24
Name: count, dtype: int64

### simply for convenience and clarity of code, pull the barcodes in each of these categories into their own separate variables

In [1282]:
agreement_2_barcodes = agreement_counts.index[agreement_counts['count_agree'] == 2]
print(f'2 agree barcodes: {len(agreement_2_barcodes)}')
agreement_1_barcodes = agreement_counts.index[agreement_counts['count_agree'] == 1]
print(f'1 agree barcodes: {len(agreement_1_barcodes)}')
agreement_0_barcodes = agreement_counts.index[agreement_counts['count_agree'] == 0]
print(f'0 agree barcodes: {len(agreement_0_barcodes)}')

2 agree barcodes: 40
1 agree barcodes: 108
0 agree barcodes: 24


## example of original barcode with 2 agreements

In [1283]:
combined_obs[combined_obs['original_barcode'] == agreement_2_barcodes[0]][columns_of_interest]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAATCCCTCTTGGA-1,AAATCCCTCTTGGA-1,ctrl,control,control,True,pilot
AAATCCCTCTTGGA-1,AAATCCCTCTTGGA-1,ctrl,control,DAD1,False,upr
AAATCCCTCTTGGA-1-1,AAATCCCTCTTGGA-1,DAD1+ctrl,DAD1,control,False,pilot
AAATCCCTCTTGGA-1-1,AAATCCCTCTTGGA-1,DAD1+ctrl,DAD1,DAD1,True,upr


### If there are two rows that can be resolved, we can mark these to keep as the correct sources of the cells in GEARS, leaving the other two rows with the pre-initialized False value for `keep`
It is possible to be tricked here if the 2 rows that agree are for the same original cell (e.g., if it had the same condition in both Adamson subsets and this condition matches what shows up in GEARS). It is unclear if this ever happens, but by making sure that the index is not duplicated for the ones marked as `keep` it can be shown that this did not happen.

In [1284]:
combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions['original_barcode'].isin(agreement_2_barcodes)) & (combined_obs_barcode_collisions['condition_agrees'] == True), 'keep'] = True

combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions['original_barcode'].isin(agreement_2_barcodes)) & (combined_obs_barcode_collisions['condition_agrees'] == True), 'source_verified'] = True

# make sure duplicate rows aren't being marked as keep
assert(combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique)
print(f"Index is unique? {combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique}")

Index is unique? True


## example of original barcode with 1 agreement

In [1285]:
combined_obs[combined_obs['original_barcode'] == agreement_1_barcodes[0]][columns_of_interest]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AACACGTGGTACCA-2,AACACGTGGTACCA-2,ctrl,control,YIPF5,False,upr
AACACGTGGTACCA-2,AACACGTGGTACCA-2,ctrl,control,PERK_only,False,epistasis
AACACGTGGTACCA-2-1,AACACGTGGTACCA-2,YIPF5+ctrl,YIPF5,YIPF5,True,upr
AACACGTGGTACCA-2-1,AACACGTGGTACCA-2,YIPF5+ctrl,YIPF5,PERK_only,False,epistasis


In [1286]:
combined_obs[combined_obs['original_barcode'].isin(agreement_1_barcodes)].value_counts('condition_agrees')

condition_agrees
False    324
True     108
Name: count, dtype: int64

### get the cell_barcode for the rows where there is agreement

In [1287]:
agreement_1_resolved_barcodes = combined_obs_barcode_collisions[(combined_obs_barcode_collisions['original_barcode'].isin(agreement_1_barcodes)) & (combined_obs_barcode_collisions['condition_agrees'])].index
# how many?
print(len(agreement_1_resolved_barcodes))
# all unique?
print(len(set(agreement_1_resolved_barcodes)))
print(agreement_1_resolved_barcodes)

108
108
Index(['AATGTAACTTCTGT-1', 'ACGTGATGAGGCGA-1', 'AGACACACTATGCG-1',
       'ATCACTTGGCCCTT-1', 'ATTGTCTGGCAGAG-1', 'CACTAACTCCTCGT-1',
       'CCTAGAGAGGAAGC-1', 'CTACAACTGAGGGT-1', 'CTGATTTGAAGTAG-1',
       'GGAGTTTGTTTCGT-1',
       ...
       'GAAACCTGCGTGTA-3-1', 'GAGATAGAGTATCG-3-1', 'GCCACGGAAACCAC-3-1',
       'GGTATCGACCCAAA-3-1', 'TAATCCACGGCATT-3-1', 'TCATCAACATGACC-3-1',
       'TGACACGAGCGATT-3-1', 'TGTAACCTCACCAA-3-1', 'TGTATCTGAACGTC-3-1',
       'TTCAGTTGCGGGAA-3-1'],
      dtype='object', name='cell_barcode', length=108)


### of the cell_barcodes that are not resolved (i.e., the complement in the collision pair), how many of those came from epistasis where everything was incorrectly relabeled as control?

### Helper function to check if any the unresolved cell of a collision pair with 1 agreement came from the epistasis experiment

In [1288]:
def any_epistasis(s):
    result  = any([entry == 'epistasis' for entry in s])
    return result

In [1289]:
agreement_1_unresolved_barcodes = combined_obs_barcode_collisions[combined_obs_barcode_collisions['original_barcode'].isin(agreement_1_barcodes)].index.difference(agreement_1_resolved_barcodes)
agreement_1_unresolved_epistasis = (combined_obs_barcode_collisions.loc[agreement_1_unresolved_barcodes]
     .groupby('cell_barcode')
     .aggregate(any_epistasis=pd.NamedAgg('experiment', any_epistasis))
                                   )
agreement_1_unresolved_epistasis.value_counts()

any_epistasis
True             100
False              8
Name: count, dtype: int64

### get these 100 barcodes, combine with the 108 resolved barcodes from above, and mark those rows as keep and source_verified

In [1290]:
combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions.index.isin(agreement_1_resolved_barcodes)) & 
    (combined_obs_barcode_collisions['condition_agrees']), 'keep'] = True

combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions.index.isin(agreement_1_resolved_barcodes)) & 
    (combined_obs_barcode_collisions['condition_agrees']), 'source_verified'] = True

# make sure duplicate rows aren't being kept
assert(combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique)
print(f"Index is unique? {combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique}")

Index is unique? True


In [1291]:
combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== True].index)) & 
    (combined_obs_barcode_collisions['experiment'] == 'epistasis'), 'keep'] = True

combined_obs_barcode_collisions.loc[(combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== True].index)) & 
    (combined_obs_barcode_collisions['experiment'] == 'epistasis'), 'source_verified'] = True

# make sure duplicate cells have not been added
assert(combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique)
print(f"Index is unique? {combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique}")

Index is unique? True


### 8 should be remaining that should be kept but have the wrong labels

In [1292]:
(combined_obs_barcode_collisions.loc[
     (combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== False].index))
     ][columns_of_interest + ['keep']]
)

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment,keep
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACAAGAGAACGTGT-1,ACAAGAGAACGTGT-1,ctrl,control,EP300,False,pilot,False
ACAAGAGAACGTGT-1,ACAAGAGAACGTGT-1,ctrl,control,DERL2,False,upr,False
AGAAAGTGGGTTTG-1,AGAAAGTGGGTTTG-1,ctrl,control,SPI1,False,pilot,False
AGAAAGTGGGTTTG-1,AGAAAGTGGGTTTG-1,ctrl,control,DARS,False,upr,False
CTGGCACTGTTCTT-1,CTGGCACTGTTCTT-1,ctrl,control,EP300,False,pilot,False
CTGGCACTGTTCTT-1,CTGGCACTGTTCTT-1,ctrl,control,FECH,False,upr,False
GCACGGTGGTTTCT-1,GCACGGTGGTTTCT-1,ctrl,control,SNAI1,False,pilot,False
GCACGGTGGTTTCT-1,GCACGGTGGTTTCT-1,ctrl,control,AMIGO3,False,upr,False
GCGAAGGAGGAAAT-1,GCGAAGGAGGAAAT-1,ctrl,control,EP300,False,pilot,False
GCGAAGGAGGAAAT-1,GCGAAGGAGGAAAT-1,ctrl,control,TARS,False,upr,False


### to know which of these is the cell that was actually taken from Adamson by the GEARS authors, need to consider which experiment the other member of the barcode collision pair came from and then this is the other one

In [1293]:
(combined_obs_barcode_collisions[
    (combined_obs_barcode_collisions['original_barcode'].isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== False].index))
     ]
     .value_counts(['experiment', 'condition_agrees'])
)

experiment  condition_agrees
pilot       False               16
upr         False                8
            True                 8
Name: count, dtype: int64

### in all cases it was from the UPR experiment, so the one to keep is the one from the pilot experiment

In [1294]:
(combined_obs_barcode_collisions.loc[
     (combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== False].index)) &
     (combined_obs_barcode_collisions['experiment'] == 'pilot')
     ][columns_of_interest + ['keep']]
)

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment,keep
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACAAGAGAACGTGT-1,ACAAGAGAACGTGT-1,ctrl,control,EP300,False,pilot,False
AGAAAGTGGGTTTG-1,AGAAAGTGGGTTTG-1,ctrl,control,SPI1,False,pilot,False
CTGGCACTGTTCTT-1,CTGGCACTGTTCTT-1,ctrl,control,EP300,False,pilot,False
GCACGGTGGTTTCT-1,GCACGGTGGTTTCT-1,ctrl,control,SNAI1,False,pilot,False
GCGAAGGAGGAAAT-1,GCGAAGGAGGAAAT-1,ctrl,control,EP300,False,pilot,False
GGCGACTGCAGTTG-1,GGCGACTGCAGTTG-1,ctrl,control,EP300,False,pilot,False
TAAGAACTGCAGTT-1,TAAGAACTGCAGTT-1,ctrl,control,SPI1,False,pilot,False
TATCCAACTGCACA-1,TATCCAACTGCACA-1,ctrl,control,SNAI1,False,pilot,False


In [1295]:
combined_obs_barcode_collisions.loc[
    (combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== False].index)) &
    (combined_obs_barcode_collisions['experiment'] == 'pilot'),
    'keep'] = True

combined_obs_barcode_collisions.loc[
    (combined_obs_barcode_collisions.index.isin(agreement_1_unresolved_epistasis[agreement_1_unresolved_epistasis['any_epistasis']== False].index)) &
    (combined_obs_barcode_collisions['experiment'] == 'pilot'),
    'source_verified'] = True

# make sure duplicate rows aren't being kept
assert(combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique)
print(f"Index is unique? {combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep'] == True].index.is_unique}")

Index is unique? True


## examine all sets of original barcodes with 0 agreements

In [1296]:
for barcode in agreement_counts.index[agreement_counts['count_agree'] == 0]:
    print(barcode)
    print(combined_obs_barcode_collisions[combined_obs_barcode_collisions['original_barcode'] == barcode][['condition_gears_harmonized', 'condition_adamson', 'experiment']])
    print()

AAAGAGACTGCCAA-1
                   condition_gears_harmonized condition_adamson experiment
cell_barcode                                                              
AAAGAGACTGCCAA-1                      control              SPI1      pilot
AAAGAGACTGCCAA-1                      control         IRE1_only  epistasis
AAAGAGACTGCCAA-1-1                    control              SPI1      pilot
AAAGAGACTGCCAA-1-1                    control         IRE1_only  epistasis

ACAAGCACTGACTG-1
                   condition_gears_harmonized condition_adamson experiment
cell_barcode                                                              
ACAAGCACTGACTG-1                      control             EP300      pilot
ACAAGCACTGACTG-1                      control       3x_neg_ctrl  epistasis
ACAAGCACTGACTG-1-1                    control             EP300      pilot
ACAAGCACTGACTG-1-1                    control       3x_neg_ctrl  epistasis

AGAATACTCCACCT-1
                   condition_gears_harmonized c

### these all appear to be incorrect
Even the one case where the cell from epistasis really was a control (ATTAGTGAGCTGTA-1), this isn't believeable because in all cases but one, there is a possible cell from epistasis that is mislabeled as control. Therefore, it could just be a coincidence that it was labeled control and it will not be trusted here.

Even more interestingly, there is one example (GCAGTCCTCATGCA-1) where the cells from Adamson with those barcodes came from pilot and UPR, with neither from epistasis. It is unclear why this, as well as the other of the collision pairs in the rest of these examples, are mislabeled as control in GEARS.

### to resolve this, collapse the options from Adamson down to a delimited string providing the options, keep those rows, and mark source_verified as False

In [1297]:
agg = dict.fromkeys(combined_obs_barcode_collisions.columns.difference(['condition_adamson', 'experiment']), 'first')
agg['condition_adamson'] = lambda s: '|'.join(s)
agg['experiment'] = lambda s: '|'.join(s)

agreement_0_replacement_rows = (combined_obs_barcode_collisions[
     combined_obs_barcode_collisions['original_barcode'].isin(agreement_counts.index[agreement_counts['count_agree'] == 0])]
     .groupby('cell_barcode')
     .agg(agg)
)
agreement_0_replacement_rows['keep'] = True
agreement_0_replacement_rows['source_verified'] = False
agreement_0_replacement_rows

Unnamed: 0_level_0,UMI_count,barcode_collision,cell_type,condition_agrees,condition_gears,condition_gears_harmonized,condition_name,control,coverage,dose_val,...,pct_counts_ribo,potential_collision,read_count,source_verified,total_counts,total_counts_hb,total_counts_mt,total_counts_ribo,condition_adamson,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAGAGACTGCCAA-1,50.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,13.74,1,...,37.997589,True,687.0,False,10777.0,38.0,0.0,4095.0,SPI1|IRE1_only,pilot|epistasis
AAAGAGACTGCCAA-1-1,50.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,13.74,1,...,37.997589,True,687.0,False,10777.0,38.0,0.0,4095.0,SPI1|IRE1_only,pilot|epistasis
ACAAGCACTGACTG-1,71.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,14.732394,1,...,38.726486,True,1046.0,False,12281.0,225.0,0.0,4756.0,EP300|3x_neg_ctrl,pilot|epistasis
ACAAGCACTGACTG-1-1,71.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,14.732394,1,...,38.726486,True,1046.0,False,12281.0,225.0,0.0,4756.0,EP300|3x_neg_ctrl,pilot|epistasis
AGAATACTCCACCT-1,179.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,15.469274,1,...,41.632729,True,2769.0,False,10559.0,47.0,0.0,4396.0,EP300|ATF6_IRE1,pilot|epistasis
AGAATACTCCACCT-1-1,179.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,15.469274,1,...,41.632729,True,2769.0,False,10559.0,47.0,0.0,4396.0,EP300|ATF6_IRE1,pilot|epistasis
AGAGAAACTAGAAG-1,38.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,12.026316,1,...,36.26635,True,457.0,False,7569.0,79.0,0.0,2745.0,SPI1|ATF6_PERK,pilot|epistasis
AGAGAAACTAGAAG-1-1,38.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,12.026316,1,...,36.26635,True,457.0,False,7569.0,79.0,0.0,2745.0,SPI1|ATF6_PERK,pilot|epistasis
ATTAGTGAGCTGTA-1,116.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,16.043103,1,...,38.267441,True,1861.0,False,10978.0,51.0,0.0,4201.0,EP300|3x_neg_ctrl,pilot|epistasis
ATTAGTGAGCTGTA-1-1,116.0,True,K562(?),False,ctrl,control,K562(?)_ctrl_1,1,16.043103,1,...,38.267441,True,1861.0,False,10978.0,51.0,0.0,4201.0,EP300|3x_neg_ctrl,pilot|epistasis


### remove these cells from the main barcode_collisions dataframe and replace them with the data just created

In [1298]:
barcodes_to_drop = combined_obs_barcode_collisions[
     combined_obs_barcode_collisions['original_barcode'].isin(agreement_counts.index[agreement_counts['count_agree'] == 0])
].index
barcodes_to_drop

Index(['AAAGAGACTGCCAA-1', 'AAAGAGACTGCCAA-1', 'ACAAGCACTGACTG-1',
       'ACAAGCACTGACTG-1', 'AGAATACTCCACCT-1', 'AGAATACTCCACCT-1',
       'AGAGAAACTAGAAG-1', 'AGAGAAACTAGAAG-1', 'ATTAGTGAGCTGTA-1',
       'ATTAGTGAGCTGTA-1', 'CCACTTCTACTTTC-1', 'CCACTTCTACTTTC-1',
       'CGTAGCCTTTTACC-1', 'CGTAGCCTTTTACC-1', 'GCAGTCCTCATGCA-1',
       'GCAGTCCTCATGCA-1', 'GCGAGAGACTGTAG-1', 'GCGAGAGACTGTAG-1',
       'GGACCCGAACCCTC-1', 'GGACCCGAACCCTC-1', 'GGCACGTGAGAAGT-1',
       'GGCACGTGAGAAGT-1', 'GGGCAAGAGAGAGC-1', 'GGGCAAGAGAGAGC-1',
       'GGGCCATGCGAGAG-1', 'GGGCCATGCGAGAG-1', 'GTAGCAACTCATTC-1',
       'GTAGCAACTCATTC-1', 'TACCATTGTTTACC-1', 'TACCATTGTTTACC-1',
       'TATCTGACACCACA-1', 'TATCTGACACCACA-1', 'TGCGAAACTGCGTA-1',
       'TGCGAAACTGCGTA-1', 'TGGTAGACGCTAAC-1', 'TGGTAGACGCTAAC-1',
       'AAAGAGACTGCCAA-1-1', 'AAAGAGACTGCCAA-1-1', 'ACAAGCACTGACTG-1-1',
       'ACAAGCACTGACTG-1-1', 'AGAATACTCCACCT-1-1', 'AGAATACTCCACCT-1-1',
       'AGAGAAACTAGAAG-1-1', 'AGAGAAACTAGAAG-1-1',

In [1299]:
combined_obs_barcode_collisions.drop(barcodes_to_drop, inplace=True)
combined_obs_barcode_collisions

Unnamed: 0_level_0,condition_gears,cell_type,dose_val,control,condition_name,potential_collision,original_barcode,barcode_collision,guide_identity,read_count,...,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,experiment,condition_gears_harmonized,condition_agrees,keep,source_verified
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAATCCCTCTTGGA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AAATCCCTCTTGGA-1,True,62(mod)_pBA581,524.0,...,8.024862,35.975037,83.0,4.430817,0.977390,pilot,control,True,True,True
AAATCCCTCTTGGA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AAATCCCTCTTGGA-1,True,DAD1_pDS499,585.0,...,7.509883,24.313883,145.0,4.983607,1.931788,upr,control,False,False,False
AAGCCATGATTCGG-1,BHLHE40+ctrl,K562(?),1+1,0,K562(?)_BHLHE40+ctrl_1+1,True,AAGCCATGATTCGG-1,True,BHLHE40_pDS258,837.0,...,7.910957,34.345470,128.0,4.859812,1.612700,pilot,BHLHE40,True,True,True
AAGCCATGATTCGG-1,BHLHE40+ctrl,K562(?),1+1,0,K562(?)_BHLHE40+ctrl_1+1,True,AAGCCATGATTCGG-1,True,SEC61A1_pDS031,3254.0,...,8.787830,31.981453,355.0,5.874931,1.732552,upr,BHLHE40,False,False,False
AATCAAACTCCCGT-1,ZNF326+ctrl,K562(?),1+1,0,K562(?)_ZNF326+ctrl_1+1,True,AATCAAACTCCCGT-1,True,ZNF326_pDS262,1325.0,...,8.076204,37.942425,69.0,4.248495,0.814063,pilot,ZNF326,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTAACCTCACCAA-3-1,SRPRB+ctrl,K562(?),1+1,0,K562(?)_SRPRB+ctrl_1+1,True,TGTAACCTCACCAA-3,True,3x_neg_ctrl_pMJ144-2,240.0,...,8.842605,28.970829,314.0,5.752573,1.314192,epistasis,SRPRB,False,False,False
TGTATCTGAACGTC-3-1,TMED10+ctrl,K562(?),1+1,0,K562(?)_TMED10+ctrl_1+1,True,TGTATCTGAACGTC-3,True,TMED10_pDS036,1266.0,...,8.415382,20.475262,375.0,5.929589,1.700603,upr,TMED10,True,True,True
TGTATCTGAACGTC-3-1,TMED10+ctrl,K562(?),1+1,0,K562(?)_TMED10+ctrl_1+1,True,TGTATCTGAACGTC-3,True,PERK_only_pMJ146,176.0,...,8.449343,25.297876,404.0,6.003887,2.188042,epistasis,TMED10,False,False,False
TTCAGTTGCGGGAA-3-1,CARS+ctrl,K562(?),1+1,0,K562(?)_CARS+ctrl_1+1,True,TTCAGTTGCGGGAA-3,True,CARS_pDS460,684.0,...,7.693481,24.917624,112.0,4.727388,1.272583,upr,CARS,True,True,True


In [1300]:
combined_obs_barcode_collisions_resolved = pd.concat([combined_obs_barcode_collisions[combined_obs_barcode_collisions['keep']], agreement_0_replacement_rows])

# make sure indices are unique or else duplicated cells were added
assert(combined_obs_barcode_collisions_resolved.index.is_unique)
print(f"Index is unique? {combined_obs_barcode_collisions_resolved.index.is_unique}")

combined_obs_barcode_collisions_resolved

Index is unique? True


Unnamed: 0_level_0,condition_gears,cell_type,dose_val,control,condition_name,potential_collision,original_barcode,barcode_collision,guide_identity,read_count,...,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,experiment,condition_gears_harmonized,condition_agrees,keep,source_verified
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAATCCCTCTTGGA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AAATCCCTCTTGGA-1,True,62(mod)_pBA581,524.0,...,8.024862,35.975037,83.0,4.430817,0.977390,pilot,control,True,True,True
AAGCCATGATTCGG-1,BHLHE40+ctrl,K562(?),1+1,0,K562(?)_BHLHE40+ctrl_1+1,True,AAGCCATGATTCGG-1,True,BHLHE40_pDS258,837.0,...,7.910957,34.345470,128.0,4.859812,1.612700,pilot,BHLHE40,True,True,True
AATCAAACTCCCGT-1,ZNF326+ctrl,K562(?),1+1,0,K562(?)_ZNF326+ctrl_1+1,True,AATCAAACTCCCGT-1,True,ZNF326_pDS262,1325.0,...,8.076204,37.942425,69.0,4.248495,0.814063,pilot,ZNF326,True,True,True
AATGTAACTTCTGT-1,BHLHE40+ctrl,K562(?),1+1,0,K562(?)_BHLHE40+ctrl_1+1,True,AATGTAACTTCTGT-1,True,BHLHE40_pDS258,878.0,...,8.581669,38.812054,139.0,4.941642,1.011792,pilot,BHLHE40,True,True,True
ACAAGAGAACGTGT-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,ACAAGAGAACGTGT-1,True,EP300_pDS268,155.0,...,8.303752,36.944191,95.0,4.564348,0.869167,pilot,control,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGCGAAACTGCGTA-1-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGCGAAACTGCGTA-1,True,SNAI1_pDS266,509.0,...,7.560601,27.923212,8.0,2.197225,0.116347,pilot|epistasis,control,False,True,False
TGGTAGACGCTAAC-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGGTAGACGCTAAC-1,True,SPI1_pDS255,837.0,...,8.123855,39.372009,53.0,3.988984,0.618653,pilot|epistasis,control,False,True,False
TGGTAGACGCTAAC-1-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGGTAGACGCTAAC-1,True,SPI1_pDS255,837.0,...,8.123855,39.372009,53.0,3.988984,0.618653,pilot|epistasis,control,False,True,False
TGTGAGACTGAAGA-2,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGTGAGACTGAAGA-2,True,ASCC3_pDS052,335.0,...,8.235361,26.794088,92.0,4.532599,0.653688,upr|epistasis,control,False,True,False


In [1301]:
combined_obs_barcode_collisions_resolved['condition_corrected'] = where(combined_obs_barcode_collisions_resolved['source_verified'],
                                                                        combined_obs_barcode_collisions_resolved['condition_adamson'],
                                                                        'ambiguous')
combined_obs_barcode_collisions_resolved['condition_corrected'].value_counts()

condition_corrected
ambiguous      48
control        44
3x_neg_ctrl    24
ATF6_only      16
ATF6_PERK      14
               ..
DHDDS           1
GBF1            1
GMPPB           1
SPCS2           1
DNAJC19         1
Name: count, Length: 82, dtype: int64

In [1302]:
combined_obs_barcode_collisions_resolved.value_counts('source_verified')

source_verified
True     296
False     48
Name: count, dtype: int64

In [1303]:
combined_obs_barcode_collisions_resolved.value_counts(['source_verified', 'condition_agrees'])

source_verified  condition_agrees
True             True                188
                 False               108
False            False                48
Name: count, dtype: int64

In [1304]:
combined_obs_barcode_collisions_resolved[~combined_obs_barcode_collisions_resolved['condition_agrees']][columns_of_interest]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACAAGAGAACGTGT-1,ACAAGAGAACGTGT-1,ctrl,control,EP300,False,pilot
AGAAAGTGGGTTTG-1,AGAAAGTGGGTTTG-1,ctrl,control,SPI1,False,pilot
CTGGCACTGTTCTT-1,CTGGCACTGTTCTT-1,ctrl,control,EP300,False,pilot
GCACGGTGGTTTCT-1,GCACGGTGGTTTCT-1,ctrl,control,SNAI1,False,pilot
GCGAAGGAGGAAAT-1,GCGAAGGAGGAAAT-1,ctrl,control,EP300,False,pilot
...,...,...,...,...,...,...
TGCGAAACTGCGTA-1-1,TGCGAAACTGCGTA-1,ctrl,control,SNAI1|ATF6_only,False,pilot|epistasis
TGGTAGACGCTAAC-1,TGGTAGACGCTAAC-1,ctrl,control,SPI1|ATF6_PERK_IRE1,False,pilot|epistasis
TGGTAGACGCTAAC-1-1,TGGTAGACGCTAAC-1,ctrl,control,SPI1|ATF6_PERK_IRE1,False,pilot|epistasis
TGTGAGACTGAAGA-2,TGTGAGACTGAAGA-2,ctrl,control,ASCC3|ATF6_PERK_IRE1,False,upr|epistasis


# Deal with potential collisions

In [1305]:
combined_obs_potential_collisions[columns_of_interest]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACTCTCCTCACACA-1,ACTCTCCTCACACA-1,ctrl,control,SNAI1,False,pilot
ACTCTCCTCACACA-1,ACTCTCCTCACACA-1,ctrl,control,STT3A,False,upr
ATCCAGGAGCTCCT-1,ATCCAGGAGCTCCT-1,ctrl,control,EP300,False,pilot
ATCCAGGAGCTCCT-1,ATCCAGGAGCTCCT-1,ctrl,control,Gal4-4(mod),False,upr
ATTGAAACCGACTA-1,ATTGAAACCGACTA-1,BHLHE40+ctrl,BHLHE40,BHLHE40,True,pilot
...,...,...,...,...,...,...
CAGCTCACATGTGC-3,CAGCTCACATGTGC-3,TTI1+ctrl,TTI1,ATF6_PERK_IRE1,False,epistasis
GAGTCTGAAAAGTG-3,GAGTCTGAAAAGTG-3,SLMO2+ctrl,SLMO2,SLMO2,True,upr
GAGTCTGAAAAGTG-3,GAGTCTGAAAAGTG-3,SLMO2+ctrl,SLMO2,,False,epistasis
TGAATAACCGCCTT-3,TGAATAACCGCCTT-3,STT3A+ctrl,STT3A,STT3A,True,upr


In [1306]:
combined_obs_potential_collisions.value_counts('condition_agrees')

condition_agrees
False    94
True     26
Name: count, dtype: int64

### check for agreement within a barcode pair to see if all 26 agreements are paired with a disagreement, otherwise cannot disambiguate which Adamson 2016 cell was the true source in GEARS

In [1307]:
(combined_obs_potential_collisions
     .groupby('original_barcode')
     .agg(count_agree=pd.NamedAgg('condition_agrees', 'sum'))
     .value_counts('count_agree')
)

count_agree
0    34
1    26
Name: count, dtype: int64

### pull out the 26 unique matching ones and mark them as `keep` and `source_verified`

In [1308]:
(combined_obs_potential_collisions
     .groupby('original_barcode')
     .agg(count_agree=pd.NamedAgg('condition_agrees', 'sum'))
     .sort_values('count_agree', ascending=False)
)

Unnamed: 0_level_0,count_agree
original_barcode,Unnamed: 1_level_1
AAAGGCCTTCTGGA-1,1
CACTTTGACCAAGT-1,1
CAGCTCACATGTGC-3,1
CATCGCTGGTACCA-1,1
CCAAGTGACTGAGT-1,1
CCAATTTGCTCGAA-2,1
CGACTCTGTCTCCG-1,1
CGTCCAACAGACTC-2,1
GAGTCTGAAAAGTG-3,1
GCACCACTGATAGA-1,1


In [1309]:
# combined_obs[combined_obs['experiment'] == 'epistasis'].value_counts('condition_gears').head(30)

In [1310]:
combined_obs_potential_collisions.loc[combined_obs_potential_collisions['condition_agrees'], 'keep'] = True
combined_obs_potential_collisions.loc[combined_obs_potential_collisions['condition_agrees'], 'source_verified'] = True

In [1311]:
partial_collisions_resolved_barcodes = combined_obs_potential_collisions[combined_obs_potential_collisions['condition_agrees']].index
print(len(partial_collisions_resolved_barcodes))

26


### quick check that all cell_barcode entries match original_barcode (they should because no duplicate barcode was present in GEARS to have to make unique)

In [1312]:
all([x == y for x,y in zip(combined_obs_potential_collisions.index, combined_obs_potential_collisions['original_barcode'])])

True

### Check the non-matching ones, which should be 68 rows (34 cells but doubled because we get the match to each of 2 Adamson experiments it could have come from)

In [1313]:
combined_obs_potential_collisions.loc[combined_obs_potential_collisions.index.difference(partial_collisions_resolved_barcodes)]

Unnamed: 0_level_0,condition_gears,cell_type,dose_val,control,condition_name,potential_collision,original_barcode,barcode_collision,guide_identity,read_count,...,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,experiment,condition_gears_harmonized,condition_agrees,keep,source_verified
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAGTTTGTCGATG-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AAAGTTTGTCGATG-1,False,SPCS2_pDS401,1262.0,...,8.621914,23.315693,372.0,5.921578,1.562500,upr,control,False,False,False
AAAGTTTGTCGATG-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AAAGTTTGTCGATG-1,False,ATF6_PERK_IRE1_pMJ158,156.0,...,8.995289,28.088751,340.0,5.831882,1.184298,epistasis,control,False,False,False
AACTCACTCTCATT-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AACTCACTCTCATT-1,False,SLMO2_pDS433,361.0,...,8.339740,23.159060,201.0,5.308268,1.112033,upr,control,False,False,False
AACTCACTCTCATT-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,AACTCACTCTCATT-1,False,ATF6_PERK_pMJ150,90.0,...,8.425955,28.606358,281.0,5.641907,1.761645,epistasis,control,False,False,False
ACCCGTTGTGGTCA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,ACCCGTTGTGGTCA-1,False,DNAJC19_pDS074,988.0,...,8.560444,23.537899,355.0,5.874931,1.600758,upr,control,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGGGTATGTGGAAA-1,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGGGTATGTGGAAA-1,False,PERK_IRE1_pMJ154,438.0,...,9.094930,34.919453,197.0,5.288267,0.772155,epistasis,control,False,False,False
TGTATCTGACCCAA-3,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGTATCTGACCCAA-3,False,DARS_pDS495,178.0,...,8.124743,26.928293,143.0,4.969813,1.140624,upr,control,False,False,False
TGTATCTGACCCAA-3,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TGTATCTGACCCAA-3,False,PERK_only_pMJ146,321.0,...,9.046644,31.342072,343.0,5.840641,1.266384,epistasis,control,False,False,False
TTTGACTGGAATAG-3,ctrl,K562(?),1,1,K562(?)_ctrl_1,True,TTTGACTGGAATAG-3,False,SEC61G_pDS440,629.0,...,8.434029,26.458069,217.0,5.384495,1.248131,upr,control,False,False,False


### Check if these are all control cells and, if so, could they all have come from epistasis where all the double-perturbations were relabeled control by the GEARS authors
Even if this is the case, would not trust it and will instead label these cells as `source_verified` False.

In [1314]:
combined_obs_potential_collisions.loc[combined_obs_potential_collisions.index.difference(partial_collisions_resolved_barcodes)].value_counts('condition_gears_harmonized')

condition_gears_harmonized
control    68
AMIGO3      0
SOCS1       0
SLMO2       0
SLC39A7     0
           ..
GBF1        0
FECH        0
FARSB       0
EIF2S1      0
MTHFD1      0
Name: count, Length: 88, dtype: int64

In [1315]:
combined_obs_potential_collisions.loc[combined_obs_potential_collisions.index.difference(partial_collisions_resolved_barcodes)].groupby('original_barcode').agg(any_epistasis=pd.NamedAgg('experiment', any_epistasis)).value_counts()

any_epistasis
True             31
False             3
Name: count, dtype: int64

### Create new dataframe that collapses these pairs back down to single entries, capturing both possible Adamson sources

In [1316]:
unresolved_potential_collisions = combined_obs_potential_collisions.loc[combined_obs_potential_collisions.index.difference(partial_collisions_resolved_barcodes)]
unresolved_potential_collisions['condition_adamson'] = unresolved_potential_collisions['condition_adamson'].astype(str)
unresolved_potential_collisions['condition_adamson'] = unresolved_potential_collisions['condition_adamson'].fillna('none')

agg = dict.fromkeys(unresolved_potential_collisions.columns.difference(['condition_adamson', 'experiment']), 'first')
agg['condition_adamson'] = lambda s: '|'.join(s)
agg['experiment'] = lambda s: '|'.join(s)

potential_collisions_replacement_rows = (
    unresolved_potential_collisions
     .groupby(unresolved_potential_collisions.index)
     .agg(agg)
)
potential_collisions_replacement_rows['keep'] = True
potential_collisions_replacement_rows['source_verified'] = False
potential_collisions_replacement_rows[columns_of_interest]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAAGTTTGTCGATG-1,AAAGTTTGTCGATG-1,ctrl,control,SPCS2|ATF6_PERK_IRE1,False,upr|epistasis
AACTCACTCTCATT-1,AACTCACTCTCATT-1,ctrl,control,SLMO2|ATF6_PERK,False,upr|epistasis
ACCCGTTGTGGTCA-1,ACCCGTTGTGGTCA-1,ctrl,control,DNAJC19|3x_neg_ctrl,False,upr|epistasis
ACTCTCCTCACACA-1,ACTCTCCTCACACA-1,ctrl,control,SNAI1|STT3A,False,pilot|upr
ACTGCCACTAGAGA-1,ACTGCCACTAGAGA-1,ctrl,control,MARS|ATF6_only,False,upr|epistasis
AGGGTTTGCTCCAC-1,AGGGTTTGCTCCAC-1,ctrl,control,BHLHE40|3x_neg_ctrl,False,pilot|epistasis
ATCCAGGAGCTCCT-1,ATCCAGGAGCTCCT-1,ctrl,control,EP300|Gal4-4(mod),False,pilot|upr
ATCGTTTGCTATGG-2,ATCGTTTGCTATGG-2,ctrl,control,HSPA9|ATF6_IRE1,False,upr|epistasis
ATTTAGGAGCTGAT-2,ATTTAGGAGCTGAT-2,ctrl,control,FECH|3x_neg_ctrl,False,upr|epistasis
CAAGAAGAACCAGT-3,CAAGAAGAACCAGT-3,ctrl,control,Gal4-4(mod)|PERK_only,False,upr|epistasis


### Combine these with the rest of the potential collisions data and there should be only 60 rows marked as `keep`

In [1317]:
potential_collisions_df = pd.concat([combined_obs_potential_collisions, potential_collisions_replacement_rows])
potential_collisions_df.value_counts('keep')

keep
False    94
True     60
Name: count, dtype: int64

## Recombine all of the merged metadata across Adamson and GEARS to create a new .obs for the GEARS version of the data

In [1318]:
new_gears_obs = pd.concat([combined_obs_unique, combined_obs_barcode_collisions_resolved, potential_collisions_df])
new_gears_obs.value_counts('keep')

keep
True     68603
False       94
Name: count, dtype: int64

Keeping the original authors' `3x_neg_ctrl` label for control cells in the epistasis experiment has been advantageous to this point as it provided a distinct label compared to control cells coming from other experiments. But now it can be harmonized to `control` like the other control cells.

In [1319]:
new_gears_obs = new_gears_obs[new_gears_obs['keep']]
new_gears_obs = new_gears_obs.drop('keep', axis=1)
new_gears_obs['condition_adamson'] = new_gears_obs['condition_adamson'].astype(str).replace('3x_neg_ctrl', 'control')
new_gears_obs['condition_agrees'] = (new_gears_obs['condition_gears_harmonized'] == new_gears_obs['condition_adamson'])
new_gears_obs['condition_corrected'] = where(new_gears_obs['source_verified'], new_gears_obs['condition_adamson'], 'ambiguous')
new_gears_obs[columns_of_interest + ['condition_corrected']]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_gears_harmonized,condition_adamson,condition_agrees,experiment,condition_corrected
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACATACACCGAT-1,AAACATACACCGAT-1,CREB1+ctrl,CREB1,CREB1,True,pilot,CREB1
AAACATACAGAGAT-1,AAACATACAGAGAT-1,ctrl,control,SNAI1,False,pilot,SNAI1
AAACATACCAGAAA-1,AAACATACCAGAAA-1,ctrl,control,control,True,pilot,control
AAACATACGTTGAC-1,AAACATACGTTGAC-1,ctrl,control,EP300,False,pilot,EP300
AAACATACTGTTCT-1,AAACATACTGTTCT-1,ctrl,control,control,True,pilot,control
...,...,...,...,...,...,...,...
TCAGGATGCTTCTA-3,TCAGGATGCTTCTA-3,ctrl,control,FECH|ATF6_only,False,upr|epistasis,ambiguous
TCGCAGCTAACGGG-1,TCGCAGCTAACGGG-1,ctrl,control,EP300|PERK_IRE1,False,pilot|epistasis,ambiguous
TGGGTATGTGGAAA-1,TGGGTATGTGGAAA-1,ctrl,control,QARS|PERK_IRE1,False,upr|epistasis,ambiguous
TGTATCTGACCCAA-3,TGTATCTGACCCAA-3,ctrl,control,DARS|PERK_only,False,upr|epistasis,ambiguous


## sort the corrected .obs dataframe to be in the same order as the original GEARS .obs

In [1320]:
new_gears_obs.index.is_unique

True

In [1321]:
new_gears_obs = new_gears_obs.reindex(gears_original_index)
assert(all(new_gears_obs.index == gears_original_index))
print(f'Order matches? {all(new_gears_obs.index == gears_original_index)}')

Order matches? True


### extra paranoia sanity check that should be completely unnecessary if all the indices match: do all `'condition_gears` in `new_gears_obs` match `condition` in GEARS .obs?

In [1322]:
assert(all(gears.obs['condition'] == new_gears_obs['condition_gears']))
print(f"All conditions match? {all(gears.obs['condition'] == new_gears_obs['condition_gears'])}")

All conditions match? True


### how many conditions agree between GEARS labels and corrected labels

In [1323]:
new_gears_obs.value_counts('condition_agrees')

condition_agrees
True     53718
False    14885
Name: count, dtype: int64

## Replace .obs and write the corrected output file

In [1324]:
gears.obs = new_gears_obs

In [1325]:
gears.write(DATA_DIR.joinpath('gears_adamson_updated_metadata.h5ad'))

## Assessment of differences between original GEARS annotations and metadata that has now been updated from the source GEO submission

In [1326]:
gears.obs.value_counts('condition_corrected')

condition_corrected
control           9378
IRE1_only         1547
ATF6_only         1542
ATF6_PERK_IRE1    1536
PERK_IRE1         1504
                  ... 
CAD                242
COPZ1              220
PPWD1              190
COPB1              185
ambiguous           82
Name: count, Length: 98, dtype: int64

In [1327]:
gears.obs.value_counts(['experiment', 'source_verified'])

experiment       source_verified
upr              True               49796
epistasis        True               13423
pilot            True                5302
upr|epistasis    False                 39
pilot|epistasis  False                 38
pilot|upr        False                  5
Name: count, dtype: int64

In [1328]:
gears.obs.value_counts('condition_agrees')

condition_agrees
True     53718
False    14885
Name: count, dtype: int64

In [1329]:
gears.obs.value_counts(['experiment', 'condition_agrees'])

experiment       condition_agrees
upr              True                47017
epistasis        False               10315
pilot            True                 3593
epistasis        True                 3108
upr              False                2779
pilot            False                1709
upr|epistasis    False                  39
pilot|epistasis  False                  38
pilot|upr        False                   5
Name: count, dtype: int64

In [1330]:
adamson_all.obs.loc[adamson_all.obs.index.difference(gears.obs.index)].value_counts('experiment')

experiment
upr          15501
epistasis     1497
pilot          450
Name: count, dtype: int64

In [1331]:
gears[(gears.obs['condition_gears'] == 'ctrl') & (gears.obs['condition_corrected'] != 'ctrl')].obs.value_counts('experiment')

experiment
epistasis          13423
upr                 7384
pilot               3374
upr|epistasis         39
pilot|epistasis       38
pilot|upr              5
Name: count, dtype: int64

In [1332]:
gears[(gears.obs['condition_gears'] == 'ctrl') & (gears.obs['condition_corrected'] != 'control') & (gears.obs['experiment']== 'upr')].obs[['original_barcode', 'condition_gears', 'condition_corrected', 'guide_identity']]

Unnamed: 0_level_0,original_barcode,condition_gears,condition_corrected,guide_identity
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACTTGAAACGGG-1,AAACTTGAAACGGG-1,ctrl,DNAJC19,DNAJC19_pDS026
AAATACTGGTCGAT-1,AAATACTGGTCGAT-1,ctrl,SEC61B,SEC61B_pDS033
AAATTCGAATTCTC-1,AAATTCGAATTCTC-1,ctrl,ASCC3,ASCC3_pDS052
AACACTCTCTCCCA-1,AACACTCTCTCCCA-1,ctrl,SEC61B,SEC61B_pDS033
AACAGAGACACTGA-1,AACAGAGACACTGA-1,ctrl,SEC61B,SEC61B_pDS033
...,...,...,...,...
TTGTACACACCTTT-10,TTGTACACACCTTT-10,ctrl,ASCC3,ASCC3_pDS052
TTGTAGCTCCTCCA-10,TTGTAGCTCCTCCA-10,ctrl,SEC61B,SEC61B_pDS033
TTTAGAGAACGTAC-10,TTTAGAGAACGTAC-10,ctrl,SEC61B,SEC61B_pDS033
TTTCGAACGGTGGA-10,TTTCGAACGGTGGA-10,ctrl,DNAJC19,DNAJC19_pDS026


In [1333]:
gears[(gears.obs['condition_gears'] == 'ctrl') & (gears.obs['condition_corrected'] != 'control') & (gears.obs['experiment']== 'upr')].obs.value_counts('condition_corrected')

condition_corrected
SEC61B     1000
ASCC3       954
DNAJC19     825
Name: count, dtype: int64