In [2]:
import pandas as pd
import numpy as np
import scanpy as sc
from geosketch import gs
import os
import anndata

  from pandas.core.index import RangeIndex


In [115]:
dir_martin = "./GSE134809_adata_celltype_annotation_dream.h5ad"
adata_martin = sc.read(dir_martin)

In [116]:
dir_brca = "./GPL16791_breast_cancer_annotation.h5ad"
adata_brca = sc.read(dir_brca)


In [117]:

# TODO only take crc patients
dir_crc = "./crc_anndata.h5ad"
adata_crc = sc.read(dir_brca)


In [124]:
adata_crc.obs['CONDITION'].unique()

[blood CD45+ leukocytes, breast CD45+ leukocytes, breast tumor CD45+ leukocytes, lymph node CD45+ leukocytes]
Categories (4, object): [blood CD45+ leukocytes, breast CD45+ leukocytes, breast tumor CD45+ leukocytes, lymph node CD45+ leukocytes]

In [126]:
adata_crc.obs['celltype_dream'].unique()

[memory.CD4.T.cells, others, monocytes, memory.B.cells, naive.CD8.T.cells, ..., neutrophils, naive.B.cells, fibroblasts, cancer.cells, endothelial.cells]
Length: 16
Categories (16, object): [memory.CD4.T.cells, others, monocytes, memory.B.cells, ..., naive.B.cells, fibroblasts, cancer.cells, endothelial.cells]

In [6]:
dir_smillie = "./Smillie2019_adata_celltype_annotation_dream.h5ad"
adata_smillie = sc.read(dir_smillie)

ValueError: invalid shape in fixed-type tuple.

In [11]:
martin_annot = adata_martin.obs['celltype_dream']
martin_annot.to_csv('./martin_annot.csv', index=False, header=False)

brca_annot = adata_brca.obs['celltype_dream']
brca_annot.to_csv('./brca_annot.csv', index=False, header=False)

smillie_annot = adata_smillie.obs['celltype_dream']
smillie_annot.to_csv('./smillie_annot.csv', index=False, header=False)

crc_annot = adata_crc.obs['celltype_dream']
crc_annot.to_csv('./crc_annot.csv', index=False, header=False)

### Subset datasets using Geosketch

SCDC too slow with full datasets. Aim for max 8000 cells in total

In [15]:
adata_martin.shape

(62202, 1417)

In [16]:
adata_brca.shape

(42248, 2385)

In [17]:
N = 4000 # Number of samples to obtain from the data set.
sketch_index = gs(adata_martin.X, N, replace=False, verbose=True)

X_sketch = adata_martin.X[sketch_index]

obs = adata_martin.obs.iloc[sketch_index]
var = adata_martin.var
uns = adata_martin.uns.copy()
rmkeys = ['neighbors', 'pca', 'rank_genes_groups']
for key in rmkeys:
    uns.pop(key, None)

raw = adata_martin.raw[sketch_index]
adata_martin_sub = anndata.AnnData(X_sketch, obs=obs, var=var, uns=uns, raw=raw)
adata_martin_sub.write('./martin_geosketch.h5ad')

anno = adata_martin_sub.obs['celltype_dream']
anno.to_csv('./martin_geosketch_scanno.csv', index=False, header=False)

2020-05-07 12:12:41.025313 | [geosketch] Found 62201 non-empty grid cells
2020-05-07 12:12:41.039004 | [geosketch] Grid size 62201, increase unit to 0.625
2020-05-07 12:19:26.601761 | [geosketch] Found 30152 non-empty grid cells
2020-05-07 12:19:26.603684 | [geosketch] Grid size 30152, increase unit to 0.8125
2020-05-07 12:20:28.688197 | [geosketch] Found 1673 non-empty grid cells
2020-05-07 12:20:28.688917 | [geosketch] Grid size 1673, decrease unit to 0.71875
2020-05-07 12:23:06.963016 | [geosketch] Found 7817 non-empty grid cells
2020-05-07 12:23:06.964042 | [geosketch] Grid size 7817, increase unit to 0.765625
2020-05-07 12:24:31.423712 | [geosketch] Found 3021 non-empty grid cells
2020-05-07 12:24:31.424719 | [geosketch] Grid size 3021, decrease unit to 0.7421875
2020-05-07 12:26:09.049249 | [geosketch] Found 4302 non-empty grid cells
2020-05-07 12:26:09.049987 | [geosketch] Found 4302 grid cells


NameError: name 'anndata' is not defined

In [98]:
def geosketch_subsample(adata, N=4000, filename='adata_geosketch.h5ad', column='<last_column>', raw=True):
    
    if (column == '<last_column>'):
        column = adata.obs.columns[-1]
    print('Initiating Geosketch. This process may take a while for large datasets.')
    if raw:
        E = adata.raw.X.toarray() # convert from sparse to ndarray
        sketch_index = gs(E, N, replace=False, verbose=True)
        X_sketch = E[sketch_index]
        var = adata.raw.var
        raw_dat = None
        
    else:
        sketch_index = gs(adata.X, N, replace=False, verbose=True)
        X_sketch = adata.X[sketch_index]
        var = adata.var
        raw_dat = adata.raw[sketch_index]
        
    obs = adata.obs.iloc[sketch_index]
    uns = adata.uns.copy()
    rmkeys = ['neighbors', 'pca', 'rank_genes_groups'] # remove these entries from adata.uns as they cause issues with geosketching
    for key in rmkeys:
        uns.pop(key, None)

    
    adata_sub = anndata.AnnData(X_sketch, obs=obs, var=var, uns=uns, raw=raw_dat)
    adata_sub.write(os.path.join('./', filename))

    anno = adata_sub.obs[column]
    anno.to_csv(os.path.join('./', filename.split('.')[0] + '_scanno.csv'), index=False, header=False)
    
    return adata_sub


In [None]:
geosketch_subsample(adata_brca, filename='brca_raw_geosketch.h5ad', raw=True)

Initiating Geosketch. This process may take a while for large datasets.


In [None]:
geosketch_subsample(adata_martin, filename='martin_raw_geosketch.h5ad', raw=True)

In [91]:
adata_crc_geo = geosketch_subsample(adata_crc, filename='crc_raw_geosketch.h5ad', raw=False)

Initiating Geosketch. This process may take a while for large datasets.
2020-05-11 00:21:42.537658 | [geosketch] Found 42248 non-empty grid cells
2020-05-11 00:21:42.618977 | [geosketch] Grid size 42248, increase unit to 0.625
2020-05-11 00:27:19.233861 | [geosketch] Found 32816 non-empty grid cells
2020-05-11 00:27:19.331684 | [geosketch] Grid size 32816, increase unit to 0.8125
2020-05-11 00:27:56.627365 | [geosketch] Found 463 non-empty grid cells
2020-05-11 00:27:56.668251 | [geosketch] Grid size 463, decrease unit to 0.71875
2020-05-11 00:28:41.190603 | [geosketch] Found 2549 non-empty grid cells
2020-05-11 00:28:41.193219 | [geosketch] Grid size 2549, decrease unit to 0.671875
2020-05-11 00:30:00.170395 | [geosketch] Found 6721 non-empty grid cells
2020-05-11 00:30:00.171172 | [geosketch] Grid size 6721, increase unit to 0.6953125
2020-05-11 00:31:09.344323 | [geosketch] Found 4098 non-empty grid cells
2020-05-11 00:31:09.365408 | [geosketch] Found 4098 grid cells


keep_keys = ['louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups']
ks = adata_martin.uns.copy().keys()
for k in ks:
    if k not in keep_keys:
        adata_martin.uns.pop(k, None)
        
ks = adata_brca.uns.copy().keys()
for k in ks:
    if k not in keep_keys:
        adata_brca.uns.pop(k, None)

### Subset adata.raw


In [66]:
idx = adata_martin_sub.obs.index
geo_idx = adata_martin.obs.index.isin(idx)
rawX = adata_martin.raw.X[geo_idx]

In [78]:
# dirty way
obs = adata_martin_sub.obs
var = adata_martin_sub.raw.var
uns = adata_martin_sub.uns
raw = adata_martin_sub.raw

adata_martin_raw = anndata.AnnData(raw.X, obs=obs, var=var, uns=uns, raw=raw)
adata_martin_raw.write('martin_geosketch_raw.h5ad')

In [79]:
obs = adata_brca_sub.obs
var = adata_brca_sub.raw.var
uns = adata_brca_sub.uns
raw = adata_brca_sub.raw

adata_brca_raw = anndata.AnnData(raw.X, obs=obs, var=var, uns=uns, raw=raw)
adata_brca_raw.write('brca_geosketch_raw.h5ad')

In [101]:

obs = adata_crc_geo.obs
var = adata_crc_geo.raw.var
uns = adata_crc_geo.uns
raw = adata_crc_geo.raw

adata_crc_raw = anndata.AnnData(raw.X, obs=obs, var=var, uns=uns, raw=raw)
adata_crc_raw.write('crc_raw_geosketch.h5ad')

set raw.X as X for full non-geoskchetch dataset

In [None]:
adata_brca
adata_martin

In [127]:
obs = adata_brca.obs
var = adata_brca.raw.var
uns = adata_brca.uns
raw = adata_brca.raw

adata_brca_raw = anndata.AnnData(raw.X, obs=obs, var=var, uns=uns, raw=raw)
adata_brca_raw.write('brca_raw.h5ad')

In [128]:
obs = adata_martin.obs
var = adata_martin.raw.var
uns = adata_martin.uns
raw = adata_martin.raw

adata_martin_raw = anndata.AnnData(raw.X, obs=obs, var=var, uns=uns, raw=raw)
adata_martin_raw.write('martin_raw.h5ad')

### Martin data - are there too many missing cell types?

In [109]:
adata_martin.obs.loc[adata_martin.obs['Subject'] == 'pat. 5']['celltype_dream'].unique()

[memory.CD4.T.cells, fibroblasts, regulatory.T.cells, others, macrophages, ..., naive.CD4.T.cells, memory.B.cells, memory.CD8.T.cells, naive.B.cells, NK.cells]
Length: 12
Categories (12, object): [memory.CD4.T.cells, fibroblasts, regulatory.T.cells, others, ..., memory.B.cells, memory.CD8.T.cells, naive.B.cells, NK.cells]

In [110]:
adata_martin_geo_raw.obs.loc[adata_martin_geo_raw.obs['Subject'] == 'pat. 5']['celltype_dream'].unique()

[fibroblasts, endothelial.cells, others, macrophages, myeloid.dendritic.cells, regulatory.T.cells, NK.cells, memory.B.cells]
Categories (8, object): [fibroblasts, endothelial.cells, others, macrophages, myeloid.dendritic.cells, regulatory.T.cells, NK.cells, memory.B.cells]

In [111]:
adata_brca.obs.head()

Unnamed: 0_level_0,CELL,BioSample,CONDITION,GPL,Genome_build,ID,ID_REF,SRA,Sample_geo_accession,Sample_organism_ch1,cell type,donor,donor age (years),resident tissue,percent_mito,n_counts,n_genes,louvain,celltype_dream
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
121270372580596-BC01_BLOOD1,121270372580596-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.068536,958.0,476,0,memory.CD4.T.cells
121270373898541-BC01_BLOOD1,121270373898541-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.095819,573.0,356,0,memory.CD4.T.cells
121270375926581-BC01_BLOOD1,121270375926581-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.070922,562.0,397,3,others
121270376000796-BC01_BLOOD1,121270376000796-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.105092,921.0,432,0,memory.CD4.T.cells
121270378289958-BC01_BLOOD1,121270378289958-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.0837,1132.0,681,6,monocytes


In [113]:
adata_brca.obs.loc[adata_brca.obs['ID'] == 'BC01_BLOOD1']['celltype_dream'].unique()

[memory.CD4.T.cells, others, monocytes, memory.B.cells, naive.CD8.T.cells, ..., NK.cells, myeloid.dendritic.cells, naive.CD4.T.cells, memory.CD8.T.cells, neutrophils]
Length: 12
Categories (12, object): [memory.CD4.T.cells, others, monocytes, memory.B.cells, ..., myeloid.dendritic.cells, naive.CD4.T.cells, memory.CD8.T.cells, neutrophils]

In [114]:
adata_brca_geo_raw.obs.loc[adata_brca_geo_raw.obs['ID'] == 'BC01_BLOOD1']['celltype_dream'].unique()

[monocytes, myeloid.dendritic.cells, others, memory.B.cells]
Categories (4, object): [monocytes, myeloid.dendritic.cells, others, memory.B.cells]

### Fix martin index - for BisqueRNA::SeuratToExpressionSet()
currently splitting with this index gives two labels per individual as follows

In [58]:
idx5 = adata_martin_geo_raw.obs.loc[adata_martin_geo_raw.obs['Subject']=='pat. 5'].index
ids5 = []
for i in idx5:
    ids5.append(i.split('.')[0])
len(set(ids5))
set(ids5)

{'GSM3972009_69', 'GSM3972010_68'}

thus fix index

In [118]:
idx = adata_martin.obs.index
idx_new = []
for i, val in enumerate(idx):
    out = adata_martin.obs['Subject'][i].replace(" ", "").replace('.','') + '_' + val.split('_')[1]
    idx_new.append(out)

In [119]:
adata_martin.obs.index = idx_new

In [120]:
adata_martin.obs.head()

Unnamed: 0,CELL,CONDITION,Sample_geo_accession,Sample_title,Subject,tissue,status,10x chemistry,Sample_relation,Sample_relation_2,...,ID_REF,percent_mito,n_counts,n_genes,louvain,cell_compartment,celltype,celltype_finegrained,cluster_celltype,celltype_dream
pat5_69.AAACATACACACCA-1,GSM3972009_69.AAACATACACACCA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.012165,1233.0,500,5,immune,T cell,Resident memory T cell,5: T cell,memory.CD4.T.cells
pat5_69.AAACATTGGTGTCA-1,GSM3972009_69.AAACATTGGTGTCA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.012548,4142.0,1277,22,stroma,Fibroblast,Activated Fibroblast,22: Fibroblast,fibroblasts
pat5_69.AAACGCACTTAGGC-1,GSM3972009_69.AAACGCACTTAGGC-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.006716,5806.0,1727,22,stroma,Fibroblast,Activated Fibroblast,22: Fibroblast,fibroblasts
pat5_69.AAACGCTGCTACCC-1,GSM3972009_69.AAACGCTGCTACCC-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.010526,1327.0,627,7,immune,T cell,Regulatory T cell,7: T cell,regulatory.T.cells
pat5_69.AAACTTGAGTCACA-1,GSM3972009_69.AAACTTGAGTCACA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.013407,3803.0,1327,24,stroma,Pericyte,Pericyte,24: Pericyte,others


### geosketch output analysis

check if all subjects still represented, likewise for cell types

In [19]:
# 11 samples in orig ds
adata_martin.obs['Subject'].values.unique()

[pat. 5, pat. 6, pat. 7, pat. 8, pat. 10, ..., pat. 12, pat. 13, pat. 14, pat. 15, pat. 16]
Length: 11
Categories (11, object): [pat. 5, pat. 6, pat. 7, pat. 8, ..., pat. 13, pat. 14, pat. 15, pat. 16]

In [18]:
# 50 samples in original ds
adata_brca.obs['ID'].values.unique()

[BC01_BLOOD1, BC01_BLOOD3, BC01_NORMAL1, BC01_NORMAL2, BC01_NORMAL3, ..., BC07_TUMOR4, BC08_NORMAL1, BC08_TUMOR1, BC08_TUMOR2, BC08_TUMOR3]
Length: 50
Categories (50, object): [BC01_BLOOD1, BC01_BLOOD3, BC01_NORMAL1, BC01_NORMAL2, ..., BC08_NORMAL1, BC08_TUMOR1, BC08_TUMOR2, BC08_TUMOR3]

In [27]:
adata_brca_geo_raw = sc.read('./brca_geosketch_raw.h5ad')
adata_martin_geo_raw = sc.read('./martin_geosketch_raw.h5ad')

In [20]:
# still 11 subjects
adata_martin_geo.obs['Subject'].values.unique()

[pat. 5, pat. 6, pat. 7, pat. 8, pat. 10, ..., pat. 12, pat. 13, pat. 14, pat. 15, pat. 16]
Length: 11
Categories (11, object): [pat. 5, pat. 6, pat. 7, pat. 8, ..., pat. 13, pat. 14, pat. 15, pat. 16]

In [21]:
# still 50 subjects
adata_brca_geo_raw.obs['ID'].values.unique()

[BC01_BLOOD1, BC01_BLOOD3, BC01_NORMAL1, BC01_NORMAL2, BC01_NORMAL3, ..., BC07_TUMOR4, BC08_NORMAL1, BC08_TUMOR1, BC08_TUMOR2, BC08_TUMOR3]
Length: 50
Categories (50, object): [BC01_BLOOD1, BC01_BLOOD3, BC01_NORMAL1, BC01_NORMAL2, ..., BC08_NORMAL1, BC08_TUMOR1, BC08_TUMOR2, BC08_TUMOR3]

now check cell types

In [29]:
# geosketch missing 'naive.CD8.T.cells'
x = adata_brca.obs['celltype_dream'].values.unique().tolist()
y = adata_brca_geo_raw.obs['celltype_dream'].values.unique().tolist()
set(x) - set(y)

{'naive.CD8.T.cells'}

In [28]:
# geosketch missing 'naive.CD8.T.cells'
x = adata_martin.obs['celltype_dream'].values.unique().tolist()
y = adata_martin_geo_raw.obs['celltype_dream'].values.unique().tolist()
set(x) - set(y)

{'naive.CD8.T.cells'}

### Convert to SingleCellExperiment R Object

saves anndata as SCE object. Postprocess in R

based on tutorial here: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb

In [129]:
import anndata2ri
anndata2ri.activate()

In [130]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [132]:
%%R -i adata_martin_raw
saveRDS(adata_martin_raw, 'martin_raw_sce.RDS')

In [133]:
%%R -i adata_brca_raw
saveRDS(adata_brca_raw, 'brca_raw_sce.RDS')

In [38]:
# do brca
adata_brca_sub = sc.read('brca_geosketch.h5ad')

In [41]:
%%R -i adata_brca_sub
saveRDS(adata_brca_sub, 'brca_sce.RDS')

In [80]:
%%R -i adata_martin_raw
saveRDS(adata_martin_raw, 'martin_raw_sce.RDS')

In [81]:
%%R -i adata_brca_raw
saveRDS(adata_brca_raw, 'brca_raw_sce.RDS')

In [None]:
#%%R -i adata_martin
#library(Seurat)
#martin.seurat <- as.Seurat(adata_martin)
#saveRDS(martin.seurat, 'martin_seurat.RDS')

R[write to console]: 
 *** caught segfault ***

R[write to console]: address 0x10, cause 'memory not mapped'

R[write to console]: 
Traceback:

R[write to console]:  1: 
R[write to console]: py_initialize(config$python, config$libpython, config$pythonhome, 
R[write to console]:     config$virtualenv_activate, config$version >= "3.0", interactive(), 
R[write to console]:     numpy_load_error)
R[write to console]: 

R[write to console]:  2: 
R[write to console]: doTryCatch(return(expr), name, parentenv, handler)
R[write to console]: 

R[write to console]:  3: 
R[write to console]: tryCatchOne(expr, names, parentenv, handlers[[1L]])
R[write to console]: 

R[write to console]:  4: 
R[write to console]: tryCatchList(expr, classes, parentenv, handlers)
R[write to console]: 

R[write to console]:  5: 
R[write to console]: tryCatch({
R[write to console]:     py_initialize(config$python, config$libpython, config$pythonhome, 
R[write to console]:         config$virtualenv_activate, config$versio

R[write to console]:     .Internal(eval(expr, envir, enclos)), target = new("signature", 
R[write to console]:         .Data = "ANY", names = "expr", package = "methods"), 
R[write to console]:         defined = new("signature", .Data = "ANY", names = "expr", 
R[write to console]:             package = "methods"), generic = "eval"), skeleton = (new("derivedDefaultMethod", 
R[write to console]:         .Data = function (expr, envir = parent.frame(), enclos = if (is.list(envir) || 
R[write to console]:             is.pairlist(envir)) parent.frame() else baseenv()) 
R[write to console]:         .Internal(eval(expr, envir, enclos)), target = new("signature", 
R[write to console]:             .Data = "ANY", names = "expr", package = "methods"), 
R[write to console]:         defined = new("signature", .Data = "ANY", names = "expr", 
R[write to console]:             package = "methods"), generic = "eval"))(expr, envir, 
R[write to console]:         enclos)))(expression(withVisible({
R[write t

Selection: 4
Selection: 4
Selection: 
Selection: 
Selection: 
Selection: 
Selection: 
Selection: 
Selection: 
Selection: x


From cffi callback <function _consoleread at 0x15c519ef0>:
Traceback (most recent call last):
  File "/Users/mqp/miniconda3/envs/scanpy/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 884, in _input_request
    ident, reply = self.session.recv(self.stdin_socket, 0)
  File "/Users/mqp/miniconda3/envs/scanpy/lib/python3.7/site-packages/jupyter_client/session.py", line 813, in recv
    msg_list = socket.recv_multipart(mode, copy=copy)
  File "/Users/mqp/miniconda3/envs/scanpy/lib/python3.7/site-packages/zmq/sugar/socket.py", line 475, in recv_multipart
    parts = [self.recv(flags, copy=copy, track=track)]
  File "zmq/backend/cython/socket.pyx", line 791, in zmq.backend.cython.socket.Socket.recv
  File "zmq/backend/cython/socket.pyx", line 827, in zmq.backend.cython.socket.Socket.recv
  File "zmq/backend/cython/socket.pyx", line 186, in zmq.backend.cython.socket._recv_copy
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrup

Selection: q


### Notes converting SCE to eset in R

run from cmd line: `env R_MAX_VSIZE=100Gb R`

### Fix issues loading martin ds in R ReadH5AD: 
Error in file[["obs"]][] : object of type 'environment' is not subsettable

Note: these were not needed. the issue was with a bug in AnnData new version: https://github.com/satijalab/seurat/issues/2485

In [44]:
adata_martin_sub.obs.head()
# thus in R: 
# out.eset <- BisqueRNA::SeuratToExpressionSet(seurat, delimiter='_', position=2, version = 'v3')

Unnamed: 0_level_0,CELL,CONDITION,Sample_geo_accession,Sample_title,Subject,tissue,status,10x chemistry,Sample_relation,Sample_relation_2,...,ID_REF,percent_mito,n_counts,n_genes,louvain,cell_compartment,celltype,celltype_finegrained,cluster_celltype,celltype_dream
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM3972009_69.AAACATTGGTGTCA-1,GSM3972009_69.AAACATTGGTGTCA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.012548,4142.0,1277,22,stroma,Fibroblast,Activated Fibroblast,22: Fibroblast,fibroblasts
GSM3972009_69.AAAGCAGATTTCGT-1,GSM3972009_69.AAAGCAGATTTCGT-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.015983,1438.0,692,15,stroma,Endothelial cell,Blood vascular endothelial cell,15: Endothelial cell,endothelial.cells
GSM3972009_69.AAATGTTGGTGCTA-1,GSM3972009_69.AAATGTTGGTGCTA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.027858,4519.0,1331,22,stroma,Fibroblast,Activated Fibroblast,22: Fibroblast,fibroblasts
GSM3972009_69.AACCGCCTTCGACA-1,GSM3972009_69.AACCGCCTTCGACA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.015419,2723.0,1161,15,stroma,Endothelial cell,Blood vascular endothelial cell,15: Endothelial cell,endothelial.cells
GSM3972009_69.AACTCTTGCACACA-1,GSM3972009_69.AACTCTTGCACACA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.017823,4148.0,1518,22,stroma,Fibroblast,Activated Fibroblast,22: Fibroblast,fibroblasts


In [43]:
adata_brca_sub.obs.head()


Unnamed: 0_level_0,CELL,BioSample,CONDITION,GPL,Genome_build,ID,ID_REF,SRA,Sample_geo_accession,Sample_organism_ch1,cell type,donor,donor age (years),resident tissue,percent_mito,n_counts,n_genes,louvain,celltype_dream
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
121270378289958-BC01_BLOOD1,121270378289958-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.0837,1132.0,681,6,monocytes
126279346403757-BC01_BLOOD1,126279346403757-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.088715,1649.0,911,6,myeloid.dendritic.cells
130075866818845-BC01_BLOOD1,130075866818845-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.06127,1768.0,982,6,monocytes
135012925267174-BC01_BLOOD1,135012925267174-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.069473,2442.0,1292,6,monocytes
162071494146405-BC01_BLOOD1,162071494146405-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.109677,617.0,446,6,myeloid.dendritic.cells


In [46]:
adata_martin.obs.columns

Index(['CELL', 'CONDITION', 'Sample_geo_accession', 'Sample_title', 'Subject',
       'tissue', 'status', '10x chemistry', 'Sample_relation',
       'Sample_relation_2', 'Sample_supplementary_file_1',
       'Sample_supplementary_file_2', 'Sample_supplementary_file_3', 'ID_REF',
       'percent_mito', 'n_counts', 'n_genes', 'louvain', 'cell_compartment',
       'celltype', 'celltype_finegrained', 'cluster_celltype',
       'celltype_dream'],
      dtype='object')

In [3]:
rmkeys = ['neighbors', 'pca', 'rank_genes_groups', 'celltype_dream_colors']
for key in rmkeys:
    adata_martin.uns.pop(key, None)

In [58]:
adata_martin.uns.keys()

dict_keys(['louvain', 'louvain_colors'])

In [57]:
keep_keys = ['louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups']
ks = adata_martin.uns.copy().keys()
for k in ks:
    if k not in keep_keys:
        adata_martin.uns.pop(k, None)

In [4]:
adata_martin.write(filename='./martin_anndata07rc1.h5ad')

### Check SCDC error
Error in y[y < q15] <- q15[y < q15] :
NAs are not allowed in subscripted assignments
In addition: There were 50 or more warnings (use warnings() to see the first 50)

In FUN(newX[, i], ...) : no non-missing arguments to max; returning -Inf

In [11]:
# check if NaNs in datasets
np.isnan(adata_martin.X).any()

False

In [12]:
np.isnan(adata_brca.X).any()

False

### reduce number of features

scdc too slow

In [20]:
# remove others?
adata_brca.obs.loc[adata_brca.obs['celltype_dream']=='others']

Unnamed: 0_level_0,CELL,BioSample,CONDITION,GPL,Genome_build,ID,ID_REF,SRA,Sample_geo_accession,Sample_organism_ch1,cell type,donor,donor age (years),resident tissue,percent_mito,n_counts,n_genes,louvain,celltype_dream
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
121270375926581-BC01_BLOOD1,121270375926581-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.070922,562.0,397,3,others
121938124355446-BC01_BLOOD1,121938124355446-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.063171,2408.0,845,4,others
126834339069149-BC01_BLOOD1,126834339069149-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.098446,578.0,414,3,others
129586204366638-BC01_BLOOD1,129586204366638-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.081556,795.0,375,0,others
156590056269670-BC01_BLOOD1,156590056269670-BC01_BLOOD1,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,blood CD45+ leukocytes,GPL16791,Gencode GRCh38,BC01_BLOOD1,GSM3148585,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108636,GSM3148585,Homo sapiens,CD45+ leukocytes,BC01,38,blood,0.109347,564.0,373,3,others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197058962770845-BC08_TUMOR3,197058962770845-BC08_TUMOR3,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,breast tumor CD45+ leukocytes,GPL16791,Gencode GRCh38,BC08_TUMOR3,GSM3148640,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108691,GSM3148640,Homo sapiens,CD45+ leukocytes,BC08,72,breast tumor,0.048529,1933.0,907,4,others
197058964417380-BC08_TUMOR3,197058964417380-BC08_TUMOR3,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,breast tumor CD45+ leukocytes,GPL16791,Gencode GRCh38,BC08_TUMOR3,GSM3148640,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108691,GSM3148640,Homo sapiens,CD45+ leukocytes,BC08,72,breast tumor,0.058771,1122.0,746,3,others
197058966514012-BC08_TUMOR3,197058966514012-BC08_TUMOR3,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,breast tumor CD45+ leukocytes,GPL16791,Gencode GRCh38,BC08_TUMOR3,GSM3148640,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108691,GSM3148640,Homo sapiens,CD45+ leukocytes,BC08,72,breast tumor,0.050257,1749.0,1016,3,others
200355754961310-BC08_TUMOR3,200355754961310-BC08_TUMOR3,https://www.ncbi.nlm.nih.gov/biosample/SAMN092...,breast tumor CD45+ leukocytes,GPL16791,Gencode GRCh38,BC08_TUMOR3,GSM3148640,https://www.ncbi.nlm.nih.gov/sra?term=SRX4108691,GSM3148640,Homo sapiens,CD45+ leukocytes,BC08,72,breast tumor,0.037879,528.0,379,2,others


In [22]:
adata_martin.obs.loc[adata_martin.obs['celltype_dream']=='others']

Unnamed: 0_level_0,CELL,CONDITION,Sample_geo_accession,Sample_title,Subject,tissue,status,10x chemistry,Sample_relation,Sample_relation_2,...,ID_REF,percent_mito,n_counts,n_genes,louvain,cell_compartment,celltype,celltype_finegrained,cluster_celltype,celltype_dream
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM3972009_69.AAACTTGAGTCACA-1,GSM3972009_69.AAACTTGAGTCACA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.013407,3803.0,1327,24,stroma,Pericyte,Pericyte,24: Pericyte,others
GSM3972009_69.AAACTTGATCACCC-1,GSM3972009_69.AAACTTGATCACCC-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.006319,6330.0,864,12,immune,Plasma cell,IgG plasma cell or plasmablast,12: Plasma cell,others
GSM3972009_69.AAAGACGAATCACG-1,GSM3972009_69.AAAGACGAATCACG-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.015349,10294.0,2204,20,immune,T cell,Cycling T cell,20: T cell,others
GSM3972009_69.AAAGATCTTATCGG-1,GSM3972009_69.AAAGATCTTATCGG-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.004392,15480.0,920,12,immune,Plasma cell,IgG plasma cell or plasmablast,12: Plasma cell,others
GSM3972009_69.AAAGCAGATTGGCA-1,GSM3972009_69.AAAGCAGATTGGCA-1,Involved,GSM3972009,Ileal Involved 69,pat. 5,ileal,Involved,V1,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972009,0.019023,5467.0,1233,12,immune,Plasma cell,IgG plasma cell or plasmablast,12: Plasma cell,others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM3972030_209.TTTGGTTTCAAACGGG-1,GSM3972030_209.TTTGGTTTCAAACGGG-1,Involved,GSM3972030,Ileal Involved 209,pat. 16,ileal,Involved,V2,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972030,0.060678,7136.0,1289,14,immune,Plasma cell,IgA or IgM plasma cell,14: Plasma cell,others
GSM3972030_209.TTTGTCAAGCATCATC-1,GSM3972030_209.TTTGTCAAGCATCATC-1,Involved,GSM3972030,Ileal Involved 209,pat. 16,ileal,Involved,V2,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972030,0.121032,1512.0,533,4,immune,Plasma cell,IgA or IgM plasma cell,4: Plasma cell,others
GSM3972030_209.TTTGTCAAGGTGATTA-1,GSM3972030_209.TTTGTCAAGGTGATTA-1,Involved,GSM3972030,Ileal Involved 209,pat. 16,ileal,Involved,V2,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972030,0.024231,11802.0,1626,14,immune,Plasma cell,IgA or IgM plasma cell,14: Plasma cell,others
GSM3972030_209.TTTGTCAGTCTCCCTA-1,GSM3972030_209.TTTGTCAGTCTCCCTA-1,Involved,GSM3972030,Ileal Involved 209,pat. 16,ileal,Involved,V2,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,...,GSM3972030,0.048219,4998.0,795,4,immune,Plasma cell,IgA or IgM plasma cell,4: Plasma cell,others


### determine missing celltypes for each dataset 

In [25]:
ct_b = adata_brca.obs['celltype_dream'].values.unique().tolist()

In [26]:
ct_m = adata_martin.obs['celltype_dream'].values.unique().tolist()

In [29]:
ct_s = adata_smillie.obs['celltype_dream'].values.unique().tolist()

In [13]:
# dream celltypes 
ct = """memory.B.cells
naive.B.cells
memory.CD4.T.cells
naive.CD4.T.cells
regulatory.T.cells
memory.CD8.T.cells
naive.CD8.T.cells
NK.cells
neutrophils
monocytes
myeloid.dendritic.cells
macrophages
fibroblasts
endothelial.cells""".split()

In [38]:
# intersection dream x brca
set(ct) & set(ct_b)

{'NK.cells',
 'endothelial.cells',
 'fibroblasts',
 'macrophages',
 'memory.B.cells',
 'memory.CD4.T.cells',
 'memory.CD8.T.cells',
 'monocytes',
 'myeloid.dendritic.cells',
 'naive.B.cells',
 'naive.CD4.T.cells',
 'naive.CD8.T.cells',
 'neutrophils',
 'regulatory.T.cells'}

In [42]:
list(set(ct) - set(ct_m))

['monocytes', 'neutrophils']

In [43]:
# brca dataset has all required celltypes
list(set(ct) - set(ct_b))

[]

In [27]:
# intersect brca x martin
set(ct_b) & set(ct_m)

{'NK.cells',
 'endothelial.cells',
 'fibroblasts',
 'macrophages',
 'memory.B.cells',
 'memory.CD4.T.cells',
 'memory.CD8.T.cells',
 'myeloid.dendritic.cells',
 'naive.B.cells',
 'naive.CD4.T.cells',
 'naive.CD8.T.cells',
 'others',
 'regulatory.T.cells'}

In [30]:
# brca x smillie
set(ct_b) & set(ct_s)

{'NK.cells',
 'endothelial.cells',
 'fibroblasts',
 'macrophages',
 'memory.CD4.T.cells',
 'memory.CD8.T.cells',
 'monocytes',
 'myeloid.dendritic.cells',
 'others',
 'regulatory.T.cells'}

In [19]:
print(','.join("'{0}'".format(w) for w in ct))

'memory.B.cells','naive.B.cells','memory.CD4.T.cells','naive.CD4.T.cells','regulatory.T.cells','memory.CD8.T.cells','naive.CD8.T.cells','NK.cells','neutrophils','monocytes','myeloid.dendritic.cells','macrophages','fibroblasts','endothelial.cells'


'memory.B.cells','naive.B.cells','memory.CD4.T.cells','naive.CD4.T.cells','regulatory.T.cells','memory.CD8.T.cells','naive.CD8.T.cells','NK.cells','neutrophils','monocytes','myeloid.dendritic.cells','macrophages','fibroblasts','endothelial.cells'