In [1]:
import os
import numpy as np
import scipy as sp
import pandas as pd
import scanpy.api as sc
from gprofiler import GProfiler



In [2]:
# Directory that holds the input files and receives the .h5ad outputs.
# Set the DING_LEVIN_DATA_DIR environment variable to point elsewhere; the
# original location is kept as the default so existing runs are unaffected.
file_path = os.environ.get('DING_LEVIN_DATA_DIR',
                           '/data/martin/single_cell/Ding_Levin_biorxiv_2019')

In [3]:
# UMI: read data
# Cell names: one per line, surrounding whitespace removed.
with open(file_path + '/cells.umi.txt') as f:
    list_cell = [entry.strip() for entry in f]
# Gene names: keep only the text after the last '_' of each stripped line.
with open(file_path + '/genes.umi.txt') as f:
    list_gene = [entry.strip().split('_')[-1] for entry in f]
# Raw lines of the count file (header lines included).
with open(file_path + '/counts.umi.txt') as f:
    list_line = f.readlines()
# specify row and column
# NOTE(review): dimensions are hard-coded; presumably they match the header of
# counts.umi.txt and the lengths of the gene / cell lists -- verify.
n_row = 33694
n_col = 44433
# Parse "<row> <col> <count>" triplets, skipping the first two lines of the
# file; indices in the file are 1-based and are shifted to 0-based here.
list_row, list_col, list_ct = [], [], []
for entry in list_line[2:]:
    fields = entry.strip().split(' ')
    list_row.append(int(fields[0]) - 1)
    list_col.append(int(fields[1]) - 1)
    list_ct.append(int(fields[2]))

In [4]:
# generate anndata object
# The sparse matrix is assembled with shape (n_row, n_col) from the parsed
# triplets and then transposed, so that rows of the AnnData object are labelled
# by list_cell and columns by list_gene.
X = sp.sparse.csc_matrix(
    (list_ct, (list_row, list_col)),
    shape=(n_row, n_col),
)
anndata = sc.AnnData(X=X.T)
anndata.obs_names, anndata.var_names = list_cell, list_gene

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [5]:
# write files
# One .h5ad file per (data, method) pair that matches at least one cell.
# A cell matches when the first two '_'-separated fields of its name equal
# the pair; pairs with no cells are reported and skipped.
for data in ['pbmc1', 'pbmc2']:
    for method in ['10xChromiumv2', '10xChromiumv2A', '10xChromiumv2B']:
        wanted = (data, method)
        ind_select = []
        for cell_name in list_cell:
            fields = cell_name.split('_')
            ind_select.append((fields[0], fields[1]) == wanted)
        ind_select = np.array(ind_select)
        n_select = np.sum(ind_select)
        print(data, method, n_select)
        if n_select != 0:
            temp_anndata = anndata[ind_select, :]
            temp_anndata.write(file_path + '/%s.%s.h5ad'%(data, method))

pbmc1 10xChromiumv2 0
pbmc1 10xChromiumv2A 5184


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


pbmc1 10xChromiumv2B 3222


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


pbmc2 10xChromiumv2 3362
pbmc2 10xChromiumv2A 0
pbmc2 10xChromiumv2B 0


In [6]:
# bulk: PBMC 1
# Load the RSEM result table and index it by gene ID.
temp_df = pd.read_csv(file_path + '/rsem.bulk.pbmc1.results', sep='\t')
temp_df.index = temp_df['gene_id']
# find gene name
gp = GProfiler(return_dataframe=True)
id_2_symbol = gp.convert(organism='hsapiens', query=list(temp_df.index),
                         target_namespace='ENTREZGENE_ACC')
id_2_symbol.index = id_2_symbol['incoming']
# g:Convert may return several rows for one queried ID (one per converted
# accession). Joining on a non-unique index would repeat that gene's row in
# temp_df, so keep only the first mapping per queried ID before joining.
symbol_by_id = id_2_symbol.loc[~id_2_symbol.index.duplicated(keep='first'), ['name']]
temp_df = temp_df.join(symbol_by_id)
# NOTE(review): IDs with no mapping presumably end up with a missing or
# placeholder name here, and distinct IDs can share one name -- verify.
temp_df.index = temp_df['name']
# convert to anndata and write
# Single-row object: one observation ('TPM') with one variable per gene row.
anndata = sc.AnnData(temp_df[['TPM']].T)
anndata.write(file_path + '/rsem.bulk.pbmc1.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [7]:
# bulk: PBMC 2
# Load the RSEM result table and index it by gene ID.
temp_df = pd.read_csv(file_path + '/rsem.bulk.pbmc2.results', sep='\t')
temp_df.index = temp_df['gene_id']
# find gene name
gp = GProfiler(return_dataframe=True)
id_2_symbol = gp.convert(organism='hsapiens', query=list(temp_df.index),
                         target_namespace='ENTREZGENE_ACC')
id_2_symbol.index = id_2_symbol['incoming']
# g:Convert may return several rows for one queried ID (one per converted
# accession). Joining on a non-unique index would repeat that gene's row in
# temp_df, so keep only the first mapping per queried ID before joining.
symbol_by_id = id_2_symbol.loc[~id_2_symbol.index.duplicated(keep='first'), ['name']]
temp_df = temp_df.join(symbol_by_id)
# NOTE(review): IDs with no mapping presumably end up with a missing or
# placeholder name here, and distinct IDs can share one name -- verify.
temp_df.index = temp_df['name']
# convert to anndata and write
# Single-row object: one observation ('TPM') with one variable per gene row.
anndata = sc.AnnData(temp_df[['TPM']].T)
anndata.write(file_path + '/rsem.bulk.pbmc2.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
