# Combine and Clean Retina Data

The purpose of this notebook is to clean the digital gene expression data for the first batch of retina cells and to make small subsets of it for teaching. The data were downloaded from the Gene Expression Omnibus using the accession number [GSE63473](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63473).

In [11]:
import os
import common

# Assign notebook and folder names
notebook_name = '05_make_rentina_subsets_for_teaching'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders 
! mkdir -p $figure_folder
! mkdir -p $data_folder

Figure folder: ../figures/05_make_rentina_subsets_for_teaching
Data folder: ../data/05_make_rentina_subsets_for_teaching


In [161]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [162]:
# Folder holding the original downloaded files
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

# Digital expression matrix for the P14Retina_1 sample
filename = os.path.join(
    input_folder,
    'GSM1626793_P14Retina_1.digital_expression.txt.gz')
filename

'../data/00_original/GSM1626793_P14Retina_1.digital_expression.txt.gz'

In [163]:
# Read the gzipped, tab-separated matrix using its first column as the index,
# then transpose it so the file's columns become the rows (one per barcode).
table1 = pd.read_csv(
    filename, sep='\t', index_col=0, compression='gzip').transpose()
print(table1.shape)
table1.head()

(6600, 20478)


gene,10:100015630-100100413:Kitl,10:100443902-100487350:Tmtc3,10:100488289-100573655:Cep290,10:100572274-100589259:4930430F08Rik,10:100592386-100618391:1700017N19Rik,10:101681487-102391469:Mgat4c,10:102512222-102546560:Rassf9,10:103063198-103236322:Lrriq1,10:10335703-10472326:Adgb,10:103367808-103419378:Slc6a15,...,X:99136130-99148991:Efnb1,X:99465734-99471273:Pja1,X:99821021-99848790:Tmem28,X:99975606-100400762:Eda,Y:1010543-1028847:Eif2s3y,Y:10640942-10643315:Gm20775,Y:1096861-1245759:Uty,Y:1260715-1286613:Ddx3y,Y:897788-943811:Kdm5d,Y:991630-991748:n-R5s1
GGCCGCAGTCCG,0,3,1,2,0,0,0,0,0,4,...,0,8,1,0,0,0,0,0,0,0
CTTGTGCGGGAA,0,0,3,1,0,0,0,0,0,1,...,0,9,0,0,2,0,1,5,0,0
GCGCAACTGCTC,1,0,0,2,0,4,0,0,0,3,...,1,11,0,0,0,0,0,0,0,0
GATTGGGAGGCA,0,0,2,0,0,1,0,0,0,2,...,1,2,0,0,3,0,1,0,0,0
CCTCCTAGTTGG,0,2,1,1,0,2,0,0,0,1,...,0,3,0,0,0,0,0,0,0,0


In [164]:
# Column labels whose text ends in 'Nav1' (the same symbol can appear under
# more than one chrom:start-end interval)
nav1 = [label for label in table1.columns if label.endswith('Nav1')]
nav1

['1:135434580-135585355:Nav1', '1:135606447-135688105:Nav1']

In [166]:
# Peek at the counts in the Nav1 columns for the first few cells
table1.loc[:, nav1].head()

gene,1:135434580-135585355:Nav1,1:135606447-135688105:Nav1
GGCCGCAGTCCG,6,0
CTTGTGCGGGAA,5,0
GCGCAACTGCTC,2,0
GATTGGGAGGCA,2,0
CCTCCTAGTTGG,4,0



## Clean expression matrix to be compatible with the cluster labels and identities

Currently, cells are labeled by their barcode, e.g. `GCGCAACTGCTC`, and genes are labeled by their chrom:start-end:symbol, e.g. `6:51460434-51469894:Hnrnpa2b1`. However, in the supplementary data the gene symbols are all uppercase, e.g. `HNRNPA2B1` (which is incorrect, since this is mouse data), and the barcodes have `r1_` prepended to the id, e.g. `r1_GCGCAACTGCTC`.

So we need to clean the expression matrix so that its gene and cell labels are compatible with the supplementary data.

In [168]:
# Keep only the text after the last ':' of each column label (the gene symbol)
# and uppercase it; the resulting index is named 'symbol'.
gene_symbols = pd.Index(
    [label.split(':')[-1].upper() for label in table1.columns],
    name='symbol')
table1.columns = gene_symbols
table1.head()

symbol,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,EFNB1,PJA1,TMEM28,EDA,EIF2S3Y,GM20775,UTY,DDX3Y,KDM5D,N-R5S1
GGCCGCAGTCCG,0,3,1,2,0,0,0,0,0,4,...,0,8,1,0,0,0,0,0,0,0
CTTGTGCGGGAA,0,0,3,1,0,0,0,0,0,1,...,0,9,0,0,2,0,1,5,0,0
GCGCAACTGCTC,1,0,0,2,0,4,0,0,0,3,...,1,11,0,0,0,0,0,0,0,0
GATTGGGAGGCA,0,0,2,0,0,1,0,0,0,2,...,1,2,0,0,3,0,1,0,0,0
CCTCCTAGTTGG,0,2,1,1,0,2,0,0,0,1,...,0,3,0,0,0,0,0,0,0,0


### Sum gene symbols with same name

In [169]:
# After renaming, several columns can share one gene symbol (e.g. the two Nav1
# intervals), so add the counts of identically named columns together to get a
# single column per symbol.
# The grouping is done on the transposed frame because `groupby(axis=1)` is
# deprecated in recent pandas; transposing back gives the same cells x symbols
# table with the columns sorted by symbol.
table1 = table1.T.groupby(level=0).sum().T
print(table1.shape)
table1.head()

(6600, 20426)


symbol,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
GGCCGCAGTCCG,0,0,3,0,2,7,0,1,1,6,...,2,0,0,53,0,1,4,0,1,0
CTTGTGCGGGAA,0,1,1,0,1,5,0,0,1,5,...,0,0,0,65,0,1,6,1,3,2
GCGCAACTGCTC,0,0,1,0,3,7,0,0,1,9,...,1,0,0,38,0,0,2,0,0,0
GATTGGGAGGCA,0,0,2,2,1,1,0,0,2,2,...,0,0,0,17,0,0,2,0,2,0
CCTCCTAGTTGG,0,0,1,0,1,2,0,0,1,8,...,1,0,0,38,0,0,5,0,0,0


In [170]:
%%time
csv = os.path.join(data_folder, 'retina_batch1_expression.csv')
table1.to_csv(csv)
! ls -lh $csv

-rw-r--r--  1 olgabot  staff   257M Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/retina_batch1_expression.csv
CPU times: user 1min 48s, sys: 16 s, total: 2min 4s
Wall time: 2min 4s


### Add `r1_` prefix to barcodes to indicate the first run

In [171]:
# Prefix every barcode with 'r1_' to mark it as coming from the first run.
# Barcodes that already carry the prefix are left untouched, so re-running this
# cell does not produce 'r1_r1_...' labels.
table1.index = pd.Index(
    [barcode if barcode.startswith('r1_') else 'r1_' + barcode
     for barcode in table1.index],
    name=table1.index.name)
table1.head()

symbol,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_GGCCGCAGTCCG,0,0,3,0,2,7,0,1,1,6,...,2,0,0,53,0,1,4,0,1,0
r1_CTTGTGCGGGAA,0,1,1,0,1,5,0,0,1,5,...,0,0,0,65,0,1,6,1,3,2
r1_GCGCAACTGCTC,0,0,1,0,3,7,0,0,1,9,...,1,0,0,38,0,0,2,0,0,0
r1_GATTGGGAGGCA,0,0,2,2,1,1,0,0,2,2,...,0,0,0,17,0,0,2,0,2,0
r1_CCTCCTAGTTGG,0,0,1,0,1,2,0,0,1,8,...,1,0,0,38,0,0,5,0,0,0


## Add retinal cell cluster metadata

In [172]:
# Load the cluster assignment of each cell as a Series indexed by cell id.
# `.squeeze('columns')` turns the single-column table into a Series; it
# replaces the `squeeze=True` argument of `read_csv`, which has been removed
# from recent pandas.
csv = os.path.join(common.DATA_FOLDER, '03_clean_cluster_assignments',
                   'cluster_names.csv')
cluster_names = pd.read_csv(csv, index_col=0).squeeze('columns')
print(cluster_names.shape)
cluster_names.head()

(44808,)


cell
r1_GGCCGCAGTCCG    cluster_02
r1_CTTGTGCGGGAA    cluster_02
r1_GCGCAACTGCTC    cluster_02
r1_GATTGGGAGGCA    cluster_02
r1_GTGCCGCCTCTC    cluster_25
Name: cluster_id, dtype: object

In [173]:
# Load the cell x cluster membership table (one column per cluster).
# The former `squeeze=True` argument is dropped: it only has an effect on
# single-column files, this table has many columns, and the argument has been
# removed from recent pandas.
csv = os.path.join(common.DATA_FOLDER, '03_clean_cluster_assignments',
                   'cluster_bools.csv')
cluster_bools = pd.read_csv(csv, index_col=0)
print(cluster_bools.shape)
cluster_bools.head()

(44808, 39)


Unnamed: 0_level_0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
r1_GGCCGCAGTCCG,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CTTGTGCGGGAA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GCGCAACTGCTC,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GATTGGGAGGCA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTGCCGCCTCTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Add gene metadata

In [174]:
# Load the per-gene boolean table (retina_* and cellcycle_* columns),
# indexed by gene symbol
csv = os.path.join(
    common.DATA_FOLDER,
    '04_extract_data_from_supplementary_excel_files',
    'mouse_gene_metadata.csv')
gene_metadata = pd.read_csv(csv, index_col=0)
gene_metadata.head()

Unnamed: 0_level_0,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_38,retina_39,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaas,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
Acat2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
Acot9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
Actb,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Adar,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


## Make subsets for teaching

### Dropout demo - Equal sized clusters

Only use cells from the first batch, and only use differentially expressed genes from the biggest clusters.

#### Find biggest clusters

In [175]:
# Count how many rows (cells) of table1 fall in each cluster; the grouping key
# is the `cluster_names` series, matched to table1 by cell id.
cluster_sizes_table1 = table1.groupby(by=cluster_names).size()
cluster_sizes_table1

cluster_id
cluster_01      27
cluster_02      70
cluster_03      44
cluster_04      13
cluster_05      18
cluster_06      42
cluster_07      54
cluster_08      23
cluster_09      42
cluster_10      29
cluster_11      31
cluster_12      47
cluster_13      11
cluster_14      10
cluster_15      13
cluster_16      36
cluster_17      72
cluster_18      14
cluster_19      19
cluster_20      64
cluster_21      47
cluster_22      46
cluster_23      39
cluster_24    3746
cluster_25     241
cluster_26     317
cluster_27     126
cluster_28      56
cluster_29      85
cluster_30      87
cluster_31      80
cluster_32      54
cluster_33     114
cluster_34     244
cluster_35       4
cluster_36      13
cluster_37      24
cluster_38       9
cluster_39       9
dtype: int64

In [176]:
# "Big" clusters are those with more than 100 cells in table1
big_clusters = cluster_sizes_table1.loc[lambda sizes: sizes > 100]
big_clusters

cluster_id
cluster_24    3746
cluster_25     241
cluster_26     317
cluster_27     126
cluster_33     114
cluster_34     244
dtype: int64

In [177]:
# Flag every cell in `cluster_names` whose cluster is one of the big ones, then
# keep only the flagged cells (the result is an all-True series whose index is
# the selected cell ids).
cells_in_big_clusters = cluster_names.isin(big_clusters.index).loc[
    lambda flagged: flagged]
cells_in_big_clusters.sum()

36622

In [178]:
# Inner-join on the row labels: keep only the cells present in both table1 and
# the big-cluster selection.
table1_big_clusters, y = table1.align(
    cells_in_big_clusters, join='inner', axis=0)
print(table1_big_clusters.shape)
print(y.shape)

(4788, 20426)
(4788,)


### Take 50 random cells from each cluster

In [179]:
# Fix the seed so the same cells are drawn on every run
np.random.seed(2017)

n_cells = 50


def _draw_cells(cluster_cells):
    """Return `n_cells` rows of one cluster, drawn at random without replacement."""
    picked = np.random.choice(cluster_cells.index, size=n_cells, replace=False)
    return cluster_cells.loc[picked]


# Sample within each cluster separately so every cluster contributes equally
table1_big_clusters_subset = table1_big_clusters.groupby(
    cluster_names, as_index=False, group_keys=False).apply(_draw_cells)
print(table1_big_clusters_subset.shape)
table1_big_clusters_subset.head()

(300, 20426)


symbol,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_TTCCTGCTAGGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TGGAGATACTCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CGTCTACATCCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CAAGCTTGGCGC,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
r1_ACTCACATAGAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Subset the cell metadata too

In [180]:
# Membership table restricted to the sampled cells (rows) and the big
# clusters (columns)
cell_metadata_big_clusters = cluster_bools.loc[
    table1_big_clusters_subset.index,
    big_clusters.index]
print(cell_metadata_big_clusters.shape)
cell_metadata_big_clusters.head()

(300, 6)


cluster_id,cluster_24,cluster_25,cluster_26,cluster_27,cluster_33,cluster_34
r1_TTCCTGCTAGGC,1,0,0,0,0,0
r1_TGGAGATACTCT,1,0,0,0,0,0
r1_CGTCTACATCCG,1,0,0,0,0,0
r1_CAAGCTTGGCGC,1,0,0,0,0,0
r1_ACTCACATAGAG,1,0,0,0,0,0


### Subset genes by differential expression

In [181]:
# Read the gene metadata table and report its size.
# Note that `filename` now points at the metadata CSV, not the expression file.
filename = os.path.join(
    common.DATA_FOLDER,
    '04_extract_data_from_supplementary_excel_files',
    'mouse_gene_metadata.csv')

gene_metadata = pd.read_csv(filename, index_col=0)
print(gene_metadata.shape)
gene_metadata.head()

(2007, 47)


Unnamed: 0_level_0,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_38,retina_39,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaas,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
Acat2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
Acot9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
Actb,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Adar,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


#### Get genes from the big clusters

In [182]:
# The gene metadata columns are named 'retina_NN' where the cluster tables use
# 'cluster_NN', so relabel the big clusters to match.
index = big_clusters.index.str.replace('cluster', 'retina')
big_retina = pd.Series(big_clusters.values, index=index)
big_retina

cluster_id
retina_24    3746
retina_25     241
retina_26     317
retina_27     126
retina_33     114
retina_34     244
dtype: int64

In [183]:
# A gene is selected when it is flagged True for at least one big cluster
in_big_clusters = gene_metadata.loc[:, big_retina.index].any(axis=1)
print(in_big_clusters.sum())
in_big_clusters.head()

259


gene_symbol
    Aaas     False
    Acat2    False
    Acot9    False
    Actb     False
    Adar     False
dtype: bool

In [184]:
# Gene symbols of the selected (True) genes only
genes_in_big_clusters = in_big_clusters.index[in_big_clusters.values]
genes_in_big_clusters

Index(['2010107E04RIK', '4930447C04RIK', 'A930011O12RIK', 'ABCA8A', 'ABLIM1',
       'ACSL3', 'AIPL1', 'ALDOC', 'ANK3', 'APLP2',
       ...
       'VEGFA', 'VIM', 'VSTM2B', 'VSX1', 'VSX2', 'WIPI1', 'YWHAB', 'ZBTB20',
       'ZFP365', 'ZFP36L1'],
      dtype='object', name='gene_symbol', length=259)

#### Perform the subset

In [185]:
# Expression of the sampled cells, restricted to the selected genes
table1_big_clusters_subset_genes = table1_big_clusters_subset.loc[
    :, genes_in_big_clusters]
print(table1_big_clusters_subset_genes.shape)
table1_big_clusters_subset_genes.head()

(300, 259)


gene_symbol,2010107E04RIK,4930447C04RIK,A930011O12RIK,ABCA8A,ABLIM1,ACSL3,AIPL1,ALDOC,ANK3,APLP2,...,VEGFA,VIM,VSTM2B,VSX1,VSX2,WIPI1,YWHAB,ZBTB20,ZFP365,ZFP36L1
r1_TTCCTGCTAGGC,2,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TGGAGATACTCT,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
r1_CGTCTACATCCG,2,0,0,0,0,0,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
r1_CAAGCTTGGCGC,0,0,11,0,1,0,6,0,0,2,...,0,0,0,0,0,0,0,0,1,0
r1_ACTCACATAGAG,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,0,0


In [186]:
# Gene metadata restricted to the selected genes (rows) and the big clusters'
# 'retina_NN' columns
gene_metadata_big_clusters = gene_metadata.loc[
    genes_in_big_clusters,
    big_retina.index]
print(gene_metadata_big_clusters.shape)
gene_metadata_big_clusters.head()

(259, 6)


cluster_id,retina_24,retina_25,retina_26,retina_27,retina_33,retina_34
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010107E04RIK,False,False,True,False,False,False
4930447C04RIK,False,True,False,False,False,False
A930011O12RIK,False,False,False,False,False,True
ABCA8A,False,False,False,False,False,True
ABLIM1,False,False,True,False,False,False


### Save to csv

In [187]:
csv = os.path.join(data_folder, 'big_clusters_expression.csv')
table1_big_clusters_subset_genes.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff   159K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/big_clusters_expression.csv
,2010107E04RIK,4930447C04RIK,A930011O12RIK,ABCA8A,ABLIM1,ACSL3,AIPL1,ALDOC,ANK3,APLP2,APOE,APP,AQP4,ARR3,ATP1A1,ATP1B3,ATP2B1,B3GALT2,BC030499,BEX2,BSG,CABP5,CACNA2D1,CACNA2D3,CACNG4,CADPS,CALM1,CAMSAP2,CAR10,CAR14,CAR2,CAR8,CASP7,CAV1,CCDC136,CCK,CD59A,CD63,CD81,CD9,CDH2,CDKN1B,CELF4,CHGB,CLTB,CLU,CNGA1,CNGB1,CNN3,CNTN4,CNTNAP2,COL23A1,COL9A1,COX4I2,CP,CPLX3,CROT,CRX,CRYAB,CRYM,CST3,CTSL,CYR61,DAPL1,DBI,DDR1,DKK3,DNAJA1,DUSP1,E130114P18RIK,EGR1,ENO1,ENPP2,ESPN,FAM171B,FAM19A3,FAM57B,FLT1,FOS,FRMD3,FXYD1,FXYD6,GABRA1,GABRR1,GAS1,GLS,GLUL,GM4792,GNAI2,GNAO1,GNAS,GNAT1,GNAT2,GNB1,GNB3,GNG13,GNGT1,GNGT2,GPM6A,GPM6B,GPR179,GPR37,GRIK1,GRM6,GSG1,GUCA1A,GUCA1B,HES1,HMGN1,HMGN3,HSP90AA1,HSP90AB1,HSPA12A,HTRA1,ID1,ID2,ID3,IER2,IFT20,IL33,ISL1,ITM2B,ITM2C,JUN,JUND,KCNE2,KCNJ10,KCNMA1,KDR,LAPTM4A,LHX4,LIN7A,LRTM1,MACF1,MAP1B,MAP4,MAP6,MARCKS,MEG3,MFGE8,MGARP,MLC1,MPP4,MT1,NDN

In [188]:
csv = os.path.join(data_folder, 'big_clusters_cell_metadata.csv')
cell_metadata_big_clusters.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff   8.3K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/big_clusters_cell_metadata.csv
,cluster_24,cluster_25,cluster_26,cluster_27,cluster_33,cluster_34
r1_TTCCTGCTAGGC,1,0,0,0,0,0
r1_TGGAGATACTCT,1,0,0,0,0,0
r1_CGTCTACATCCG,1,0,0,0,0,0
r1_CAAGCTTGGCGC,1,0,0,0,0,0
r1_ACTCACATAGAG,1,0,0,0,0,0
r1_TAACGGACACGC,1,0,0,0,0,0
r1_CGCATGGGATAC,1,0,0,0,0,0
r1_TAACGACGCTTG,1,0,0,0,0,0
r1_TCGGCAGCCTCT,1,0,0,0,0,0


In [189]:
csv = os.path.join(data_folder, 'big_clusters_gene_metadata.csv')
gene_metadata_big_clusters.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff    10K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/big_clusters_gene_metadata.csv
gene_symbol,retina_24,retina_25,retina_26,retina_27,retina_33,retina_34
2010107E04RIK,False,False,True,False,False,False
4930447C04RIK,False,True,False,False,False,False
A930011O12RIK,False,False,False,False,False,True
ABCA8A,False,False,False,False,False,True
ABLIM1,False,False,True,False,False,False
ACSL3,False,False,False,False,False,True
AIPL1,True,False,True,False,False,True
ALDOC,False,False,False,False,False,True
ANK3,False,False,True,False,False,False


### Make `xarray` dataset

In [190]:
import xarray as xr

In [191]:
# Bundle the expression matrix and both metadata tables into one dataset.
# 'cells' and 'genes' are the shared dimensions; the coordinates carry the
# corresponding row and column labels of the tables.
ds_big_clusters = xr.Dataset(
    data_vars=dict(
        expression=(['cells', 'genes'], table1_big_clusters_subset_genes),
        cell_metadata=(['cells', 'cell_features'], cell_metadata_big_clusters),
        gene_metadata=(['genes', 'gene_features'], gene_metadata_big_clusters),
    ),
    coords=dict(
        cells=table1_big_clusters_subset_genes.index,
        genes=table1_big_clusters_subset_genes.columns,
        gene_features=gene_metadata_big_clusters.columns,
        cell_features=cell_metadata_big_clusters.columns,
    ),
)

### Write to netCDF

In [212]:
%%time
netcdf = os.path.join(data_folder, 'big_clusters.netcdf')
ds_big_clusters.to_netcdf(netcdf)
! ls -lh $netcdf

-rw-r--r--  1 olgabot  staff   320K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/big_clusters.netcdf
CPU times: user 8 ms, sys: 15.8 ms, total: 23.8 ms
Wall time: 132 ms


## Amacrine cells

This subset is for recreating Figure 5

In [193]:
# Cluster ids 'cluster_03' through 'cluster_23' (zero-padded to two digits)
amacrine_clusters = ['cluster_{:02d}'.format(number) for number in range(3, 24)]
amacrine_clusters

['cluster_03',
 'cluster_04',
 'cluster_05',
 'cluster_06',
 'cluster_07',
 'cluster_08',
 'cluster_09',
 'cluster_10',
 'cluster_11',
 'cluster_12',
 'cluster_13',
 'cluster_14',
 'cluster_15',
 'cluster_16',
 'cluster_17',
 'cluster_18',
 'cluster_19',
 'cluster_20',
 'cluster_21',
 'cluster_22',
 'cluster_23']

### Get amacrine cells from clusters

In [194]:
# Boolean flag per cell in `cluster_names`: True when the cell's cluster is one
# of the amacrine clusters
amacrine_cells = cluster_names.isin(amacrine_clusters)
print(amacrine_cells.sum())
amacrine_cells.head(5)

4426


cell
r1_GGCCGCAGTCCG    False
r1_CTTGTGCGGGAA    False
r1_GCGCAACTGCTC    False
r1_GATTGGGAGGCA    False
r1_GTGCCGCCTCTC    False
Name: cluster_id, dtype: bool

### Get amacrine cells in table 1

In [195]:
# Look up the amacrine flag for every cell in table1. `reindex` is used instead
# of `.loc` because some table1 barcodes are absent from the cluster
# assignments: `reindex` gives those cells NaN, whereas `.loc` with missing
# labels raises a KeyError in current pandas.
amacrine_cells_table1 = amacrine_cells.reindex(table1.index)
print(amacrine_cells_table1.shape)
print(amacrine_cells_table1.sum())
amacrine_cells_table1.head()

(6600,)
714


r1_GGCCGCAGTCCG    False
r1_CTTGTGCGGGAA    False
r1_GCGCAACTGCTC    False
r1_GATTGGGAGGCA    False
r1_CCTCCTAGTTGG      NaN
Name: cluster_id, dtype: object

In [196]:
# Number of table1 cells that have no amacrine flag (missing values)
pd.isnull(amacrine_cells_table1).sum()

580

In [197]:
# Cells with a missing flag are treated as non-amacrine. The explicit cast
# guarantees a real boolean dtype, so the series can be used as a row mask
# without relying on pandas implicitly downcasting the object-dtype result.
amacrine_cells_table1 = amacrine_cells_table1.fillna(False).astype(bool)

In [198]:
# Keep only the rows of table1 flagged as amacrine cells
table1_amacrine_cells = table1.loc[amacrine_cells_table1, :]
print(table1_amacrine_cells.shape)
table1_amacrine_cells.head()

(714, 20426)


symbol,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,0610010F05RIK,...,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
r1_GGGTGTCAGTGG,0,0,0,0,1,2,0,0,0,0,...,0,0,0,6,0,0,1,0,0,0
r1_GTTTATATGCGC,0,0,1,0,2,0,0,0,0,0,...,0,0,0,8,0,0,1,0,1,2
r1_TCTTCACTGGCT,1,0,2,0,0,2,0,2,0,0,...,3,0,0,11,0,0,4,2,1,2
r1_TCATTTAGTCGA,0,0,0,0,2,2,0,2,1,2,...,0,0,0,6,0,1,6,0,1,2
r1_GTCTATTCGGTT,0,0,2,0,0,2,0,0,0,2,...,0,0,0,1,0,0,4,0,0,0


### Subset cell metadata too

In [199]:
# Membership rows for the amacrine cells, keeping only the cluster columns
# that contain at least one of those cells
cell_metadata_amacrine = cluster_bools.loc[table1_amacrine_cells.index].loc[
    :, lambda cells: cells.any()]
print(cell_metadata_amacrine.shape)
cell_metadata_amacrine.head()

(714, 21)


Unnamed: 0,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,cluster_11,cluster_12,...,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23
r1_GGGTGTCAGTGG,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTTTATATGCGC,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TCTTCACTGGCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_TCATTTAGTCGA,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTCTATTCGGTT,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Get amacrine cell genes

In [200]:
# Same ids as `amacrine_clusters`, relabeled to the 'retina_NN' column names
# used by the gene metadata
amacrine_retina = [cluster_id.replace('cluster', 'retina')
                   for cluster_id in amacrine_clusters]
amacrine_retina[:5]

['retina_03', 'retina_04', 'retina_05', 'retina_06', 'retina_07']

In [201]:
# Amacrine columns of the gene metadata, keeping only the genes flagged True
# for at least one amacrine cluster
gene_metadata_amacrine = gene_metadata[amacrine_retina].loc[
    lambda genes: genes.any(axis=1)]
print(gene_metadata_amacrine.shape)
gene_metadata_amacrine.head()

(614, 21)


Unnamed: 0_level_0,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,retina_11,retina_12,...,retina_14,retina_15,retina_16,retina_17,retina_18,retina_19,retina_20,retina_21,retina_22,retina_23
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1700025G04RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2610017I09RIK,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
2900011O08RIK,False,False,True,True,True,True,False,True,True,True,...,False,True,False,False,False,False,False,False,True,False
4833424O15RIK,False,False,True,False,False,True,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
4930447C04RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [202]:
# Expression of the amacrine cells, restricted to the amacrine genes
amacrine_gene_symbols = gene_metadata_amacrine.index
table1_amacrine_cells_genes = table1_amacrine_cells[amacrine_gene_symbols]
print(table1_amacrine_cells_genes.shape)
table1_amacrine_cells_genes.head()

(714, 614)


symbol,1700025G04RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,4930447C04RIK,6330403K07RIK,6430548M08RIK,8430419L09RIK,A030009H04RIK,A830010M20RIK,...,YWHAG,YWHAH,ZCCHC12,ZCCHC18,ZEB2,ZFHX3,ZFP804A,ZMAT4,ZWINT,ZYX
r1_GGGTGTCAGTGG,1,0,23,4,3,7,10,4,3,4,...,7,4,0,5,2,7,0,3,6,0
r1_GTTTATATGCGC,5,0,9,3,1,4,4,6,9,4,...,3,5,1,8,11,4,2,3,8,0
r1_TCTTCACTGGCT,5,0,6,0,4,5,7,0,3,9,...,7,5,1,4,3,0,3,3,11,2
r1_TCATTTAGTCGA,2,1,14,16,0,8,0,0,3,0,...,11,4,1,5,13,0,0,0,6,0
r1_GTCTATTCGGTT,0,0,11,19,0,2,13,2,1,1,...,2,10,1,2,1,0,1,2,1,0


In [203]:
# Sanity check: number of repeated gene symbols among the columns
np.sum(table1_amacrine_cells_genes.columns.duplicated())

0

In [204]:
# Sanity check: total counts of any columns whose gene symbol is a repeat
table1_amacrine_cells_genes.loc[
    :, table1_amacrine_cells_genes.columns.duplicated()
].sum()

Series([], dtype: float64)

In [205]:
# Inspect the NAV1 counts across the amacrine cells
table1_amacrine_cells_genes.loc[:, 'NAV1']

r1_GGGTGTCAGTGG    3
r1_GTTTATATGCGC    4
r1_TCTTCACTGGCT    3
r1_TCATTTAGTCGA    5
r1_GTCTATTCGGTT    5
r1_GATCGGTACATG    8
r1_CGTCGGTCAGTT    1
r1_CTATATCCTGTG    0
r1_AACGTGTAAGCG    6
r1_TCGATCTGAGGT    3
r1_CGCATCTCGCTT    1
r1_GACGAGCTATCC    4
r1_CGATAGTCCTAT    2
r1_TTGACTCCTTTT    6
r1_CGTTGATCTCTG    3
r1_GGGCCCCAAATG    7
r1_GTGTAATTCTTC    5
r1_GGATTGAAACAG    2
r1_CGCGGTTTTCTT    3
r1_GCAAGTTGACAA    1
r1_TAGATCACGAAT    3
r1_AGCATCCCCGTT    2
r1_AGTAGTGGATTA    1
r1_CTGACGTGCACT    4
r1_TACGTCAGTTCG    1
r1_AGATAGGATTAT    3
r1_ATTACCCAGGAG    2
r1_GCATGTCCAAAC    3
r1_CCGAGTAAGGGT    1
r1_GATCCGAGATTG    1
                  ..
r1_CACAACCTCACC    0
r1_AACTAAATCATA    0
r1_ACGGTTATCCGC    0
r1_GTTGGATATTGG    0
r1_GGCCTCGGCCAA    0
r1_AAAAGAGAGGAC    0
r1_GTTGGTGATGAT    0
r1_AATTAATTCCCG    0
r1_CGTATCCGATGT    0
r1_ACTGACCTCTCG    0
r1_AAGCCTGGCGGG    0
r1_TGATGGTGAGCT    0
r1_GCATTATATTCA    0
r1_CCCCCGGGCAGA    0
r1_ACTGACCTCTCT    0
r1_ACCGCGGGCCAG    1
r1_TAGCCAGTAT

### Write to CSV

In [206]:
csv = os.path.join(data_folder, 'amacrine_expression.csv')
table1_amacrine_cells_genes.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff   875K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/amacrine_expression.csv
,1700025G04RIK,2610017I09RIK,2900011O08RIK,4833424O15RIK,4930447C04RIK,6330403K07RIK,6430548M08RIK,8430419L09RIK,A030009H04RIK,A830010M20RIK,A830036E02RIK,A930001A20RIK,ABAT,ACOT7,ADARB1,ADARB2,ADCY2,AGAP1,AI593442,AI848285,AIPL1,ALCAM,ALDOC,AMIGO2,ANK3,ANKS1B,APBA1,APC,APP,AQP6,ARHGAP20,ARHGAP24,ARHGEF9,ARL4C,ASAP1,ASPH,ATP1A1,ATP1B1,ATP2B1,ATP2B2,ATP2B4,ATP6V1D,ATP6V1E1,ATP6V1G2,ATPIF1,ATRNL1,AUTS2,B2M,BAI3,BASP1,BC048943,BEX1,BEX2,BHLHE22,BNIP3,BRINP1,C1QL1,C1QL2,CABP1,CACNA2D1,CACNA2D2,CACNB4,CACNG2,CACNG3,CACNG4,CADM3,CALB1,CALB2,CALM1,CALM2,CALM3,CALN1,CALY,CAMK2A,CAMK2N1,CAMK4,CAMKV,CAPZA2,CAR2,CAR3,CAR8,CARTPT,CBFA2T3,CBLN1,CBLN2,CBLN4,CCDC88B,CCK,CCSAP,CD200,CD302,CD47,CDC42EP4,CDC7,CDH22,CDH4,CDK14,CDK5R1,CELF4,CERS5,CFL1,CHAT,CHD3,CHD5,CHGA,CHGB,CHN1,CHN2,CLMN,CLMP,CMIP,CMTM8,CNGA1,CNKSR2,CNTN6,CNTNAP2,COL12A1,COL23A1,COL25A1,COL6A1,CPLX2,CPLX3,CPNE5,CPNE6,CR

In [207]:
csv = os.path.join(data_folder, 'amacrine_cell_metadata.csv')
cell_metadata_amacrine.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff    41K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/amacrine_cell_metadata.csv
,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23
r1_GGGTGTCAGTGG,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GTTTATATGCGC,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_TCTTCACTGGCT,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
r1_TCATTTAGTCGA,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GTCTATTCGGTT,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GATCGGTACATG,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CGTCGGTCAGTT,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CTATATCCTGTG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
r1_AACGTGTAAGCG,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [208]:
csv = os.path.join(data_folder, 'amacrine_gene_metadata.csv')
gene_metadata_amacrine.to_csv(csv)
! ls -lh $csv
! head $csv

-rw-r--r--  1 olgabot  staff    77K Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/amacrine_gene_metadata.csv
gene_symbol,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,retina_11,retina_12,retina_13,retina_14,retina_15,retina_16,retina_17,retina_18,retina_19,retina_20,retina_21,retina_22,retina_23
1700025G04RIK,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2610017I09RIK,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True
2900011O08RIK,False,False,True,True,True,True,False,True,True,True,True,False,True,False,False,False,False,False,False,True,False
4833424O15RIK,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False
4930447C04RIK,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False

### Make an xarray dataset too

In [209]:
# Bundle the amacrine expression matrix and both metadata tables into one
# dataset. The coordinates attach the cell barcodes, gene symbols and feature
# names to the dimensions (as is done for `ds_big_clusters`); without them the
# dimensions carry no labels and the saved file cannot be mapped back to
# specific cells or genes.
ds_amacrine = xr.Dataset(
    {'expression': (['cells', 'genes'], table1_amacrine_cells_genes),
     'cell_metadata': (['cells', 'cell_features', ], cell_metadata_amacrine),
     'gene_metadata': (['genes', 'gene_features', ], gene_metadata_amacrine),
    },
    coords={'cells': table1_amacrine_cells_genes.index,
            'genes': table1_amacrine_cells_genes.columns,
            'gene_features': gene_metadata_amacrine.columns,
            'cell_features': cell_metadata_amacrine.columns}
)

### Write to netcdf

In [211]:
%%time
netcdf = os.path.join(data_folder, 'amacrine.netcdf')
ds_amacrine.to_netcdf(netcdf)
! ls -lh $netcdf

-rw-r--r--  1 olgabot  staff   1.7M Jun 26 11:34 ../data/05_make_rentina_subsets_for_teaching/amacrine.netcdf
CPU times: user 7.19 ms, sys: 12.8 ms, total: 20 ms
Wall time: 129 ms
