# Combine and Clean Retina Data

The purpose of this notebook is to combine all the digital gene expression data for the retina cells, downloaded from the Gene Expression Omnibus using the accession number [GSE63473](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63473).

In [2]:
import os
import common

# Assign notebook and folder names
notebook_name = '06_combine_retina_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders
! mkdir -p $figure_folder
! mkdir -p $data_folder

Figure folder: ../figures/05_combine_retina_data
Data folder: ../data/05_combine_retina_data


In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [4]:
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

filename = os.path.join(input_folder, 'GSM1626793_P14Retina_1.digital_expression.txt.gz')
filename

'../data/00_original/GSM1626793_P14Retina_1.digital_expression.txt.gz'

In [None]:
ls $input_folder

GSM1544798_SpeciesMix_ThousandSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544798_SpeciesMix_ThousandSTAMPs_MOUSE.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz
GSM1626793_P14Retina_1.digital_expression.txt.gz
GSM1626794_P14Retina_2.digital_expression.txt.gz
GSM1626795_P14Retina_3.digital_expression.txt.gz
GSM1626796_P14Retina_4.digital_expression.txt.gz
GSM1626797_P14Retina_5.digital_expression.txt.gz
GSM1626798_P14Retina_6.digital_expression.txt.gz
GSM1626799_P14Retina_7.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_HUMAN.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_MOUSE.digital_expression.txt.gz
GSM1629193_ERCC.digital_expression.txt.gz
GSM1629193_hg19_ERCC.dict.txt.gz
GSM1629193_hg19_ERCC.refFlat.txt.gz
mmc1.pdf
mmc2.xlsx
mmc3.xlsx
mmc4.xlsx
mmc4_v2.xlsx
retina_clusteridentities.txt
~$mmc2.xlsx


In [None]:
%%time

tables = []
retina_numbers = zip(range(3, 10), range(1, 8))

template = os.path.join(input_folder, 'GSM162679{}_P14Retina_{}.digital_expression.txt.gz')

cell_metadata_dfs = []

for gsm_i, group_i in retina_numbers:
    print(f"--- gsm_i: {gsm_i}, group_i: {group_i} ---")
    filename = template.format(gsm_i, group_i)
    print('\t', filename)
    %time table = pd.read_table(filename, compression='gzip', index_col=0)
    
    # Transpose so genes are columns and cells are rows, creating a
    # (samples, features) matrix
    table = table.T
    tables.append(table)
    
    df = pd.DataFrame(index=table.index)
    df['batch'] = group_i
    cell_metadata_dfs.append(df)
    
expression = pd.concat(tables)
print('expression.shape', expression.shape)

cell_metadata = pd.concat(cell_metadata_dfs)
print('cell_metadata.shape', cell_metadata.shape)

--- gsm_i: 3, group_i: 1 ---
	 ../data/00_original/GSM1626793_P14Retina_1.digital_expression.txt.gz
CPU times: user 22.4 s, sys: 2.18 s, total: 24.6 s
Wall time: 24.7 s
--- gsm_i: 4, group_i: 2 ---
	 ../data/00_original/GSM1626794_P14Retina_2.digital_expression.txt.gz


In [None]:
expression.head()

In [None]:
cell_metadata.head()

In [None]:
cell_metadata.groupby('batch').size()


## Clean expression matrix to be compatible with the cluster labels and identities

Currently, cells are labeled by their barcode, e.g. `GCGCAACTGCTC`, and genes are labeled by their chrom:start-end:symbol, e.g. `6:51460434-51469894:Hnrnpa2b1`. But, in the supplementary data, the genes are all uppercase, e.g. `HNRNPA2B1` (which is incorrect since this is mouse data.. ) and the barcodes have `r1_` prepended before the id, e.g. `r1_GCGCAACTGCTC`.

So we need to clean the data to be compatible with this

In [None]:
gene_symbols = expression.columns.map(lambda x: x.split(':')[-1].upper())
gene_symbols.name = 'symbol'
expression.columns = gene_symbols
expression.head()

In [None]:
# %%time
# csv = os.path.join(data_folder, 'retina_expression.csv')
# expression.to_csv(csv)

```
CPU times: user 59min 38s, sys: 1min 53s, total: 1h 1min 31s
Wall time: 2h 31min 41s
```

## 2h 31m to write a csv file -- Woww!!

In [None]:
import xarray as xr 

In [None]:
expression.shape

In [None]:
expression.columns[:10]

## Add retinal cell cluster metadata

In [None]:
csv = os.path.join(common.DATA_FOLDER, '03_clean_cluster_assignments', 
                   'cluster_bools.csv')
cluster_bools = pd.read_csv(csv, index_col=0, squeeze=True)
cluster_bools.head()

In [None]:
csv = os.path.join(common.DATA_FOLDER, '03_clean_cluster_assignments', 
                   'cluster_names.csv')
cluster_names = pd.read_csv(csv, index_col=0, squeeze=True)
cluster_names.head()

In [None]:
cell_metadata.head()

In [None]:
cell_metadata.dtypes

In [None]:
print(cell_metadata.shape)
cell_metadata_clusters = cell_metadata.join(cluster_names, how='inner')
print(cell_metadata_clusters.shape)
cell_metadata_clusters.head()

In [None]:
# Can't convert to int because some are NA
# cell_metadata_clusters['cluster_id'] = cell_metadata_clusters['cluster_id'].astype(int)

In [None]:
cell_metadata_clusters.dtypes

## Add gene metadata

In [None]:
csv = os.path.join(common.DATA_FOLDER, 
                   '04_extract_data_from_supplementary_excel_files', 
                   'mouse_gene_metadata.csv')
gene_metadata = pd.read_csv(csv, index_col=0)
gene_metadata.head()

## Make subsets for teaching

### Dropout demo - Equal sized clusters

only use cells from the first batch, and only use differentially expressed genes from the biggest clusters

In [None]:
cluster_sizes_table0 = tables[0].groupby(cluster_names['cluster_id'], axis=0).size()
cluster_sizes_table0

In [None]:
big_clusters = cluster_sizes_table0[cluster_sizes_table0 > 100]
big_clusters

## Make superset of all genes for posterity

### Need to align the cell metadata with expression so xarray doesn't get mad

In [23]:
print(cell_metadata_clusters.shape)

cells_left, expression_right = cell_metadata_clusters.align(
    expression, axis=0, join='inner')
print(cells_left.shape, expression_right.shape)

(44995, 40)
(45447, 40) (45447, 24760)


### Align the gene metadata too

In [24]:
genes_t = gene_metadata.T

In [34]:
genes_t_left, expression_genes_right = genes_t.align(expression_right, axis=1, join='right')
print(genes_t_left.shape, expression_genes_right.shape)
genes_t_left.head()

(44, 24760) (45447, 24760)


Unnamed: 0,0610005C13RIK,0610007N19RIK,0610007P14RIK,0610008F07RIK,0610009B14RIK,0610009B22RIK,0610009D07RIK,0610009E02RIK,0610009L18RIK,0610009O20RIK,...,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
retina_01,,,,,,,,,,,...,,,True,,,,,False,,
retina_02,,,,,,,,,,,...,,,True,,,,,False,,
retina_03,,,,,,,,,,,...,,,False,,,,,False,,
retina_04,,,,,,,,,,,...,,,False,,,,,False,,
retina_05,,,,,,,,,,,...,,,False,,,,,False,,


In [35]:
genes_left = genes_t_left.T

In [36]:
cells_left.head()

Unnamed: 0,batch,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,...,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
AAAAAAAAAAAA,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
AAAAAAACAGTC,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
AAAAAAATGGTA,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAACCAGCA,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAAAAGATGAA,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1]:
ds = xr.Dataset(
    {'expression': (['cells', 'genes'], expression_genes_right),
     'cell_metadata': (['cells', 'cell_features', ], cells_left),
     'gene_metadata': (['genes', 'gene_features', ], genes_left),
    })

NameError: name 'xr' is not defined

In [38]:
range(2)

range(0, 2)

In [39]:
ds

<xarray.Dataset>
Dimensions:        (cell_features: 40, cells: 45447, gene_features: 44, genes: 24760)
Dimensions without coordinates: cell_features, cells, gene_features, genes
Data variables:
    expression     (cells, genes) float64 0.0 0.0 0.0 nan 0.0 1.0 0.0 0.0 ...
    cell_metadata  (cells, cell_features) int64 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
    gene_metadata  (genes, gene_features) object nan nan nan nan nan nan nan ...

In [None]:
%%time
netcdf = os.path.join(data_folder, 'retina_all_genes.netcdf')
ds.to_netcdf(netcdf)

In [None]:
ls -lha $netcdf

In [31]:
%%time
netcdf = os.path.join(data_folder, 'retina.netcdf')
ds.to_netcdf(netcdf)

CPU times: user 795 ms, sys: 1.13 s, total: 1.92 s
Wall time: 2.22 s


In [33]:
ls -lha $netcdf

-rw-r--r--  1 olgabot  staff   473M Jun 26 10:08 ../data/05_combine_retina_data/retina.netcdf
