# Combine and Clean Retina Data

The purpose of this notebook is to combine all the digital gene expression data for the retina cells, downloaded from the Gene Expression Omnibus using the accession number [GSE63473](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63473).

In [1]:
import os
import common

# Assign notebook and folder names
notebook_name = '05_combine_retina_data'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders (including any missing parents). Done in pure Python rather
# than by shelling out to `mkdir -p`, so it also works on platforms without
# that command and with paths that contain spaces; exist_ok=True makes the
# cell safe to re-run.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)

Figure folder: ../figures/01_data_cleaning
Data folder: ../data/01_data_cleaning


In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
# Folder holding the original downloaded files
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

# Path of the first retina replicate, displayed to check the path layout
filename = os.path.join(
    input_folder,
    'GSM1626793_P14Retina_1.digital_expression.txt.gz',
)
filename

'../data/00_original/GSM1626793_P14Retina_1.digital_expression.txt.gz'

In [4]:
ls $input_folder

GSM1544798_SpeciesMix_ThousandSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544798_SpeciesMix_ThousandSTAMPs_MOUSE.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz
GSM1626793_P14Retina_1.digital_expression.txt.gz
GSM1626794_P14Retina_2.digital_expression.txt.gz
GSM1626795_P14Retina_3.digital_expression.txt.gz
GSM1626796_P14Retina_4.digital_expression.txt.gz
GSM1626797_P14Retina_5.digital_expression.txt.gz
GSM1626798_P14Retina_6.digital_expression.txt.gz
GSM1626799_P14Retina_7.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_HUMAN.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_MOUSE.digital_expression.txt.gz
GSM1629193_ERCC.digital_expression.txt.gz
GSM1629193_hg19_ERCC.dict.txt.gz
GSM1629193_hg19_ERCC.refFlat.txt.gz
mmc1.pdf
mmc2.xlsx
mmc3.xlsx
mmc4.xlsx
mmc4_v2.xlsx
retina_clusteridentities.txt


In [14]:
%%time

tables = []
retina_numbers = zip(range(3, 10), range(1, 8))

template = os.path.join(input_folder, 'GSM162679{}_P14Retina_{}.digital_expression.txt.gz')

cell_metadata_dfs = []

for gsm_i, group_i in retina_numbers:
    print(f"--- gsm_i: {gsm_i}, group_i: {group_i} ---")
    filename = template.format(gsm_i, group_i)
    print('\t', filename)
    %time table = pd.read_table(filename, compression='gzip', index_col=0)
    
    # Transpose so genes are columns and cells are rows, creating a
    # (samples, features) matrix
    table = table.T
    tables.append(table)
    
    df = pd.DataFrame(index=table.index)
    df['batch'] = group_i
    cell_metadata_dfs.append(df)
    
expression = pd.concat(tables)
print('expression.shape', expression.shape)

cell_metadata = pd.concat(cell_metadata_dfs)
print('cell_metadata.shape', cell_metadata.shape)

--- gsm_i: 3, group_i: 1 ---
	 ../data/00_original/GSM1626793_P14Retina_1.digital_expression.txt.gz
CPU times: user 20.3 s, sys: 2.07 s, total: 22.4 s
Wall time: 22.8 s
--- gsm_i: 4, group_i: 2 ---
	 ../data/00_original/GSM1626794_P14Retina_2.digital_expression.txt.gz
CPU times: user 38.8 s, sys: 2.67 s, total: 41.5 s
Wall time: 41.7 s
--- gsm_i: 5, group_i: 3 ---
	 ../data/00_original/GSM1626795_P14Retina_3.digital_expression.txt.gz
CPU times: user 17.8 s, sys: 1.84 s, total: 19.7 s
Wall time: 19.9 s
--- gsm_i: 6, group_i: 4 ---
	 ../data/00_original/GSM1626796_P14Retina_4.digital_expression.txt.gz
CPU times: user 22.7 s, sys: 2.35 s, total: 25 s
Wall time: 25.3 s
--- gsm_i: 7, group_i: 5 ---
	 ../data/00_original/GSM1626797_P14Retina_5.digital_expression.txt.gz
CPU times: user 22.1 s, sys: 2.09 s, total: 24.2 s
Wall time: 24.6 s
--- gsm_i: 8, group_i: 6 ---
	 ../data/00_original/GSM1626798_P14Retina_6.digital_expression.txt.gz
CPU times: user 37.9 s, sys: 2.85 s, total: 40.7 s
Wall t

In [10]:
# Preview the first rows of the combined expression table
expression.head()

Unnamed: 0,10:100015630-100100413:Kitl,10:100306571-100307185:Gm9476,10:100443902-100487350:Tmtc3,10:100488289-100573655:Cep290,10:100572274-100589259:4930430F08Rik,10:100592386-100618391:1700017N19Rik,10:10100953-10101116:Gm25682,10:101681487-102391469:Mgat4c,10:102481760-102490418:Nts,10:102512222-102546560:Rassf9,...,Y:1096861-1245759:Uty,Y:1260715-1286613:Ddx3y,Y:1298957-1459782:Usp9y,Y:45385648-45386331:Gm21779,Y:52674915-52675541:Gm21840,Y:78835860-78836543:Gm20806,Y:86365740-86366423:Gm20861,Y:897788-943811:Kdm5d,Y:90755657-90763485:Gm21857,Y:991630-991748:n-R5s1
GGCCGCAGTCCG,0,,3,1,2,0,,0,,0,...,0,0,,,,,,0,,0.0
CTTGTGCGGGAA,0,,0,3,1,0,,0,,0,...,1,5,,,,,,0,,0.0
GCGCAACTGCTC,1,,0,0,2,0,,4,,0,...,0,0,,,,,,0,,0.0
GATTGGGAGGCA,0,,0,2,0,0,,1,,0,...,1,0,,,,,,0,,0.0
CCTCCTAGTTGG,0,,2,1,1,0,,2,,0,...,0,0,,,,,,0,,0.0


In [15]:
# Preview the first rows of the per-cell batch labels
cell_metadata.head()

Unnamed: 0,batch
GGCCGCAGTCCG,1
CTTGTGCGGGAA,1
GCGCAACTGCTC,1
GATTGGGAGGCA,1
CCTCCTAGTTGG,1


In [17]:
# Count the number of cells in each batch
cell_metadata.groupby('batch').size()

batch
1    6600
2    9000
3    6120
4    7650
5    7650
6    8280
7    4000
dtype: int64


## Clean expression matrix to be compatible with the cluster labels and identities

Currently, cells are labeled by their barcode, e.g. `GCGCAACTGCTC`, and genes are labeled by `chrom:start-end:symbol`, e.g. `6:51460434-51469894:Hnrnpa2b1`. In the supplementary data, however, the gene symbols are all uppercase, e.g. `HNRNPA2B1` (which is incorrect, since this is mouse data), and the barcodes have `r1_` prepended to the id, e.g. `r1_GCGCAACTGCTC`.

So we need to relabel the expression matrix so that it is compatible with the labels used in the supplementary data.

In [21]:
# Keep only the text after the last ':' of each column label and uppercase it,
# then use the result as the new column labels
gene_symbols = pd.Index(
    [label.split(':')[-1].upper() for label in expression.columns],
    name='symbol',
)
expression.columns = gene_symbols
expression.head()

symbol,KITL,GM9476,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,GM25682,MGAT4C,NTS,RASSF9,...,UTY,DDX3Y,USP9Y,GM21779,GM21840,GM20806,GM20861,KDM5D,GM21857,N-R5S1
GGCCGCAGTCCG,0,,3,1,2,0,,0,,0,...,0,0,,,,,,0,,0.0
CTTGTGCGGGAA,0,,0,3,1,0,,0,,0,...,1,5,,,,,,0,,0.0
GCGCAACTGCTC,1,,0,0,2,0,,4,,0,...,0,0,,,,,,0,,0.0
GATTGGGAGGCA,0,,0,2,0,0,,1,,0,...,1,0,,,,,,0,,0.0
CCTCCTAGTTGG,0,,2,1,1,0,,2,,0,...,0,0,,,,,,0,,0.0


In [12]:
%%time
# Write the combined expression table to the notebook's data folder as CSV.
# The recorded run of this cell took roughly 2.5 hours of wall time.
csv = os.path.join(data_folder, 'retina_expression.csv')
expression.to_csv(csv)

CPU times: user 59min 38s, sys: 1min 53s, total: 1h 1min 31s
Wall time: 2h 31min 41s


```
CPU times: user 59min 38s, sys: 1min 53s, total: 1h 1min 31s
Wall time: 2h 31min 41s
```

## Writing the CSV file took 2h 31m -- wow!

In [13]:
ls -lh $csv

-rw-r--r--  1 olgabot  staff   2.4G Jun 23 16:17 ../data/01_data_cleaning/retina_expression.csv


In [18]:
import xarray as xr

In [22]:
# Show the first ten column labels of the expression table
expression.columns[:10]

Index(['KITL', 'GM9476', 'TMTC3', 'CEP290', '4930430F08RIK', '1700017N19RIK',
       'GM25682', 'MGAT4C', 'NTS', 'RASSF9'],
      dtype='object', name='symbol')

In [23]:
# Wrap the expression table in a labelled array whose two dimensions are
# named 'cells' and 'genes'
dataset = xr.DataArray(data=expression, dims=['cells', 'genes'])
dataset

<xarray.DataArray (cells: 49300, genes: 24760)>
array([[  0.,  nan,   3., ...,   0.,  nan,   0.],
       [  0.,  nan,   0., ...,   0.,  nan,   0.],
       [  1.,  nan,   0., ...,   0.,  nan,   0.],
       ..., 
       [  0.,  nan,   0., ...,   0.,  nan,  nan],
       [  0.,  nan,   0., ...,   0.,  nan,  nan],
       [  0.,  nan,   0., ...,   0.,  nan,  nan]])
Coordinates:
  * cells    (cells) object 'GGCCGCAGTCCG' 'CTTGTGCGGGAA' 'GCGCAACTGCTC' ...
  * genes    (genes) object 'KITL' 'GM9476' 'TMTC3' 'CEP290' '4930430F08RIK' ...

In [24]:
# Read the cluster ids written under the '03_clean_cluster_assignments' data
# folder, using the first column as the index.
csv = os.path.join(common.DATA_FOLDER, '03_clean_cluster_assignments', 'cluster_ids.csv')
# `read_csv(..., squeeze=True)` is deprecated and removed in pandas 2.0;
# squeezing the column axis afterwards gives the same Series when the file
# has a single data column.
cluster_ids = pd.read_csv(csv, index_col=0).squeeze('columns')
cluster_ids.head()

cell
GGCCGCAGTCCG     2
CTTGTGCGGGAA     2
GCGCAACTGCTC     2
GATTGGGAGGCA     2
GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [25]:
# Show the per-cell batch labels again before joining them with the cluster ids
cell_metadata.head()

Unnamed: 0,batch
GGCCGCAGTCCG,1
CTTGTGCGGGAA,1
GCGCAACTGCTC,1
GATTGGGAGGCA,1
CCTCCTAGTTGG,1


In [27]:
# Column dtypes of the per-cell metadata
cell_metadata.dtypes

batch    int64
dtype: object

In [26]:
# Attach the cluster id to each cell by matching on the index; the left join
# keeps every row of cell_metadata, with NaN where no cluster id matches
cell_metadata_clusters = cell_metadata.join(cluster_ids, how='left')
cell_metadata_clusters.head()

Unnamed: 0,batch,cluster_id
AAAAAAAAAAAA,3,34.0
AAAAAAACAGTC,7,34.0
AAAAAAATGGTA,7,24.0
AAAAAACCAGCA,5,24.0
AAAAAAGATGAA,5,24.0


In [32]:
# Can't convert cluster_id to int because some values are NA, so the column
# is left as float. The cast below is intentionally kept disabled.
# cell_metadata_clusters['cluster_id'] = cell_metadata_clusters['cluster_id'].astype(int)

In [29]:
# Column dtypes of the joined metadata
cell_metadata_clusters.dtypes

batch           int64
cluster_id    float64
dtype: object

In [33]:
# Store the joined per-cell metadata table on the DataArray under the
# 'cell_metadata' attribute key.
# NOTE(review): attrs holding a DataFrame may not survive serialization of the
# DataArray (e.g. to netCDF) -- confirm before saving `dataset`.
dataset.attrs['cell_metadata'] = cell_metadata_clusters