# Clean Cluster Assignments

The purpose of this notebook is to clean the cluster assignments (cell metadata) of each cell (barcode). This data was downloaded from the McCarroll Lab's Drop-Seq [website](http://mccarrolllab.com/dropseq/).

In [1]:
import os
import common

# Name of this notebook; it is also used as the name of its output sub-folders
notebook_name = '03_make_cell_metadata'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders (and any missing parents). Pure Python instead of
# `! mkdir -p $folder`, so paths containing spaces or shell metacharacters
# are handled correctly and the cell does not depend on a POSIX shell.
for folder in (figure_folder, data_folder):
    os.makedirs(folder, exist_ok=True)

Figure folder: ../figures/03_make_cell_metadata
Data folder: ../data/03_make_cell_metadata


In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
# Source files are read from the '00_original' sub-folder of the shared data folder
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

In [4]:
ls $input_folder

GSM1544798_SpeciesMix_ThousandSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544798_SpeciesMix_ThousandSTAMPs_MOUSE.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz
GSM1626793_P14Retina_1.digital_expression.txt.gz
GSM1626794_P14Retina_2.digital_expression.txt.gz
GSM1626795_P14Retina_3.digital_expression.txt.gz
GSM1626796_P14Retina_4.digital_expression.txt.gz
GSM1626797_P14Retina_5.digital_expression.txt.gz
GSM1626798_P14Retina_6.digital_expression.txt.gz
GSM1626799_P14Retina_7.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_HUMAN.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_MOUSE.digital_expression.txt.gz
GSM1629193_ERCC.digital_expression.txt.gz
GSM1629193_hg19_ERCC.dict.txt.gz
GSM1629193_hg19_ERCC.refFlat.txt.gz
mmc1.pdf
mmc2.xlsx
mmc3.xlsx
mmc4.xlsx
mmc4_v2.xlsx
retina_clusteridentities.txt
~$mmc4_v2.xlsx


In [5]:
# `input_folder` is already defined in an earlier cell; only build the path
# to the cluster identities file here.
filename = os.path.join(input_folder, 'retina_clusteridentities.txt')
filename

'../data/00_original/retina_clusteridentities.txt'

In [6]:
# Header-less table whose first column (cell barcode) becomes the index.
# The `squeeze=True` reader argument was deprecated in pandas 1.4 and removed
# in 2.0, so collapse the single remaining column into a Series explicitly.
cluster_n = pd.read_table(filename, header=None, index_col=0).squeeze('columns')
cluster_n.name = 'cluster_n'
cluster_n.index.name = 'cell'
cluster_n.head()

cell
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_n, dtype: int64

In [7]:
# Unique barcode prefixes: the text before the first '_' of every cell id
barcode_parts = cluster_n.index.str.split('_')
prefixes = barcode_parts.str[0].unique()
prefixes

Index(['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'p1'], dtype='object', name='cell')

In [8]:
# Inspect the last rows of the assignments
cluster_n.tail()

cell
p1_TCAAAAGCCGGG    24
p1_ATTAAGTTCCAA    34
p1_CTGTCTGAGACC     2
p1_TAACGCGCTCCT    24
p1_ATTCTTGTTCTT    24
Name: cluster_n, dtype: int64

In [9]:
# Count index entries (cell barcodes) that repeat an earlier entry
cluster_n.index.duplicated().sum()

0

The barcode prefixes are deliberately kept (not stripped), because they indicate the run each cell came from.

In [10]:
# Intentionally disabled: this would strip the prefix (text before '_') from
# each barcode, and the prefixes are being kept.
# NOTE(review): `cluster_names` is not defined anywhere in the visible
# notebook -- presumably `cluster_n` was meant; confirm before re-enabling.
# cluster_ids.index = cluster_names.index.map(lambda x: x.split('_')[1])
# cluster_ids.head()

In [12]:
csv = os.path.join(data_folder, 'cluster_n.csv')
cluster_n.to_csv(csv, header=True)
! head $csv

cell,cluster_n
r1_GGCCGCAGTCCG,2
r1_CTTGTGCGGGAA,2
r1_GCGCAACTGCTC,2
r1_GATTGGGAGGCA,2
r1_GTGCCGCCTCTC,25
r1_CCTGTGACACAC,2
r1_AATCTCGTTAAT,2
r1_GATTTCCTCTGA,1
r1_GAAGGCTGGAAC,2


In [16]:
# Turn each integer cluster number into a zero-padded label, e.g. 2 -> 'cluster_02'
cluster_ids = cluster_n.map('cluster_{:02d}'.format).rename('cluster_id')
cluster_ids.head()

cell
r1_GGCCGCAGTCCG    cluster_02
r1_CTTGTGCGGGAA    cluster_02
r1_GCGCAACTGCTC    cluster_02
r1_GATTGGGAGGCA    cluster_02
r1_GTGCCGCCTCTC    cluster_25
Name: cluster_id, dtype: object

### Add biological group of cluster

In [17]:
# Mapping from cluster id to cell type, written by the
# '02_make_celltype_metadata' step.
celltype_folder = os.path.join(common.DATA_FOLDER,
                               '02_make_celltype_metadata')
csv = os.path.join(celltype_folder, 'cluster_ids_to_celltypes.csv')
# `squeeze=True` was deprecated in pandas 1.4 and removed in 2.0; squeeze the
# single data column into a Series explicitly instead.
celltypes = pd.read_csv(csv, index_col=0).squeeze('columns')
celltypes.head()

cluster_id
cluster_01          Horizontal cells
cluster_02    Retinal ganglion cells
cluster_03            Amacrine cells
cluster_04            Amacrine cells
cluster_05            Amacrine cells
Name: celltype, dtype: object

In [19]:
# One row per cell: its cluster id plus the cell type looked up via that id
cluster_ids_df = cluster_ids.to_frame().join(celltypes, on='cluster_id')
print(cluster_ids_df.shape)
cluster_ids_df.head()

(44808, 2)


Unnamed: 0_level_0,cluster_id,celltype
cell,Unnamed: 1_level_1,Unnamed: 2_level_1
r1_GGCCGCAGTCCG,cluster_02,Retinal ganglion cells
r1_CTTGTGCGGGAA,cluster_02,Retinal ganglion cells
r1_GCGCAACTGCTC,cluster_02,Retinal ganglion cells
r1_GATTGGGAGGCA,cluster_02,Retinal ganglion cells
r1_GTGCCGCCTCTC,cluster_25,Cones


### Add integer id to clusters

In [20]:
# Attach the integer cluster number, aligned on the cell index.
# `assign` (rather than `join`) keeps this cell re-runnable: `join` raises on
# the already-present `cluster_n` column if the cell is executed twice.
cluster_ids_df = cluster_ids_df.assign(cluster_n=cluster_n)
print(cluster_ids_df.shape)
cluster_ids_df.head()

(44808, 3)


Unnamed: 0_level_0,cluster_id,celltype,cluster_n
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
r1_GGCCGCAGTCCG,cluster_02,Retinal ganglion cells,2
r1_CTTGTGCGGGAA,cluster_02,Retinal ganglion cells,2
r1_GCGCAACTGCTC,cluster_02,Retinal ganglion cells,2
r1_GATTGGGAGGCA,cluster_02,Retinal ganglion cells,2
r1_GTGCCGCCTCTC,cluster_25,Cones,25


### Add combined cell type and cluster id label

In [26]:
# Human-readable label of the form '<celltype> (<cluster_id>)'.
# Built with vectorized string concatenation instead of a row-wise
# `apply(axis=1)`; a missing celltype now yields a missing label rather
# than the literal text 'nan (cluster_XX)'.
cluster_ids_df['cluster_celltype_with_id'] = (
    cluster_ids_df['celltype'] + ' (' + cluster_ids_df['cluster_id'] + ')')
print(cluster_ids_df.shape)
cluster_ids_df.head()

(44808, 4)


Unnamed: 0_level_0,cluster_id,celltype,cluster_n,cluster_celltype_with_id
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
r1_GGCCGCAGTCCG,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_CTTGTGCGGGAA,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GCGCAACTGCTC,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GATTGGGAGGCA,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GTGCCGCCTCTC,cluster_25,Cones,25,Cones (cluster_25)


### Write to CSV

In [28]:
csv = os.path.join(data_folder, 'cell_metadata.csv')
cluster_ids_df.to_csv(csv, header=True)
! head $csv

cell,cluster_id,celltype,cluster_n,cluster_celltype_with_id
r1_GGCCGCAGTCCG,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_CTTGTGCGGGAA,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GCGCAACTGCTC,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GATTGGGAGGCA,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GTGCCGCCTCTC,cluster_25,Cones,25,Cones (cluster_25)
r1_CCTGTGACACAC,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_AATCTCGTTAAT,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)
r1_GATTTCCTCTGA,cluster_01,Horizontal cells,1,Horizontal cells (cluster_01)
r1_GAAGGCTGGAAC,cluster_02,Retinal ganglion cells,2,Retinal ganglion cells (cluster_02)


## Convert to boolean matrix so it's compatible with xarray

In [29]:
# One indicator column per distinct cluster id, one row per cell
# NOTE(review): the dtype of the indicator columns is left to the pandas
# default, which differs between pandas versions -- confirm what downstream
# readers of the CSV expect.
cluster_bools = pd.get_dummies(cluster_ids)
cluster_bools.head()

Unnamed: 0_level_0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
r1_GGCCGCAGTCCG,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CTTGTGCGGGAA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GCGCAACTGCTC,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GATTGGGAGGCA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTGCCGCCTCTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
csv = os.path.join(data_folder, 'cluster_bools.csv')
cluster_bools.to_csv(csv, header=True)
! head $csv

cell,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
r1_GGCCGCAGTCCG,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CTTGTGCGGGAA,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GCGCAACTGCTC,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GATTGGGAGGCA,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GTGCCGCCTCTC,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CCTGTGACACAC,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,