# Clean Cluster Assignments

The purpose of this notebook is to clean the cluster assignments (cell metadata) of each cell (barcode). This data was downloaded from the McCarroll Lab's Drop-Seq [website](http://mccarrolllab.com/dropseq/).

In [8]:
import os
import common

# Assign notebook and folder names
notebook_name = '03_clean_cluster_assignments'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders (pure Python instead of shelling out to `mkdir -p`, so this
# also works on platforms without a POSIX shell and with paths containing
# spaces; `exist_ok=True` keeps the cell safe to re-run)
for folder in (figure_folder, data_folder):
    os.makedirs(folder, exist_ok=True)

Figure folder: ../figures/03_clean_cluster_assignments
Data folder: ../data/03_clean_cluster_assignments


In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Folder holding the original downloaded files that this notebook reads from.
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

In [11]:
# List the contents of the input folder to find the cluster-identity file.
ls $input_folder

GSM1544798_SpeciesMix_ThousandSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544798_SpeciesMix_ThousandSTAMPs_MOUSE.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz
GSM1626793_P14Retina_1.digital_expression.txt.gz
GSM1626794_P14Retina_2.digital_expression.txt.gz
GSM1626795_P14Retina_3.digital_expression.txt.gz
GSM1626796_P14Retina_4.digital_expression.txt.gz
GSM1626797_P14Retina_5.digital_expression.txt.gz
GSM1626798_P14Retina_6.digital_expression.txt.gz
GSM1626799_P14Retina_7.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_HUMAN.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_MOUSE.digital_expression.txt.gz
GSM1629193_ERCC.digital_expression.txt.gz
GSM1629193_hg19_ERCC.dict.txt.gz
GSM1629193_hg19_ERCC.refFlat.txt.gz
mmc1.pdf
mmc2.xlsx
mmc3.xlsx
mmc4.xlsx
mmc4_v2.xlsx
retina_clusteridentities.txt


In [17]:
# Build the path to the cluster-identity table inside the original-data folder.
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

filename = os.path.join(input_folder, 'retina_clusteridentities.txt')
# Display the resolved path as the cell output.
filename

'../data/00_original/retina_clusteridentities.txt'

In [46]:
# Read the headerless, tab-separated table of (cell barcode, cluster id); the
# first column becomes the index. The `squeeze=True` keyword of `read_table`
# was deprecated in pandas 1.4 and removed in 2.0, so the single remaining
# data column is squeezed into a Series explicitly. Squeezing only the column
# axis guarantees a Series even if the file had just one row.
cluster_ids = pd.read_table(filename, header=None, index_col=0).squeeze('columns')
cluster_ids.name = 'cluster_id'
cluster_ids.index.name = 'cell'
cluster_ids.head()

cell
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [49]:
def run_prefix(cell_label):
    """Return the part of a cell label that precedes its first underscore."""
    return cell_label.partition('_')[0]


# Distinct prefixes found across all cell labels, in order of first appearance.
prefixes = cluster_ids.index.map(run_prefix).unique()
prefixes

Index(['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'p1'], dtype='object', name='cell')

In [48]:
# Inspect the last rows of the assignments.
cluster_ids.tail()

cell
p1_TCAAAAGCCGGG    24
p1_ATTAAGTTCCAA    34
p1_CTGTCTGAGACC     2
p1_TAACGCGCTCCT    24
p1_ATTCTTGTTCTT    24
Name: cluster_id, dtype: int64

In [47]:
# Count index labels that repeat an earlier one; 0 means every cell label is unique.
cluster_ids.index.duplicated().sum()

0

We deliberately do **not** strip the prefixes (e.g. `r1_`, `p1_`) from the barcodes, because they indicate which run each cell came from.

In [50]:
# Intentionally disabled: stripping the prefix would discard the run that each
# barcode came from, so the index is left unchanged. Kept for reference only.
# (The original draft mapped over `cluster_names.index`, which is not defined
# at this point in the notebook; `cluster_ids.index` is what was meant.)
# cluster_ids.index = cluster_ids.index.map(lambda x: x.split('_')[1])
# cluster_ids.head()

In [51]:
# Save the integer cluster id of every cell to this notebook's data folder.
csv = os.path.join(data_folder, 'cluster_ids.csv')
# header=True writes the "cell,cluster_id" header row.
cluster_ids.to_csv(csv, header=True)
# Peek at the first lines of the written file as a sanity check.
! head $csv

cell,cluster_id
r1_GGCCGCAGTCCG,2
r1_CTTGTGCGGGAA,2
r1_GCGCAACTGCTC,2
r1_GATTGGGAGGCA,2
r1_GTGCCGCCTCTC,25
r1_CCTGTGACACAC,2
r1_AATCTCGTTAAT,2
r1_GATTTCCTCTGA,1
r1_GAAGGCTGGAAC,2


In [52]:
# Turn each integer id into a label of the form "cluster_NN", zero-padding
# single-digit ids to two characters (2 -> "cluster_02", 25 -> "cluster_25").
cluster_names = cluster_ids.map('cluster_{:02d}'.format)
cluster_names.head()

cell
r1_GGCCGCAGTCCG    cluster_02
r1_CTTGTGCGGGAA    cluster_02
r1_GCGCAACTGCTC    cluster_02
r1_GATTGGGAGGCA    cluster_02
r1_GTGCCGCCTCTC    cluster_25
Name: cluster_id, dtype: object

In [53]:
# Save the string cluster labels to this notebook's data folder.
csv = os.path.join(data_folder, 'cluster_names.csv')
# NOTE: the Series is still named 'cluster_id', so the value column in the
# written file is headed "cluster_id" even though it holds the string labels.
cluster_names.to_csv(csv, header=True)
# Peek at the first lines of the written file as a sanity check.
! head $csv

cell,cluster_id
r1_GGCCGCAGTCCG,cluster_02
r1_CTTGTGCGGGAA,cluster_02
r1_GCGCAACTGCTC,cluster_02
r1_GATTGGGAGGCA,cluster_02
r1_GTGCCGCCTCTC,cluster_25
r1_CCTGTGACACAC,cluster_02
r1_AATCTCGTTAAT,cluster_02
r1_GATTTCCTCTGA,cluster_01
r1_GAAGGCTGGAAC,cluster_02


## Convert to boolean matrix so it's compatible with xarray

In [54]:
# One indicator column per cluster label, one row per cell. The dtype is
# pinned because pandas 2.0 changed the `get_dummies` default from uint8 to
# bool, which would silently turn the 0/1 values saved to CSV into True/False.
cluster_bools = pd.get_dummies(cluster_names, dtype='uint8')
cluster_bools.head()

Unnamed: 0_level_0,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,...,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
r1_GGCCGCAGTCCG,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_CTTGTGCGGGAA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GCGCAACTGCTC,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GATTGGGAGGCA,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
r1_GTGCCGCCTCTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Save the cell-by-cluster indicator matrix to this notebook's data folder.
csv = os.path.join(data_folder, 'cluster_bools.csv')
cluster_bools.to_csv(csv, header=True)
# Peek at the first lines of the written file as a sanity check.
! head $csv

cell,cluster_01,cluster_02,cluster_03,cluster_04,cluster_05,cluster_06,cluster_07,cluster_08,cluster_09,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17,cluster_18,cluster_19,cluster_20,cluster_21,cluster_22,cluster_23,cluster_24,cluster_25,cluster_26,cluster_27,cluster_28,cluster_29,cluster_30,cluster_31,cluster_32,cluster_33,cluster_34,cluster_35,cluster_36,cluster_37,cluster_38,cluster_39
r1_GGCCGCAGTCCG,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CTTGTGCGGGAA,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GCGCAACTGCTC,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GATTGGGAGGCA,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_GTGCCGCCTCTC,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
r1_CCTGTGACACAC,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,