# Clean Cluster Assignments

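This notebook cleans the cluster assignments (cell metadata) of each cell (barcode): it loads the published cluster identities, strips the leading `r1_`-style prefix from each barcode, and saves the result as a CSV. The data were downloaded from the McCarroll Lab's Drop-Seq [website](http://mccarrolllab.com/dropseq/).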

In [8]:
import os
import common

# Name of this notebook; used to derive its dedicated figure and data output folders
notebook_name = '03_clean_cluster_assignments'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Create the output folders (and any missing parents) in pure Python rather than
# shelling out to `mkdir -p`: this is portable and safe for paths containing spaces.
# exist_ok=True keeps the cell re-runnable when the folders already exist.
for folder in (figure_folder, data_folder):
    os.makedirs(folder, exist_ok=True)

Figure folder: ../figures/03_clean_cluster_assignments
Data folder: ../data/03_clean_cluster_assignments


In [9]:
# Enable autoreload so edits to imported project modules are picked up live
%load_ext autoreload
%autoreload 2

# Numerical / tabular / plotting libraries used for the analysis
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Render matplotlib figures inline in the notebook
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Folder holding the original, unmodified downloads that this notebook reads from
input_folder = os.path.join(common.DATA_FOLDER, '00_original')

In [11]:
ls $input_folder

GSM1544798_SpeciesMix_ThousandSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544798_SpeciesMix_ThousandSTAMPs_MOUSE.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_HUMAN.digital_expression.txt.gz
GSM1544799_SpeciesMix_HundredSTAMPs_MOUSE.digital_expression.txt.gz
GSM1626793_P14Retina_1.digital_expression.txt.gz
GSM1626794_P14Retina_2.digital_expression.txt.gz
GSM1626795_P14Retina_3.digital_expression.txt.gz
GSM1626796_P14Retina_4.digital_expression.txt.gz
GSM1626797_P14Retina_5.digital_expression.txt.gz
GSM1626798_P14Retina_6.digital_expression.txt.gz
GSM1626799_P14Retina_7.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_HUMAN.digital_expression.txt.gz
GSM1629192_Pure_HumanMouse_MOUSE.digital_expression.txt.gz
GSM1629193_ERCC.digital_expression.txt.gz
GSM1629193_hg19_ERCC.dict.txt.gz
GSM1629193_hg19_ERCC.refFlat.txt.gz
mmc1.pdf
mmc2.xlsx
mmc3.xlsx
mmc4.xlsx
mmc4_v2.xlsx
retina_clusteridentities.txt


In [17]:
# Path to the cluster-identity table inside the raw-data folder.
# `input_folder` is defined once, in the earlier cell that lists that folder,
# so it is not re-assigned here.
filename = os.path.join(input_folder, 'retina_clusteridentities.txt')
filename

'../data/00_original/retina_clusteridentities.txt'

In [22]:
# Read the headerless, tab-separated table using its first column as the index.
# `read_table(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the columns axis afterwards is the documented replacement and likewise
# collapses a single remaining data column into a Series.
cluster_ids = pd.read_table(filename, header=None, index_col=0).squeeze('columns')

# Give the Series and its index descriptive names (they become the CSV header later)
cluster_ids.name = 'cluster_id'
cluster_ids.index.name = 'cell'
cluster_ids.head()

cell
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [24]:
# Drop the leading `r1_`-style prefix so the index holds only the barcode.
# Taking the LAST '_'-separated piece (rather than piece [1]) makes this cell safe to
# re-run: an already-stripped barcode has no '_' and is returned unchanged instead of
# raising IndexError.
# NOTE(review): once the prefix is gone, identical barcodes that originally carried
# different prefixes would collide — confirm `cluster_ids.index.is_unique` if that matters.
cluster_ids.index = cluster_ids.index.map(lambda x: x.split('_')[-1])
cluster_ids.head()

cell
GGCCGCAGTCCG     2
CTTGTGCGGGAA     2
GCGCAACTGCTC     2
GATTGGGAGGCA     2
GTGCCGCCTCTC    25
Name: cluster_id, dtype: int64

In [27]:
# Destination for the cleaned assignments inside this notebook's data folder
csv = os.path.join(data_folder, 'cluster_ids.csv')

# Write as a one-column table: header row is "cell,cluster_id", one row per barcode
cluster_ids.to_frame().to_csv(csv)

In [28]:
! head $csv

cell,cluster_id
GGCCGCAGTCCG,2
CTTGTGCGGGAA,2
GCGCAACTGCTC,2
GATTGGGAGGCA,2
GTGCCGCCTCTC,25
CCTGTGACACAC,2
AATCTCGTTAAT,2
GATTTCCTCTGA,1
GAAGGCTGGAAC,2


In [29]:
ls $csv

../data/03_clean_cluster_assignments/cluster_ids.csv
