#### Download data

```python
!mkdir data
!wget -P ./data/ http://blood.stemcells.cam.ac.uk/data/{normalisedCounts.txt.gz,all_cell_types.txt,cluster_ids.txt}
```

In [1]:
import pandas as pd
import numpy as np

In [2]:
### read the space-separated, gzip-compressed count matrix; the first column becomes the row index
df = pd.read_csv(
    './data/normalisedCounts.txt.gz',
    sep=' ',
    index_col=0,
)
### report the matrix dimensions, then preview the first rows
print(df.shape)
df.head()

(40594, 1656)


Unnamed: 0,HSPC_025,HSPC_031,HSPC_037,LT-HSC_001,HSPC_001,HSPC_008,HSPC_014,HSPC_020,HSPC_026,HSPC_038,...,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
ENSMUSG00000101673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000087456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000101615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000105499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000064900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Define cell labels and label colors

In [3]:
### tab-separated cell-type table, restricted to (and ordered like) the cells
### that are present as columns of `df`
cell_types = pd.read_csv(
    './data/all_cell_types.txt',
    sep='\t',
    index_col=0,
).loc[df.columns]

### space-separated file without a header row; its value column is named 'cluster'
cluster_ids = pd.read_csv(
    './data/cluster_ids.txt',
    sep=' ',
    header=None,
    index_col=0,
    names=['cluster'],
)

In [4]:
## Define cell labels based on gating markers.
## Each list names the `cell_types` columns that count towards one label.
HSC_group = ['LTHSC_broad','LTHSC','STHSC_broad','STHSC','ESLAM','HSC1']
LMPP_group = ['LMPP_broad','LMPP']
MPP_group = ['MPP_broad','MPP1_broad','MPP2_broad','MPP3_broad','MPP','MPP1','MPP2','MPP3']
CMP_group = ['CMP_broad','CMP']
MEP_group = ['MEP_broad','MEP']
GMP_group = ['GMP_broad','GMP']

## (label, gating columns) pairs. The order fixes the column order of
## `cell_labels`, and therefore the order of the candidate labels below.
label_groups = [
    ('HSC', HSC_group),
    ('LMPP', LMPP_group),
    ('MPP', MPP_group),
    ('CMP', CMP_group),
    ('MEP', MEP_group),
    ('GMP', GMP_group),
]

## One column per label: the row-wise sum over that label's gating columns.
cell_labels = pd.DataFrame(index=cell_types.index)
for group_label, gating_columns in label_groups:
    cell_labels[group_label] = cell_types[gating_columns].sum(axis=1)


## Colour used for each label, and the candidate labels for each cluster id
## (used only for cells that have no positive gating marker).
palette = {'HSC':'#40bdbd','MPP':'#eea113','CMP':'#d84f40','GMP':'#10b460','MEP':'#286ee1','LMPP':'#7c5246'}
dict_clusters = {'purple':['HSC'],'deeppink':['MEP'],'gold':['CMP','GMP'],'darkturquoise':['CMP','MPP','LMPP']}

label_missing = list()   # cells with no positive gating group
label_uniq = list()      # cells with exactly one positive gating group
label_multi = list()     # cells with more than one positive gating group

## Seed ONCE, before the loop. Re-seeding before every draw makes each draw
## reuse the same first random number, so the chosen position would depend only
## on how many candidates there are instead of being a random pick.
## NOTE: labels of the multi-label / unlabelled cells therefore differ from
## files that were produced with per-iteration re-seeding.
np.random.seed(2020)

## Boolean mask (cells x labels) computed once instead of once per test per row.
has_label = cell_labels > 0

## Collect one [label, colour] record per cell and build the frame in a single
## step; enlarging a DataFrame row by row with `.loc` copies data on every insert.
records = []
for x, flags in has_label.iterrows():
    candidates = cell_labels.columns[flags.values]
    if len(candidates) == 1:
        x_label = candidates[0]
        label_uniq.append(x)
    elif len(candidates) > 1:
        ### randomly select one from multiple labels
        x_label = np.random.choice(candidates, 1)[0]
        label_multi.append(x)
    else:
        ### decide labels based on clustering result for cells without gating markers
        x_label = np.random.choice(dict_clusters[cluster_ids.loc[x,'cluster']], 1)[0]
        label_missing.append(x)
    records.append([x_label, palette[x_label]])

## Plain list as index so the frame has an unnamed index, as it did when it was
## filled row by row.
metadata = pd.DataFrame(records, index=list(cell_labels.index), columns=['label','label_color'])

In [5]:
### cells with exactly one label, with several labels, and with none (one count per line)
print(len(label_uniq), len(label_multi), len(label_missing), sep='\n')

1539
106
11


In [6]:
### preview the first rows of the per-cell label / label colour table
metadata.head()

Unnamed: 0,label,label_color
HSPC_025,MPP,#eea113
HSPC_031,HSC,#40bdbd
HSPC_037,HSC,#40bdbd
LT-HSC_001,HSC,#40bdbd
HSPC_001,HSC,#40bdbd


#### Ensembl id to gene name

In [7]:
from pybiomart import Dataset

In [8]:
### Ensembl 81 (Jul 2015), which was used in the original manuscript, is no longer accessible (last checked: May 28, 2020).
### The oldest available archive is used here instead.
dataset = Dataset(name='mmusculus_gene_ensembl',host='mar2017.archive.ensembl.org')

In [9]:
### fetch the id / name attributes and index the table by the 'Gene stable ID'
### column (the column itself is kept), leaving the index unnamed
df_ensembl = (
    dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    .set_index('Gene stable ID', drop=False)
    .rename_axis(None)
)

In [10]:
### preview the first rows of the Ensembl id -> gene name table
df_ensembl.head()

Unnamed: 0,Gene stable ID,Gene name
ENSMUSG00000064372,ENSMUSG00000064372,mt-Tp
ENSMUSG00000064371,ENSMUSG00000064371,mt-Tt
ENSMUSG00000064370,ENSMUSG00000064370,mt-Cytb
ENSMUSG00000064369,ENSMUSG00000064369,mt-Te
ENSMUSG00000064368,ENSMUSG00000064368,mt-Nd6


In [11]:
### Translate every Ensembl id in `df.index` to its gene name; ids that have no
### entry in `df_ensembl` are kept unchanged. A plain dict gives constant-time
### lookups (instead of an index membership test plus `.loc` per gene) and
### always yields a single value per id: if an id were listed more than once in
### `df_ensembl`, the last gene name is used rather than a Series.
ensembl_to_name = dict(zip(df_ensembl.index, df_ensembl['Gene name']))
gene_list = [ensembl_to_name.get(x, x) for x in df.index]

In [12]:
### count the entries that still start with 'ENS', i.e. presumably ids left untranslated
sum(1 for x in gene_list if x.startswith('ENS'))

246

In [13]:
### work on a copy so that `df` itself is not modified
df_output = df.copy()
### replace the row labels with the translated names (`gene_list` follows the order of `df.index`)
df_output.index = gene_list

In [14]:
### preview the first rows of the re-indexed matrix
df_output.head()

Unnamed: 0,HSPC_025,HSPC_031,HSPC_037,LT-HSC_001,HSPC_001,HSPC_008,HSPC_014,HSPC_020,HSPC_026,HSPC_038,...,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
Gm28070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gm16322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gm8495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Igkv3-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gm25402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Output files

In [15]:
### write the re-indexed expression matrix as a tab-separated file
df_output.to_csv('data_nestorowa2016.tsv',sep='\t')

In [16]:
### full metadata table (label and label colour per cell), tab-separated, with header
metadata.to_csv('metadata.tsv', sep='\t')
### cell -> label only, no header row
metadata.loc[:, 'label'].to_csv('label.tsv', sep='\t', header=False)
### label -> colour, one row per palette entry, no header row
pd.Series(palette).to_csv('label_color.tsv', sep='\t', header=False)