In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory containing the input dataset (relative path, joined with the CSV file name when loading).
dir_data = '../../datasets/morrislab/'

In [3]:
# Load the per-cell table (first CSV column becomes the index) and shorten the
# day / CellTag column names to the names used in the rest of the notebook.
input_data = (
    pd.read_csv(os.path.join(dir_data,'./morrislab.celltag.dataset.csv'), index_col=0)
    .rename(columns={
        "Day": "time",
        "CellTag.D0": "Tag_0",
        "CellTag.D3": "Tag_3",
        "CellTag.D13": "Tag_13",
    })
)
print(input_data.shape)
input_data.head()

(48515, 24)


Unnamed: 0,nGene,nUMI,CellCycle,percent.mito,Replicate,time,Timepoint,Reprogramming.Day,Outcome,Cluster.Seurat,...,CellTag.Version,Seurat,Monocle,Tag_0,Tag_3,Tag_13,tSNE_1,tSNE_2,Component.1,Component.2
HF1.ACCCACTCAGGTGGAT-1,4028,2498.0,G1,7.57,HF1,0,Day 0,0,BAB,7,...,BAB,True,True,,,,5.401516,-0.001541,0.702858,7.071697
HF1.ACCTTTACACTGTCGG-1,4079,2957.0,G2M,6.58,HF1,0,Day 0,0,BAB,7,...,BAB,True,True,,,,5.382675,-0.008172,0.713981,7.133869
HF1.CAACCTCAGCCAGGAT-1,2413,2484.0,G1,8.55,HF1,0,Day 0,0,BAB,7,...,BAB,True,True,,,,5.396813,-0.014759,-2.546645,0.413626
HF1.CCTATTATCATCACCC-1,5097,3027.0,G1,6.75,HF1,0,Day 0,0,BAB,2,...,BAB,True,False,,,,12.90523,-5.628896,,
HF1.CGAGCACGTCCTGCTT-1,4675,3171.0,G2M,7.06,HF1,0,Day 0,0,BAB,7,...,BAB,True,False,,,,5.371256,0.010826,,


### filter out cells without any cell tag

In [4]:
# Keep a cell only if at least one of its three tag columns is not NaN.
input_data = input_data.loc[input_data[['Tag_0','Tag_3','Tag_13']].notna().any(axis=1)]

In [5]:
# Replace every remaining NaN in the table with -1. This applies to all columns,
# not only the tag columns (e.g. Component.1 / Component.2 also become -1).
input_data = input_data.fillna(-1)

In [6]:
# Preview the table after removing untagged cells and replacing NaN with -1.
input_data.head()

Unnamed: 0,nGene,nUMI,CellCycle,percent.mito,Replicate,time,Timepoint,Reprogramming.Day,Outcome,Cluster.Seurat,...,CellTag.Version,Seurat,Monocle,Tag_0,Tag_3,Tag_13,tSNE_1,tSNE_2,Component.1,Component.2
HF1.AACCGCGCAGGTGGAT-4,3744,2263.0,G2M,8.46,HF1,9,Day 9,9,BAB,0,...,BAB,True,False,-1.0,209.0,-1.0,21.852191,0.897177,-1.0,-1.0
HF1.AGCGGTCAGAGTTGGC-4,4700,3222.0,G1,8.08,HF1,9,Day 9,9,BAB,0,...,BAB,True,False,-1.0,256.0,-1.0,11.885008,3.797163,-1.0,-1.0
HF1.ATTTCTGCATGCCCGA-4,2642,2332.0,S,6.85,HF1,9,Day 9,9,BAB,1,...,BAB,True,False,546.0,-1.0,-1.0,9.091572,10.757542,-1.0,-1.0
HF1.CCTAGCTTCCAAATGC-4,5039,2982.0,G2M,8.14,HF1,9,Day 9,9,BAB,3,...,BAB,True,False,410.0,150.0,-1.0,15.499812,-21.348926,-1.0,-1.0
HF1.CGCTTCACACTTACGA-4,4828,2552.0,G1,7.69,HF1,9,Day 9,9,BAB,0,...,BAB,True,False,375.0,133.0,-1.0,18.567765,5.876435,-1.0,-1.0


In [7]:
# Per-cell metadata: the first eight columns of the table, selected by position
# (nGene through Reprogramming.Day in the preview below).
df_metadata = input_data.iloc[:, :8].copy()
df_metadata.head()

Unnamed: 0,nGene,nUMI,CellCycle,percent.mito,Replicate,time,Timepoint,Reprogramming.Day
HF1.AACCGCGCAGGTGGAT-4,3744,2263.0,G2M,8.46,HF1,9,Day 9,9
HF1.AGCGGTCAGAGTTGGC-4,4700,3222.0,G1,8.08,HF1,9,Day 9,9
HF1.ATTTCTGCATGCCCGA-4,2642,2332.0,S,6.85,HF1,9,Day 9,9
HF1.CCTAGCTTCCAAATGC-4,5039,2982.0,G2M,8.14,HF1,9,Day 9,9
HF1.CGCTTCACACTTACGA-4,4828,2552.0,G1,7.69,HF1,9,Day 9,9


In [8]:
# Distinct values of the 'time' column and the number of cells at each of them.
np.unique(df_metadata['time'],return_counts=True)

(array([ 6,  9, 12, 15, 21, 28]), array([  87,  903, 3030, 3288, 4928, 6567]))

In [9]:
# Tag table: one integer column per CellTag; -1 is the placeholder written by
# the earlier fillna for entries that were NaN.
df_tags = input_data[['Tag_0','Tag_3','Tag_13']].astype('int')
df_tags.head()

Unnamed: 0,Tag_0,Tag_3,Tag_13
HF1.AACCGCGCAGGTGGAT-4,-1,209,-1
HF1.AGCGGTCAGAGTTGGC-4,-1,256,-1
HF1.ATTTCTGCATGCCCGA-4,546,-1,-1
HF1.CCTAGCTTCCAAATGC-4,410,150,-1
HF1.CGCTTCACACTTACGA-4,375,133,-1


In [10]:
## Distinguish the two kinds of missing tag:
## -1 represents a dropout event (left unchanged here);
## -2 means a real missing value: the tag's time (parsed from its column name)
##    is later than the cell's 'time', so every such entry is overwritten with -2.
# Time of each tag, parsed from the column name ('Tag_13' -> 13).
list_tagtime = [int(x[1]) for x in df_tags.columns.str.split('_')]
# 'time' of every cell, aligned to the row order of df_tags.
cell_time = df_metadata.loc[df_tags.index, 'time'].to_numpy()
# Boolean (cells x tags) matrix: True where the tag time is greater than the cell time.
is_future_tag = np.asarray(list_tagtime)[np.newaxis, :] > cell_time[:, np.newaxis]
# Single vectorised overwrite instead of one .loc assignment per cell.
df_tags = df_tags.mask(is_future_tag, -2)

In [11]:
# (number of cells, number of tag columns) before the dropout filter is applied.
df_tags.shape

(18803, 3)

### filter out cells with more than one tag dropout (more than one `-1` among the three tags)

In [12]:
# Keep cells whose tag row contains fewer than two dropout markers (-1).
df_tags = df_tags.loc[df_tags.eq(-1).sum(axis=1).lt(2)]
print(df_tags.shape)
# Restrict the metadata to the same cells, in the same order.
df_metadata = df_metadata.loc[df_tags.index]

(7058, 3)


In [13]:
cutoff_timepoint = 13
# Cells sampled after the cutoff define the clones: every distinct
# (Tag_0, Tag_3, Tag_13) combination among them becomes one clone column.
late_cells = df_metadata.index[df_metadata['time'] > cutoff_timepoint]
tags = np.unique(df_tags.loc[late_cells].to_numpy(), axis=0)

# Cell-by-clone indicator matrix, initialised to 0, with one MultiIndex level per tag.
columns = pd.MultiIndex.from_arrays(list(tags.T), names=['Tag_0','Tag_3','Tag_13'])
df_clones = pd.DataFrame(0, index=df_metadata.index, columns=columns)

In [14]:
import time
time_st = time.time()
# Distinct values seen at each tag position among the clone definitions in `tags`.
# Built once so the per-cell membership test is a set lookup rather than an array scan.
tag_values_by_level = [set(tags[:, ii].tolist()) for ii in range(tags.shape[1])]
for i,x in enumerate(df_tags.index):
    tags_x = tuple(df_tags.loc[x,])
    # Exact match: the cell's full tag combination is itself a clone column.
    # The lookup must be against df_clones.columns (the clone tuples);
    # df_tags.columns only holds the three tag column names and never matches.
    if(tags_x in df_clones.columns):
        df_clones.loc[x,tags_x] = 1
    else:
        # Partial match: every tag other than -2 must occur at its own
        # position in at least one clone definition.
        if(all([xx in tag_values_by_level[ii] for ii,xx in enumerate(tags_x) if xx!=-2])):
            # -2 entries act as wildcards: slice(None) selects every value of that level,
            # so the cell is assigned to all clone columns agreeing on the other tags.
            tags_x = tuple(slice(None) if tx == -2 else tx for tx in tags_x)
            df_clones.loc[x,tags_x] = 1
    if(i%1000==0):
        print(str(i) + ' cells have been processed...')
time_end = time.time()

0 cells have been processed...
1000 cells have been processed...
2000 cells have been processed...
3000 cells have been processed...
4000 cells have been processed...
5000 cells have been processed...
6000 cells have been processed...
7000 cells have been processed...


In [15]:
# Elapsed time between time_st and time_end (set around the clone-assignment loop), in minutes.
(time_end - time_st)/60

6.023768182595571

### output files

In [16]:
# Keep only the cells that were assigned to at least one clone column.
has_clone = df_clones.sum(axis=1).gt(0)
df_clones_output = df_clones.loc[has_clone]
# Re-assign the column labels as a plain list of (Tag_0, Tag_3, Tag_13) tuples.
df_clones_output.columns = list(df_clones_output.columns)

In [17]:
# Metadata rows for exactly the cells kept in df_clones_output, in the same order.
df_metadata_outpout = df_metadata.loc[df_clones_output.index]

In [18]:
# tSNE coordinates of the cells kept in df_clones_output, in the same order.
df_coord_output = input_data[['tSNE_1','tSNE_2']].loc[df_clones_output.index]
# Replace the coordinate column names by positional labels 0, 1, ...
df_coord_output.columns = range(len(df_coord_output.columns))

In [19]:
# (number of cells kept, number of clone columns), followed by a preview of the indicator matrix.
print(df_clones_output.shape)
df_clones_output.head()

(4498, 781)


Unnamed: 0,"(-1, 97, 75)","(-1, 343, 71)","(-1, 343, 257)","(-1, 357, 19)","(-1, 486, 159)","(-1, 516, 97)","(-1, 522, 32)","(-1, 593, 251)","(-1, 629, 59)","(-1, 766, 108)",...,"(3023, 2052, -1)","(3026, -1, 2068)","(3026, -1, 2101)","(3026, -1, 2160)","(3026, -1, 2283)","(3026, -1, 2288)","(3035, 2281, -1)","(3036, 2513, -1)","(3036, 2651, -1)","(3036, 2656, -1)"
HF1.CCTAGCTTCCAAATGC-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HF1.CGCTTCACACTTACGA-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HF1.GTAGTCATCGTATCAG-4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HF1.AAATGCCTCTAACTGG-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HF1.AAGACCTTCCTCTAGC-5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Directory that receives the three output tables; created if it does not exist yet.
workdir = 'output'
# exist_ok=True makes creation a single idempotent call, with no separate
# existence check that could go stale before the directory is created.
os.makedirs(workdir, exist_ok=True)

In [21]:
# Write the clone matrix, the metadata and the coordinates as gzip-compressed,
# tab-separated tables (header and index included) into workdir.
for frame, filename in (
    (df_clones_output, 'clones.tsv.gz'),
    (df_metadata_outpout, 'metadata.tsv.gz'),
    (df_coord_output, 'coordinates.tsv.gz'),
):
    frame.to_csv(os.path.join(workdir, filename), header=True, index=True, sep='\t', compression='gzip')