# Import dependencies

In [1]:
import sys, os
from pandas.io.parsers import read_csv
import numpy as np
import pandas as pd
import collections
import scipy

# FACS3.2

Overview of gastruloids in this dataset:

gastruloid name | Condition | Plates
---|---|---
1-1-1 | 96h embedded, induced | A, B, C, D, E, F, G 1/2
3-2 | 120h embedded, induced | G 1/2, H, I, J, K, L 1/2
3-6 | 120h embedded, induced | L 1/2, M, N, O, P, Q, R 1/2
3 | 120h embedded, non-induced | R 1/2, S

## First: make a dictionary of all gastruloid names

We can use this dictionary to change all names correctly. Names should be as follows: plateName_cellNumber.gastruloidName. 
We need to make a dictionary to correctly assign all cells to the correct gastruloids: some plates contain cells from multiple gastruloids.

In [71]:
def gastruloidDictionary(plateName, gastruloidName, lower, upper):
    """Map the raw cell names of one plate to names tagged with a gastruloid.

    Cells on a plate are numbered 1 to 384, so a whole plate is covered with
    lower=1 and upper=385 (upper is exclusive, exactly as in range()).

    Returns a dict whose keys are 'plateName_cellNumber' and whose values are
    'plateName_cellNumber.gastruloidName', one entry per cell number in
    range(lower, upper).
    """
    cellPrefix = plateName + '_'
    gastruloidSuffix = '.' + gastruloidName
    return {
        cellPrefix + str(number): cellPrefix + str(number) + gastruloidSuffix
        for number in range(lower, upper)
    }

First we add all plates that contain just one gastruloid to a dictionary.

In [91]:
# Plates A-F: every cell (1-384) comes from gastruloid 96hE-1-1-1.
plates111 = ['A','B','C','D','E','F']
dict3t2_ABCDEF = {
    rawName: taggedName
    for plateLetter in plates111
    for rawName, taggedName in gastruloidDictionary(plateLetter + '3t2', '96hE-1-1-1', 1, 385).items()
}

# Plates H-K: every cell comes from gastruloid 120hE-3-2.
plates32 = ['H','I','J','K']
dict3t2_HIJK = {
    rawName: taggedName
    for plateLetter in plates32
    for rawName, taggedName in gastruloidDictionary(plateLetter + '3t2', '120hE-3-2', 1, 385).items()
}

# Plates M-Q: every cell comes from gastruloid 120hE-3-6.
plates36 = ['M','N','O','P','Q']
dict3t2_MNOPQ = {
    rawName: taggedName
    for plateLetter in plates36
    for rawName, taggedName in gastruloidDictionary(plateLetter + '3t2', '120hE-3-6', 1, 385).items()
}

# Plate S: every cell comes from gastruloid 120hE-3.
dict3t2_S = gastruloidDictionary('S3t2', '120hE-3', 1, 385)


Then we just have to add the plates that contain multiple gastruloids.

In [92]:
# Plate G: cells 1-168 and 173-192 are 96hE-1-1-1; cells 169-172 and
# 193-384 are 120hE-3-2 (upper bounds below are exclusive).
dict3t2_G = {
    **gastruloidDictionary('G3t2', '96hE-1-1-1', 1, 169),
    **gastruloidDictionary('G3t2', '96hE-1-1-1', 173, 193),
    **gastruloidDictionary('G3t2', '120hE-3-2', 169, 173),
    **gastruloidDictionary('G3t2', '120hE-3-2', 193, 385),
}

# Plate L: cells 1-205 are 120hE-3-2, cells 206-384 are 120hE-3-6.
dict3t2_L = {
    **gastruloidDictionary('L3t2', '120hE-3-2', 1, 206),
    **gastruloidDictionary('L3t2', '120hE-3-6', 206, 385),
}

# Plate R: cells 1-64 are 120hE-3-6, cells 65-384 are 120hE-3.
dict3t2_R = {
    **gastruloidDictionary('R3t2', '120hE-3-6', 1, 65),
    **gastruloidDictionary('R3t2', '120hE-3', 65, 385),
}

Now we can add all dictionaries together in one final dictionary.

In [99]:
# All FACS3.2 naming dictionaries, listed in plate order (A through S).
alldicts3t2 = [
    dict3t2_ABCDEF,
    dict3t2_G,
    dict3t2_HIJK,
    dict3t2_L,
    dict3t2_MNOPQ,
    dict3t2_R,
    dict3t2_S,
]


In [100]:
# Flatten the per-plate dictionaries into a single old-name -> new-name lookup.
finaldictionary3t2 = {
    rawName: taggedName
    for plateDict in alldicts3t2
    for rawName, taggedName in plateDict.items()
}
    

## Import and process all dataframes

First define the path to the directory that contains all .csv files; the read-in function below uses it to locate each plate's file.

In [102]:
# Directory holding the demultiplexed, SQ-filtered count tables (.csv).
# The original location stays the default; set the SCAR_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# file names are appended to this string directly.
pathToData = os.path.join(
    os.environ.get(
        'SCAR_DATA_DIR',
        '/Users/m.blotenburg/Documents/Projects/Mouse_Scartrace/Data_analysis/Scar_analysis/20190903_clusterBackupScars_VAN2988_OUD3694_OUD3695/cluster/168hAA/raw_demultiplexed/SQfilter_90/',
    ),
    '',
)

In [103]:
def readInScarPlate(pathToFile, plateName, dictGastruloidNames, dataDir=None):
    """Load one plate's scar count table and label its columns by gastruloid.

    Parameters
    ----------
    pathToFile : str
        File name of the plate's .csv, relative to the data directory.
    plateName : str
        Short plate label (e.g. 'A3t2') that replaces the long sample prefix
        in the cell column names.
    dictGastruloidNames : dict
        Maps 'plateName_cellNumber' to 'plateName_cellNumber.gastruloidName'.
        Column names without an entry are left unrenamed; if such a name
        contains no '.', the split below raises IndexError.
    dataDir : str, optional
        Directory that contains pathToFile. Defaults to the notebook-level
        pathToData, so existing three-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by (chrom, allele, site, scar) with all index labels
        converted to str; columns form a (gastruloid, cell) MultiIndex in
        sorted order.
    """
    baseDir = pathToData if dataDir is None else dataDir
    scarPlate = read_csv(os.path.join(baseDir, pathToFile), sep=',',
                         low_memory=False, index_col=[0, 1, 2, 3])
    # NOTE(review): the index has four levels, so every idx below is a tuple
    # and comparing it with the string 'Unknown' is always True; this line
    # does not appear to drop any rows. Kept unchanged -- confirm whether rows
    # with chrom == 'Unknown' were meant to be removed.
    scarPlate = scarPlate.loc[[idx for idx in scarPlate.index if 'Unknown'!=idx]]
    # Discard the first remaining row (presumably a non-data row of the
    # export -- TODO confirm against a raw count table).
    scarPlate = scarPlate.iloc[1:]

    # Swap the long sample prefix (everything up to the last '_' of the first
    # column name) for the short plate name. regex=False because the prefix is
    # literal text taken from the data, not a pattern; older pandas versions
    # would otherwise interpret characters such as '+' or '(' as regex syntax.
    samplePrefix = scarPlate.columns[0].rsplit('_', 1)[0] + '_'
    scarPlate.columns = scarPlate.columns.str.replace(samplePrefix, plateName + '_', regex=False)
    scarPlate.index.names = ['chrom','allele','site', 'scar']

    # index=str converts every index label to a string; columns are renamed to
    # 'cell.gastruloid' so they can be split into the two column levels.
    scarPlate = scarPlate.rename(index=str, columns=dictGastruloidNames)
    splitNames = [x.split('.', 3) for x in scarPlate.columns]
    scarPlate.columns = pd.MultiIndex.from_tuples(
        [(parts[1], parts[0]) for parts in splitNames])
    scarPlate.columns.names = ['gastruloid','cell']
    scarPlate = scarPlate[sorted(scarPlate.columns)]
    return scarPlate

Define the full path for each plate.

In [104]:
# Count-table file names for the FACS3.2 plates, relative to pathToData.
# Plates A-F carry the run identifier CM8V3 in their names, plates G-S CM8TV.
pathA3t2 = 'MB-FACS3t2-Scars-plateA-dualIndex_000000000-CM8V3_S13_SQfiltered_Threshold90_countTable_nodedup.csv'
pathB3t2 = 'MB-FACS3t2-Scars-plateB-dualIndex_000000000-CM8V3_S14_SQfiltered_Threshold90_countTable_nodedup.csv'
pathC3t2 = 'MB-FACS3t2-Scars-plateC-dualIndex_000000000-CM8V3_S15_SQfiltered_Threshold90_countTable_nodedup.csv'
pathD3t2 = 'MB-FACS3t2-Scars-plateD-dualIndex_000000000-CM8V3_S16_SQfiltered_Threshold90_countTable_nodedup.csv'
pathE3t2 = 'MB-FACS3t2-Scars-plateE-dualIndex_000000000-CM8V3_S17_SQfiltered_Threshold90_countTable_nodedup.csv'
pathF3t2 = 'MB-FACS3t2-Scars-plateF-dualIndex_000000000-CM8V3_S18_SQfiltered_Threshold90_countTable_nodedup.csv'

pathG3t2 = 'MB-FACS3t2-Scars-plateG-dualIndex_000000000-CM8TV_S1_SQfiltered_Threshold90_countTable_nodedup.csv'
pathH3t2 = 'MB-FACS3t2-Scars-plateH-dualIndex_000000000-CM8TV_S2_SQfiltered_Threshold90_countTable_nodedup.csv'
pathI3t2 = 'MB-FACS3t2-Scars-plateI-dualIndex_000000000-CM8TV_S3_SQfiltered_Threshold90_countTable_nodedup.csv'
pathJ3t2 = 'MB-FACS3t2-Scars-plateJ-dualIndex_000000000-CM8TV_S4_SQfiltered_Threshold90_countTable_nodedup.csv'
pathK3t2 = 'MB-FACS3t2-Scars-plateK-dualIndex_000000000-CM8TV_S5_SQfiltered_Threshold90_countTable_nodedup.csv'
pathL3t2 = 'MB-FACS3t2-Scars-plateL-dualIndex_000000000-CM8TV_S6_SQfiltered_Threshold90_countTable_nodedup.csv'
pathM3t2 = 'MB-FACS3t2-Scars-plateM-dualIndex_000000000-CM8TV_S7_SQfiltered_Threshold90_countTable_nodedup.csv'
pathN3t2 = 'MB-FACS3t2-Scars-plateN-dualIndex_000000000-CM8TV_S8_SQfiltered_Threshold90_countTable_nodedup.csv'
pathO3t2 = 'MB-FACS3t2-Scars-plateO-dualIndex_000000000-CM8TV_S9_SQfiltered_Threshold90_countTable_nodedup.csv'
pathP3t2 = 'MB-FACS3t2-Scars-plateP-dualIndex_000000000-CM8TV_S10_SQfiltered_Threshold90_countTable_nodedup.csv'
pathQ3t2 = 'MB-FACS3t2-Scars-plateQ-dualIndex_000000000-CM8TV_S11_SQfiltered_Threshold90_countTable_nodedup.csv'
pathR3t2 = 'MB-FACS3t2-Scars-plateR-dualIndex_000000000-CM8TV_S12_SQfiltered_Threshold90_countTable_nodedup.csv'
pathS3t2 = 'MB-FACS3t2-Scars-plateS-dualIndex_000000000-CM8TV_S13_SQfiltered_Threshold90_countTable_nodedup.csv'


Read in and process all plates.

In [118]:
# Read the FACS3.2 plates in order A-S; each (file name, plate label) pair
# below yields one processed DataFrame, unpacked into its own variable.
(
    plateA_3t2, plateB_3t2, plateC_3t2, plateD_3t2, plateE_3t2,
    plateF_3t2, plateG_3t2, plateH_3t2, plateI_3t2, plateJ_3t2,
    plateK_3t2, plateL_3t2, plateM_3t2, plateN_3t2, plateO_3t2,
    plateP_3t2, plateQ_3t2, plateR_3t2, plateS_3t2,
) = [
    readInScarPlate(csvName, plateLabel, finaldictionary3t2)
    for csvName, plateLabel in [
        (pathA3t2, 'A3t2'),
        (pathB3t2, 'B3t2'),
        (pathC3t2, 'C3t2'),
        (pathD3t2, 'D3t2'),
        (pathE3t2, 'E3t2'),
        (pathF3t2, 'F3t2'),
        (pathG3t2, 'G3t2'),
        (pathH3t2, 'H3t2'),
        (pathI3t2, 'I3t2'),
        (pathJ3t2, 'J3t2'),
        (pathK3t2, 'K3t2'),
        (pathL3t2, 'L3t2'),
        (pathM3t2, 'M3t2'),
        (pathN3t2, 'N3t2'),
        (pathO3t2, 'O3t2'),
        (pathP3t2, 'P3t2'),
        (pathQ3t2, 'Q3t2'),
        (pathR3t2, 'R3t2'),
        (pathS3t2, 'S3t2'),
    ]
]


# FACS4.3

Overview of gastruloids in this dataset:

gastruloid name | Condition | Plates
---|---|---
1-2-4 | 120h embedded, induced | A, B, C, D, E, F, G 1/2
1-5 | 96h embedded, induced | G 1/2, H, I, J, K, L

## First: make a dictionary of all gastruloid names

We can use this dictionary to change all names correctly. Names should be as follows: plateName_cellNumber.gastruloidName. 
We need to make a dictionary to correctly assign all cells to the correct gastruloids: some plates contain cells from multiple gastruloids.

First we add all plates that contain just one gastruloid to a dictionary.
Then we just have to add the plate that contains multiple gastruloids.

In [119]:
# Plates A-F: every cell (1-384) comes from gastruloid 120hE-1-2-4.
plates124 = ['A','B','C','D','E','F']
dict4t3_ABCDEF = {
    rawName: taggedName
    for plateLetter in plates124
    for rawName, taggedName in gastruloidDictionary(plateLetter + '4t3', '120hE-1-2-4', 1, 385).items()
}

# Plates H-L: every cell comes from gastruloid 96hE-1-5.
plates15 = ['H','I','J','K','L']
dict4t3_HIJKL = {
    rawName: taggedName
    for plateLetter in plates15
    for rawName, taggedName in gastruloidDictionary(plateLetter + '4t3', '96hE-1-5', 1, 385).items()
}

# Plate G: cells 1-148 are 120hE-1-2-4, cells 149-384 are 96hE-1-5
# (upper bounds below are exclusive).
dict4t3_G = {
    **gastruloidDictionary('G4t3', '120hE-1-2-4', 1, 149),
    **gastruloidDictionary('G4t3', '96hE-1-5', 149, 385),
}


Now we can add all dictionaries together in one final dictionary.

In [120]:
# All FACS4.3 naming dictionaries, in plate order (A-F, G, H-L) ...
alldicts4t3 = [
    dict4t3_ABCDEF,
    dict4t3_G,
    dict4t3_HIJKL,
]
# ... flattened into a single old-name -> new-name lookup.
finaldictionary4t3 = {
    rawName: taggedName
    for plateDict in alldicts4t3
    for rawName, taggedName in plateDict.items()
}
    

## Import and process all dataframes

Define the full path for each plate.

In [122]:
# Count-table file names for the FACS4.3 plates A-L, relative to pathToData.
# All twelve names carry the run identifier CM8V3 and sample numbers S1-S12.
pathA4t3 = 'MB-FACS4t3-Scars-plateA-dualIndex_000000000-CM8V3_S1_SQfiltered_Threshold90_countTable_nodedup.csv'
pathB4t3 = 'MB-FACS4t3-Scars-plateB-dualIndex_000000000-CM8V3_S2_SQfiltered_Threshold90_countTable_nodedup.csv'
pathC4t3 = 'MB-FACS4t3-Scars-plateC-dualIndex_000000000-CM8V3_S3_SQfiltered_Threshold90_countTable_nodedup.csv'
pathD4t3 = 'MB-FACS4t3-Scars-plateD-dualIndex_000000000-CM8V3_S4_SQfiltered_Threshold90_countTable_nodedup.csv'
pathE4t3 = 'MB-FACS4t3-Scars-plateE-dualIndex_000000000-CM8V3_S5_SQfiltered_Threshold90_countTable_nodedup.csv'
pathF4t3 = 'MB-FACS4t3-Scars-plateF-dualIndex_000000000-CM8V3_S6_SQfiltered_Threshold90_countTable_nodedup.csv'
pathG4t3 = 'MB-FACS4t3-Scars-plateG-dualIndex_000000000-CM8V3_S7_SQfiltered_Threshold90_countTable_nodedup.csv'
pathH4t3 = 'MB-FACS4t3-Scars-plateH-dualIndex_000000000-CM8V3_S8_SQfiltered_Threshold90_countTable_nodedup.csv'
pathI4t3 = 'MB-FACS4t3-Scars-plateI-dualIndex_000000000-CM8V3_S9_SQfiltered_Threshold90_countTable_nodedup.csv'
pathJ4t3 = 'MB-FACS4t3-Scars-plateJ-dualIndex_000000000-CM8V3_S10_SQfiltered_Threshold90_countTable_nodedup.csv'
pathK4t3 = 'MB-FACS4t3-Scars-plateK-dualIndex_000000000-CM8V3_S11_SQfiltered_Threshold90_countTable_nodedup.csv'
pathL4t3 = 'MB-FACS4t3-Scars-plateL-dualIndex_000000000-CM8V3_S12_SQfiltered_Threshold90_countTable_nodedup.csv'

Read in and process all plates.

In [123]:
# Read the FACS4.3 plates in order A-L; each (file name, plate label) pair
# below yields one processed DataFrame, unpacked into its own variable.
(
    plateA_4t3, plateB_4t3, plateC_4t3, plateD_4t3,
    plateE_4t3, plateF_4t3, plateG_4t3, plateH_4t3,
    plateI_4t3, plateJ_4t3, plateK_4t3, plateL_4t3,
) = [
    readInScarPlate(csvName, plateLabel, finaldictionary4t3)
    for csvName, plateLabel in [
        (pathA4t3, 'A4t3'),
        (pathB4t3, 'B4t3'),
        (pathC4t3, 'C4t3'),
        (pathD4t3, 'D4t3'),
        (pathE4t3, 'E4t3'),
        (pathF4t3, 'F4t3'),
        (pathG4t3, 'G4t3'),
        (pathH4t3, 'H4t3'),
        (pathI4t3, 'I4t3'),
        (pathJ4t3, 'J4t3'),
        (pathK4t3, 'K4t3'),
        (pathL4t3, 'L4t3'),
    ]
]

### Merge dataframes

In [124]:
# Every processed plate: FACS3.2 plates A-S first, then FACS4.3 plates A-L.
dfs_all = [
    # FACS3.2
    plateA_3t2, plateB_3t2, plateC_3t2, plateD_3t2, plateE_3t2,
    plateF_3t2, plateG_3t2, plateH_3t2, plateI_3t2, plateJ_3t2,
    plateK_3t2, plateL_3t2, plateM_3t2, plateN_3t2, plateO_3t2,
    plateP_3t2, plateQ_3t2, plateR_3t2, plateS_3t2,
    # FACS4.3
    plateA_4t3, plateB_4t3, plateC_4t3, plateD_4t3, plateE_4t3,
    plateF_4t3, plateG_4t3, plateH_4t3, plateI_4t3, plateJ_4t3,
    plateK_4t3, plateL_4t3,
]

# Outer-join all plates on their row index, starting from the first plate,
# so that a row present in any plate is kept in the merged table.
all_scars = dfs_all[0].join(dfs_all[1:], how='outer')

In [125]:
# Preview the first rows of the merged table as the cell output.
all_scars.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gastruloid,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,96hE-1-1-1,...,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5,96hE-1-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,cell,A3t2_1,A3t2_100,A3t2_102,A3t2_103,A3t2_104,A3t2_107,A3t2_108,A3t2_109,A3t2_111,A3t2_112,...,L4t3_90,L4t3_91,L4t3_92,L4t3_93,L4t3_94,L4t3_95,L4t3_96,L4t3_97,L4t3_98,L4t3_99
chrom,allele,site,scar,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
1,,116611825,WT,,,,,,,,,,,...,,,,,,,,,,
1,,118624739,N.118624928.I,,,,,,,,,,,...,,,,,,,,,,
1,,128363706,G.128363806.I,,,,,,,,,,,...,,,,,,,,,,
1,,13187847,WT,,,,,,,,,,,...,,,,,,,,,,
1,,131932286,WT,,,,,,,,,,,...,,,,,,,,,,


# Saving full dataframe

In [36]:
# Write the merged table as a pickle into the same directory the count
# tables were read from (pathToData must end with a path separator, because
# the file name is appended by plain string concatenation).
all_scars.to_pickle(pathToData + '168hAA_Miseq_dataFrame_allscars_allplates_SQ90Filtered.pickle.gz')