# Import dependencies

In [1]:
import sys, os
from pandas.io.parsers import read_csv
import numpy as np
import pandas as pd
import collections
import scipy

# All functions

First we make a dictionary containing information of all plates and all gastruloids within those plates. 

We can use this dictionary to change all names correctly. Names should be as follows: plateName_cellNumber.gastruloidName. 
We need to make a dictionary to correctly assign all cells to the correct gastruloids: some plates contain cells from multiple gastruloids.

In [2]:
# Cell numbers within a plate run from 1 to 384. `upper` is exclusive, so for cells 1-384 pass lower=1, upper=385 (range(1,385)).
def gastruloidDictionary(plateName, gastruloidName, lower, upper):
    """Build the old-name -> new-name mapping for the cells of one plate.

    For every cell number n in range(lower, upper) (upper is exclusive) the
    key is '<plateName>_<n>' and the value is '<plateName>_<n>.<gastruloidName>'.
    """
    # Raw cell names exactly as they appear after the plate prefix is applied.
    rawNames = ((plateName + '_') + str(n) for n in range(lower, upper))
    # Tag each raw name with the gastruloid it belongs to.
    return {raw: raw + '.' + gastruloidName for raw in rawNames}

First define the path to the directory that contains all .csv files; `readInTransPlate` prepends it to each file path to locate the file.

In [3]:
# Root directory of the count tables. readInTransPlate concatenates this string
# with each plate path, so it must end with a path separator.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
pathToData = '/Users/m.blotenburg/Documents/Projects/Lianne/data/'

In [4]:
def readInTransPlate(pathToFile, plateName,dictGastruloidNames):
    """Load one plate's count table and relabel its cells and features.

    Parameters
    ----------
    pathToFile : str
        Location of the CSV; it is appended to the notebook-level
        ``pathToData`` string.
    plateName : str
        Short plate identifier (e.g. 'g1') that replaces the sample prefix
        found in the CSV column names.
    dictGastruloidNames : dict
        Maps '<plateName>_<cellNumber>' to the final cell name. Columns
        without an entry keep their '<plateName>_<cellNumber>' name.

    Returns
    -------
    pandas.DataFrame
        One column per cell (sorted by name) and one row per feature, with
        the two index columns of the CSV joined as '<first>_<second>'.
    """
    transPlate = read_csv(pathToData + pathToFile, sep=',', low_memory=False, index_col=[0, 1])

    # Drop rows labelled 'Unknown'. The index has two levels, so each label is
    # a tuple; comparing the whole label with the string 'Unknown' is always
    # unequal and filtered nothing. Test the individual level values instead,
    # and select with a boolean mask so duplicated labels cannot multiply rows.
    keep = [
        'Unknown' not in (label if isinstance(label, tuple) else (label,))
        for label in transPlate.index
    ]
    transPlate = transPlate.loc[keep]

    # The first remaining row is discarded.
    # NOTE(review): presumably a second header line of the count table - confirm.
    transPlate = transPlate.iloc[1:]

    # Swap the sample prefix (everything up to the last '_' of the first
    # column name) for the short plate name. regex=False makes this a literal
    # replacement regardless of the pandas version's default.
    samplePrefix = transPlate.columns[0].rsplit('_', 1)[0] + '_'
    transPlate.columns = transPlate.columns.str.replace(samplePrefix, plateName + '_', regex=False)

    # Stringify the index values and apply the cell renaming.
    transPlate = transPlate.rename(index=str, columns=dictGastruloidNames)
    # Flatten the two index levels into a single '<gene>_<chrom>' label.
    transPlate.index = [f'{gene}_{chrom}' for gene,chrom in transPlate.index]
    transPlate = transPlate[sorted(transPlate.columns)]
    return transPlate

# Dataset: Medium and matrigel gastruloids

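Overview of gastruloids in this dataset (note: the cells below load 15 gastruloid plates, g1–g15, and 12 mESC plates, e1–e12, so this table does not list every plate that is processed):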

Conditions | plates 
---|---
Pooled medium gastruloids | 1,2,3
Single medium gastruloid A | 4
Single medium gastruloid B | 5
Pooled matrigel gastruloids | 6,7,8
Single matrigel gastruloid A | 9
Single matrigel gastruloid B | 10

### Make a dictionary of all gastruloid names

First we add all plates that contain just one gastruloid to a dictionary.
Then we just have to add the plates that contain multiple gastruloids.

In [5]:
# One renaming dictionary per plate: '<plate>_<cell>' -> '<plate>_<cell>.<label>'.
# Every plate covers cell numbers 1-384 (the upper bound 385 is exclusive).

# Gastruloid plates g1-g15.
dictg1 = gastruloidDictionary('g1','g-D2_con6',1,385)
dictg2 = gastruloidDictionary('g2','g-C1_con6',1,385)
dictg3 = gastruloidDictionary('g3','g-F2_con6',1,385)
dictg4 = gastruloidDictionary('g4','g-B3_con5',1,385)
dictg5 = gastruloidDictionary('g5','g-C3_con5',1,385)
dictg6 = gastruloidDictionary('g6','g-D2_con5',1,385)
dictg7 = gastruloidDictionary('g7','g-F1_con5',1,385)
dictg8 = gastruloidDictionary('g8','g-C2_con5',1,385)
dictg9 = gastruloidDictionary('g9','g-A1_con1',1,385)
dictg10 = gastruloidDictionary('g10','g-B2_con1',1,385)
# NOTE(review): label is 'C1b' while the plate 11 folder name ends in 'C1' - confirm.
dictg11 = gastruloidDictionary('g11','g-C1b_con1',1,385)
dictg12 = gastruloidDictionary('g12','g-H3_con1',1,385)
dictg13 = gastruloidDictionary('g13','g-F4_con1',1,385)
dictg14 = gastruloidDictionary('g14','g-G2_con6',1,385)
dictg15 = gastruloidDictionary('g15','g-H2_con6',1,385)

# mESC plates e1-e12; each condition label (con1-con6) is shared by two plates.
dicte1 = gastruloidDictionary('e1','m-ESC_con1',1,385)
dicte2 = gastruloidDictionary('e2','m-ESC_con1',1,385)
dicte3 = gastruloidDictionary('e3','m-ESC_con2',1,385)
dicte4 = gastruloidDictionary('e4','m-ESC_con2',1,385)
dicte5 = gastruloidDictionary('e5','m-ESC_con3',1,385)
dicte6 = gastruloidDictionary('e6','m-ESC_con3',1,385)
dicte7 = gastruloidDictionary('e7','m-ESC_con4',1,385)
dicte8 = gastruloidDictionary('e8','m-ESC_con4',1,385)
dicte9 = gastruloidDictionary('e9','m-ESC_con5',1,385)
dicte10 = gastruloidDictionary('e10','m-ESC_con5',1,385)
dicte11 = gastruloidDictionary('e11','m-ESC_con6',1,385)
dicte12 = gastruloidDictionary('e12','m-ESC_con6',1,385)


Now we can add all dictionaries together in one final dictionary.

In [6]:
# Every per-plate dictionary: gastruloid plates first, then mESC plates.
alldicts = [
    # gastruloid plates
    dictg1, dictg2, dictg3, dictg4, dictg5,
    dictg6, dictg7, dictg8, dictg9, dictg10,
    dictg11, dictg12, dictg13, dictg14, dictg15,
    # mESC plates
    dicte1, dicte2, dicte3, dicte4, dicte5, dicte6,
    dicte7, dicte8, dicte9, dicte10, dicte11, dicte12,
]

In [7]:
# Flatten all per-plate dictionaries into one lookup; if a key occurs in
# several dictionaries, the value from the later dictionary wins.
finaldictionary = {
    oldName: newName
    for plateDict in alldicts
    for oldName, newName in plateDict.items()
}
    

## Import and process all dataframes

Define the path to each plate's count table, relative to `pathToData`.

In [8]:
# Count-table locations, one per plate. readInTransPlate appends each string
# to pathToData, so all of them are relative to that directory.

# Gastruloid plates 1-15.
pathg1 = 'LS-gastruloids-plate1-con6-D2/count_table_deduplicated.csv'
pathg2 = 'LS-gastruloids-plate2-con6-C1/count_table_deduplicated.csv'
pathg3 = 'LS-gastruloids-plate3-con6-F2/count_table_deduplicated.csv'
pathg4 = 'LS-gastruloids-plate4-con5-B3/count_table_deduplicated.csv'
pathg5 = 'LS-gastruloids-plate5-con5-C3/count_table_deduplicated.csv'
pathg6 = 'LS-gastruloids-plate6-con5-D2/count_table_deduplicated.csv'
pathg7 = 'LS-gastruloids-plate7-con5-F1/count_table_deduplicated.csv'
pathg8 = 'LS-gastruloids-plate8-con5-C2/count_table_deduplicated.csv'
pathg9 = 'LS-gastruloids-plate9-con1-A1/count_table_deduplicated.csv'
pathg10 = 'LS-gastruloids-plate10-con1-B2/count_table_deduplicated.csv'
pathg11 = 'LS-gastruloids-plate11-con1-C1/count_table_deduplicated.csv'
pathg12 = 'LS-gastruloids-plate12-con1-H3/count_table_deduplicated.csv'
pathg13 = 'LS-gastruloids-plate13-con1-F4/count_table_deduplicated.csv'
pathg14 = 'LS-gastruloids-plate14-con6-G2/count_table_deduplicated.csv'
pathg15 = 'LS-gastruloids-plate15-con6-H2/count_table_deduplicated.csv'

# mESC plates 1-12; the leading '../' steps out of pathToData into a sibling directory.
pathe1 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate1-con1/count_table_deduplicated.csv'
pathe2 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate2-con1/count_table_deduplicated.csv'
pathe3 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate3-con2/count_table_deduplicated.csv'
pathe4 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate4-con2/count_table_deduplicated.csv'
pathe5 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate5-con3/count_table_deduplicated.csv'
pathe6 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate6-con3/count_table_deduplicated.csv'
pathe7 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate7-con4/count_table_deduplicated.csv'
pathe8 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate8-con4/count_table_deduplicated.csv'
pathe9 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate9-con5/count_table_deduplicated.csv'
pathe10 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate10-con5/count_table_deduplicated.csv'
pathe11 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate11-con6/count_table_deduplicated.csv'
pathe12 = '../2022_Exp_Marloes/cluster/cs2/mESC/processed/MB-mESCs-plate12-con6/count_table_deduplicated.csv'

Read in and process all plates.

In [9]:
# Load every plate. The second argument is the short plate name that replaces
# the sample prefix in the column names; it must match the plate name used
# when the renaming dictionaries were built, otherwise no column is renamed.

# Gastruloid plates.
g1 = readInTransPlate(pathg1, 'g1', finaldictionary)
g2 = readInTransPlate(pathg2, 'g2', finaldictionary)
g3 = readInTransPlate(pathg3, 'g3', finaldictionary)
g4 = readInTransPlate(pathg4, 'g4', finaldictionary)
g5 = readInTransPlate(pathg5, 'g5', finaldictionary)
g6 = readInTransPlate(pathg6, 'g6', finaldictionary)
g7 = readInTransPlate(pathg7, 'g7', finaldictionary)
g8 = readInTransPlate(pathg8, 'g8', finaldictionary)
g9 = readInTransPlate(pathg9, 'g9', finaldictionary)
g10 = readInTransPlate(pathg10, 'g10', finaldictionary)
g11 = readInTransPlate(pathg11, 'g11', finaldictionary)
g12 = readInTransPlate(pathg12, 'g12', finaldictionary)
g13 = readInTransPlate(pathg13, 'g13', finaldictionary)
g14 = readInTransPlate(pathg14, 'g14', finaldictionary)
g15 = readInTransPlate(pathg15, 'g15', finaldictionary)

# mESC plates (variables are prefixed 'pe', plate names 'e').
pe1 = readInTransPlate(pathe1, 'e1', finaldictionary)
pe2 = readInTransPlate(pathe2, 'e2', finaldictionary)
pe3 = readInTransPlate(pathe3, 'e3', finaldictionary)
pe4 = readInTransPlate(pathe4, 'e4', finaldictionary)
pe5 = readInTransPlate(pathe5, 'e5', finaldictionary)
pe6 = readInTransPlate(pathe6, 'e6', finaldictionary)
pe7 = readInTransPlate(pathe7, 'e7', finaldictionary)
pe8 = readInTransPlate(pathe8, 'e8', finaldictionary)
pe9 = readInTransPlate(pathe9, 'e9', finaldictionary)
pe10 = readInTransPlate(pathe10, 'e10', finaldictionary)
pe11 = readInTransPlate(pathe11, 'e11', finaldictionary)
pe12 = readInTransPlate(pathe12, 'e12', finaldictionary)

### Merge dataframes

In [10]:
#all
ESC = [pe1,pe2,pe3,pe4,pe5,pe6,pe7,pe8,pe9,pe10,pe11,pe12]
gastr = [g1,g2,g3,g4,g5,g6,g7,g8,g9,g10,g11,g12,g13,g14,g15]

ESC_trans = ESC[0].join(ESC[1:],how='outer')
gastr_trans = gastr[0].join(gastr[1:],how='outer')

# Saving full dataframe

In [12]:
# Write both merged tables as pickles under pathToData; pandas infers gzip
# compression from the '.gz' suffix.
# NOTE(review): the 'dataframes/' directory presumably has to exist already - confirm.
ESC_trans.to_pickle(pathToData + 'dataframes/20220217_all_mESCs.pickle.gz')
gastr_trans.to_pickle(pathToData + 'dataframes/20201210_Lianne_gastr.pickle.gz')