# Import dependencies

In [1]:
import sys, os
from pandas.io.parsers import read_csv
import numpy as np
import pandas as pd
import collections
import scipy

# FACS 25/07 and 01/08 2018

Overview of gastruloids in this dataset:

gastruloid name| Plates
---|---
C5A4 | A1: 1-5
C5C3 | A1: 6-120, 131-144
C5F2 | A1: 121-130, 145-384; B1: 1-24, 45-48
C5B4 | B1: 25-44, 49-216, 240
C5H10| B1: 232-239
C5G5 | B1: 217-231, 241-384

## First: make a dictionary of all gastruloid names

We can use this dictionary to change all names correctly. Names should be as follows: plateName_cellNumber.gastruloidName. 
We need to make a dictionary to correctly assign all cells to the correct gastruloids: some plates contain cells from multiple gastruloids.

In [2]:
# Cell numbers on a plate run from 1 to 384; `upper` is exclusive, so a full plate is (1, 385).
def gastruloidDictionary(plateName, gastruloidName, lower, upper):
    """Build the old-name -> new-name mapping for a run of cells on one plate.

    For every cell number in range(lower, upper) the key is
    'plateName_cellNumber' and the value is
    'plateName_cellNumber.gastruloidName'.
    """
    # Generated lazily so that an empty range touches none of the arguments.
    cellNames = (plateName + '_' + str(number) for number in range(lower, upper))
    return {name: name + '.' + gastruloidName for name in cellNames}

Then we just have to add the plates that contain multiple gastruloids.

In [3]:
# Plates A1 and B1 are currently disabled; their layout is left commented out.
# plate A1
#dictA1 = gastruloidDictionary('A1','C5A4',1,6)
#dictA1.update(gastruloidDictionary('A1','C5C3',6,121))
#dictA1.update(gastruloidDictionary('A1','C5C3',131,145))
#dictA1.update(gastruloidDictionary('A1','C5F2',121,131))
#dictA1.update(gastruloidDictionary('A1','C5F2',145,385))

# plate B1
#dictB1 = gastruloidDictionary('B1','C5F2',1,25)
#dictB1.update(gastruloidDictionary('B1','C5F2',45,49))
#dictB1.update(gastruloidDictionary('B1','C5B4',25,45))
#dictB1.update(gastruloidDictionary('B1','C5B4',49,217))
#dictB1.update(gastruloidDictionary('B1','C5B4',240,241))
#dictB1.update(gastruloidDictionary('B1','C5H10',232,240))
#dictB1.update(gastruloidDictionary('B1','C5G5',217,232))
#dictB1.update(gastruloidDictionary('B1','C5G5',241,385))

# Each plate dictionary is the union of its per-gastruloid cell ranges
# (upper bounds are exclusive).

# plate A2
dictA2 = {
    **gastruloidDictionary('A2', 'C5A9', 1, 261),
    **gastruloidDictionary('A2', 'C5B10', 261, 385),
}

# plate B2
dictB2 = {
    **gastruloidDictionary('B2', 'C5B10', 1, 217),
    **gastruloidDictionary('B2', 'C5B10', 230, 241),
    **gastruloidDictionary('B2', 'C5C6', 217, 230),
    **gastruloidDictionary('B2', 'C5C6', 241, 385),
}

# plate C2
dictC2 = {
    **gastruloidDictionary('C2', 'C5C6', 1, 169),
    **gastruloidDictionary('C2', 'C5C6', 188, 193),
    **gastruloidDictionary('C2', 'C5H7', 169, 188),
    **gastruloidDictionary('C2', 'C5H7', 193, 385),
}

# plate D2
dictD2 = {
    **gastruloidDictionary('D2', 'C5H7', 1, 151),
    **gastruloidDictionary('D2', 'C5H9', 151, 385),
}

# plate E2
dictE2 = {
    **gastruloidDictionary('E2', 'C5H9', 1, 109),
    **gastruloidDictionary('E2', 'C5H11', 109, 385),
}

# plate F2
dictF2 = {
    **gastruloidDictionary('F2', 'C5H11', 1, 217),
    **gastruloidDictionary('F2', 'C5H11', 235, 241),
    **gastruloidDictionary('F2', 'C5G7', 217, 235),
    **gastruloidDictionary('F2', 'C5G7', 241, 385),
}

# plate G2 (a single gastruloid fills the whole plate)
dictG2 = gastruloidDictionary('G2', 'C5G7', 1, 385)

# plate H2 (a single gastruloid fills the whole plate)
dictH2 = gastruloidDictionary('H2', 'C5F10', 1, 385)

Now we can add all dictionaries together in one final dictionary.

In [4]:
# Plates A1 and B1 are not included: their dictionaries are commented out.
alldicts = [
    dictA2, dictB2, dictC2, dictD2,
    dictE2, dictF2, dictG2, dictH2,
]


In [5]:
# Flatten the per-plate dictionaries into one old-name -> new-name lookup;
# if a key occurred in several plates, the later plate would win.
finaldictionary = {
    oldname: newname
    for plateDict in alldicts
    for oldname, newname in plateDict.items()
}
    

## Import and process all dataframes

First define the path to the directory that contains all .csv files; the function below uses it to retrieve the files.

In [6]:
# Directory that holds every plate's count table. It can be overridden with the
# SCAR_DATA_DIR environment variable; otherwise the original project location is used.
# os.path.join(..., '') guarantees a trailing separator, which is required because
# file paths are appended to this value by plain string concatenation.
pathToData = os.path.join(
    os.environ.get(
        'SCAR_DATA_DIR',
        '/Users/m.blotenburg/Documents/Projects/Mouse_Scartrace/Data_analysis/Scar_analysis/20200324_VAN2988_remap_BWAfilters-alleleCalling/',
    ),
    '',
)

In [7]:
def readInScarPlate(pathToFile, plateName, dictGastruloidNames, dataDir=None):
    """Load one plate's count table and label its columns by (gastruloid, cell).

    Parameters
    ----------
    pathToFile : str
        Location of the .csv file; it is appended to the data directory by
        plain string concatenation.
    plateName : str
        Short plate label (e.g. 'A2') that replaces the prefix of every
        column name (everything up to the last '_' of the first column).
    dictGastruloidNames : dict
        Maps 'plateName_cellNumber' to 'plateName_cellNumber.gastruloidName'.
    dataDir : str, optional
        Directory prefix to read from. Defaults to the notebook-level
        ``pathToData`` so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by (chrom, allele, site, scar); columns are a two-level
        MultiIndex (gastruloid, cell), sorted.

    Raises
    ------
    IndexError
        If a column name contains no '.' after renaming, i.e. the cell is
        missing from ``dictGastruloidNames``.
    """
    baseDir = pathToData if dataDir is None else dataDir
    scarPlate = read_csv((baseDir + pathToFile),sep = ',', low_memory=False, index_col=(0,1,2,3))
    # NOTE(review): the index has four columns, so every idx is a tuple and the
    # comparison with the string 'Unknown' looks always true — confirm whether a
    # single index level was meant to be filtered here.
    scarPlate = scarPlate.loc[[idx for idx in scarPlate.index if 'Unknown'!=idx]]
    # Drop the first remaining row. NOTE(review): presumably a non-data header
    # row of the count table — confirm against the .csv layout.
    scarPlate = scarPlate.iloc[1:]

    # Swap the shared column prefix for the short plate name. regex=False makes
    # the prefix a literal string on every pandas version (the default was a
    # regular expression before pandas 2.0), so characters such as '.' or '+'
    # in a library name cannot change the match.
    libraryPrefix = scarPlate.columns[0].rsplit('_',1)[0] + '_'
    scarPlate.columns = scarPlate.columns.str.replace(libraryPrefix, (plateName + '_'), regex=False)
    scarPlate.index.names = ['chrom','allele','site', 'scar']

    # Index labels are converted with str; columns become 'cell.gastruloid' names.
    scarPlate = scarPlate.rename(index=str, columns=dictGastruloidNames)
    # Split 'cell.gastruloid' into a (gastruloid, cell) column MultiIndex.
    columnParts = [x.split('.',3) for x in scarPlate.columns]
    scarPlate.columns = pd.MultiIndex.from_tuples( [ (parts[1], parts[0]) for parts in columnParts] )
    scarPlate.columns.names = ['gastruloid','cell']
    scarPlate = scarPlate[sorted(scarPlate.columns)]
    return scarPlate

Define the path of each plate's count table, relative to `pathToData`.

In [8]:
#pathA1 = 'MB-FACS2507-Scars-PlateA-C5-Miseq-SingleIndex/count_table_SQfilteredbam.csv'
#pathB1 = 'MB-FACS2507-Scars-PlateB-C5-Miseq-SingleIndex/count_table_SQfilteredbam.csv'

# The eight FACS0108 plates share one naming scheme that differs only in the plate letter.
countTableTemplate = 'MB-miseq-gastruloid-plate{}-C5-scars-FACS0108/count_table_filteredBam.csv'
(pathA2, pathB2, pathC2, pathD2,
 pathE2, pathF2, pathG2, pathH2) = (countTableTemplate.format(letter) for letter in 'ABCDEFGH')



Read in and process all plates.

In [9]:
#plateA1 = readInScarPlate(pathA1, 'A1', finaldictionary)
#plateB1 = readInScarPlate(pathB1, 'B1', finaldictionary)

# One table per plate; every plate is relabelled with the same combined dictionary.
plateA2 = readInScarPlate(pathToFile=pathA2, plateName='A2', dictGastruloidNames=finaldictionary)
plateB2 = readInScarPlate(pathToFile=pathB2, plateName='B2', dictGastruloidNames=finaldictionary)
plateC2 = readInScarPlate(pathToFile=pathC2, plateName='C2', dictGastruloidNames=finaldictionary)
plateD2 = readInScarPlate(pathToFile=pathD2, plateName='D2', dictGastruloidNames=finaldictionary)
plateE2 = readInScarPlate(pathToFile=pathE2, plateName='E2', dictGastruloidNames=finaldictionary)
plateF2 = readInScarPlate(pathToFile=pathF2, plateName='F2', dictGastruloidNames=finaldictionary)
plateG2 = readInScarPlate(pathToFile=pathG2, plateName='G2', dictGastruloidNames=finaldictionary)
plateH2 = readInScarPlate(pathToFile=pathH2, plateName='H2', dictGastruloidNames=finaldictionary)

### Merge dataframes

In [10]:
# All plates, in plate order.
dfs_all = [
    plateA2, plateB2, plateC2, plateD2,
    plateE2, plateF2, plateG2, plateH2,
]

# Outer-join the remaining plates onto the first one via their row index,
# keeping every row that occurs on any plate.
firstPlate, *otherPlates = dfs_all
all_scars = firstPlate.join(otherPlates, how='outer')

In [11]:
# Preview the first rows of the merged table.
all_scars.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gastruloid,C5A9,C5A9,C5A9,C5A9,C5A9,C5A9,C5A9,C5A9,C5A9,C5A9,...,C5F10,C5F10,C5F10,C5F10,C5F10,C5F10,C5F10,C5F10,C5F10,C5F10
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,cell,A2_1,A2_10,A2_100,A2_101,A2_102,A2_103,A2_104,A2_105,A2_106,A2_107,...,H2_90,H2_91,H2_92,H2_93,H2_94,H2_95,H2_96,H2_97,H2_98,H2_99
chrom,allele,site,scar,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
1,,12847807,WT,,,,,,,,,,,...,,,,,,,,,,
1,,12847810,WT,,,,,,,,,,,...,,,,,,,,,,
1,,12847812,WT,,,,,,,,,,,...,,,,,,,,,,
1,,143473230,143473406.D,,,,,,,,,,,...,,,,,,,,,,
1,,143473231,WT,,,,,,,,,,,...,,,,,,,,,,


# Saving full dataframe

In [12]:
# Write the merged table into the data directory as a pickle.
outputPickle = pathToData + '120hAA_miseq_remapped_filtered.pickle.gz'
all_scars.to_pickle(outputPickle)