In [1]:
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the spreadsheet ./unpublished_mouse_WES.xlsx; its "bam_ID" and
# "SRA upload ID" columns are used below to build the ID renaming map.
unpublished_mouse_sheet = pd.read_excel("./unpublished_mouse_WES.xlsx")

In [3]:
# Display the loaded sheet to check the "bam_ID" / "SRA upload ID" columns.
unpublished_mouse_sheet

Unnamed: 0,Mouse ID,Seq Probe,Organ,tumor line,bam_ID,SRA upload ID
0,C38,CLL,SPL,C38,CLLC38,CLL1
1,C38,GL,GL,C38,GLC38,GERM1
2,659,CLL,SPL,C38,CLL659,CLL2
3,659,CLL,LNP,C38,CLL659LNP,CLL3
4,659,CLL,BL,C38,CLL659BL,CLL4
5,702,CLL,PC,347/20,CLL702PC,CLL5
6,Q67,CLL,Liver,C25,CLLQ67LIV,CLL6
7,Q67,CLL,LNP,C25,CLLQ67LNP,CLL7
8,1785/93,CLL,SPL,E31,CLL1785NSG,CLL8
9,1786/93,CLL,SPL,E31,CLL1786NSG,CLL9


In [4]:
# Lookup from each "bam_ID" in the sheet to its "SRA upload ID".
# If a bam_ID occurs more than once, the last row wins.
ID_renamer = dict(
    zip(
        unpublished_mouse_sheet["bam_ID"],
        unpublished_mouse_sheet["SRA upload ID"],
    )
)

In [5]:
# Display the old-ID -> new-ID mapping that the renaming helpers below rely on.
ID_renamer

{'CLLC38': 'CLL1',
 'GLC38': 'GERM1',
 'CLL659': 'CLL2',
 'CLL659LNP': 'CLL3',
 'CLL659BL': 'CLL4',
 'CLL702PC': 'CLL5',
 'CLLQ67LIV': 'CLL6',
 'CLLQ67LNP': 'CLL7',
 'CLL1785NSG': 'CLL8',
 'CLL1786NSG': 'CLL9',
 'CLLCD92PC': 'CLL10',
 'CLLC776': 'CLL11',
 'CLLC777': 'CLL12',
 'CLLC752': 'CLL13',
 'CLLC755': 'CLL14'}

In [6]:
# Old line_ID -> new line label lookup, consumed by exact_line_replacer.
line_renamer = {"GLC38": "GL1", "GERM_C25": "GL2", "GERM_E31": "GL3", "GERM_347": "GL4"}

In [7]:
# Load the semicolon-separated BAM list; its bam_path / bai_path columns are
# rewritten to relative paths further down.
absolute_bamlist = pd.read_csv("./bamlist_Maria.csv", sep=";")

In [8]:
# Display the BAM list as loaded (paths are still the original ones).
absolute_bamlist

Unnamed: 0,bam_ID,line_ID,flowcell_ID,library_ID,bam_path,bai_path
0,CLL_212,GERM_212,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_212.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_212.so...
1,CLL_221,GERM_221,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_221.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_221.so...
2,CLL_347,GERM_347,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_347.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_347.so...
3,CLL_C25,GERM_C25,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_C25.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_C25.so...
4,CLL_D22,GERM_D22,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_D22.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_D22.so...
5,CLL_E31,GERM_E31,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_E31.so...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_E31.so...
6,CLL_F3,GERM_F3,4,2,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_F3.sor...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL_F3.sor...
7,CLL111TCL,GL111TCL,1,1,/limcr-ngs/Marc/mouse_seq_data/bams/CLL111TCL....,/limcr-ngs/Marc/mouse_seq_data/bams/CLL111TCL....
8,CLL121TX,GL82TCL,1,1,/limcr-ngs/Marc/mouse_seq_data/bams/CLL121TX.s...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL121TX.s...
9,CLL122TX,GL82TCL,1,1,/limcr-ngs/Marc/mouse_seq_data/bams/CLL122TX.s...,/limcr-ngs/Marc/mouse_seq_data/bams/CLL122TX.s...


In [9]:
relative_bamlist = absolute_bamlist.copy()


def _relative_renamed_bam_path(path):
    """Return the relative, renamed location of a BAM/BAI file.

    The directory part of ``path`` is dropped and replaced by
    ``./04_processed_bam/``. The ID is the part of the file name before its
    first "."; if that ID is a key of ``ID_renamer`` it is swapped for the
    mapped value, otherwise the file name is kept unchanged.

    Only the leading ID is rewritten. A plain ``str.replace`` on the whole
    path would also rewrite any later occurrence of the same text.

    Each ID is looked up once, so renames are not chained (a new ID is never
    renamed again even if it is also a key of ``ID_renamer``).

    Parameters
    ----------
    path : str
        Path (or bare file name) of the BAM/BAI file; "/" is the separator.

    Returns
    -------
    str
        ``"./04_processed_bam/" + <possibly renamed file name>``.
    """
    filename = path.split("/")[-1]
    # partition keeps the "." and the remainder so the name can be re-assembled
    # exactly, including file names that contain no "." at all.
    whole_ID, dot, remainder = filename.partition(".")
    return "./04_processed_bam/" + ID_renamer.get(whole_ID, whole_ID) + dot + remainder


# One pass per column instead of one pass per (column, renamer entry) pair.
for path_col in ["bam_path", "bai_path"]:
    relative_bamlist[path_col] = relative_bamlist[path_col].apply(_relative_renamed_bam_path)

        
def exact_ID_replacer(ID):
    """Return the replacement for ``ID`` from ``ID_renamer``, or ``ID`` itself.

    Only exact matches are renamed: an ID that merely contains, or is
    contained in, a key of ``ID_renamer`` is returned unchanged.

    Parameters
    ----------
    ID : hashable
        The ID to look up (a string in this notebook).

    Returns
    -------
    The mapped new ID if ``ID`` is a key of ``ID_renamer``, otherwise ``ID``.
    """
    # A direct dictionary lookup replaces the former scan over every entry.
    return ID_renamer.get(ID, ID)

def exact_line_replacer(line_ID):
    """Return the replacement for ``line_ID`` from ``line_renamer``, or ``line_ID`` itself.

    Only exact matches are renamed; any value that is not a key of
    ``line_renamer`` is returned unchanged.

    Parameters
    ----------
    line_ID : hashable
        The line ID to look up (a string in this notebook).

    Returns
    -------
    The mapped new label if ``line_ID`` is a key of ``line_renamer``,
    otherwise ``line_ID``.
    """
    # A direct dictionary lookup replaces the former scan over every entry.
    return line_renamer.get(line_ID, line_ID)

# Swap every bam_ID that is a key of ID_renamer for its new ID.
relative_bamlist["bam_ID"] = relative_bamlist["bam_ID"].map(exact_ID_replacer)
# Explicit line renaming is left disabled; line_ID is label-encoded below instead.
# relative_bamlist["line_ID"] = relative_bamlist["line_ID"].apply(exact_line_replacer)

# Replace each line_ID by "line_<n>", where <n> is the integer code that the
# fitted LabelEncoder assigns to that line_ID. All rows are encoded in a single
# transform call.
line_encoder = LabelEncoder().fit(relative_bamlist["line_ID"])
line_codes = line_encoder.transform(relative_bamlist["line_ID"])
relative_bamlist["line_ID"] = [f"line_{code}" for code in line_codes]

In [10]:
# Check the result: bam_path / bai_path should now start with
# ./04_processed_bam/ and line_ID should read "line_<n>".
relative_bamlist

Unnamed: 0,bam_ID,line_ID,flowcell_ID,library_ID,bam_path,bai_path
0,CLL_212,line_0,4,2,./04_processed_bam/CLL_212.sorted.marked.reali...,./04_processed_bam/CLL_212.sorted.marked.reali...
1,CLL_221,line_1,4,2,./04_processed_bam/CLL_221.sorted.marked.reali...,./04_processed_bam/CLL_221.sorted.marked.reali...
2,CLL_347,line_2,4,2,./04_processed_bam/CLL_347.sorted.marked.reali...,./04_processed_bam/CLL_347.sorted.marked.reali...
3,CLL_C25,line_3,4,2,./04_processed_bam/CLL_C25.sorted.marked.reali...,./04_processed_bam/CLL_C25.sorted.marked.reali...
4,CLL_D22,line_4,4,2,./04_processed_bam/CLL_D22.sorted.marked.reali...,./04_processed_bam/CLL_D22.sorted.marked.reali...
5,CLL_E31,line_5,4,2,./04_processed_bam/CLL_E31.sorted.marked.reali...,./04_processed_bam/CLL_E31.sorted.marked.reali...
6,CLL_F3,line_6,4,2,./04_processed_bam/CLL_F3.sorted.marked.realig...,./04_processed_bam/CLL_F3.sorted.marked.realig...
7,CLL111TCL,line_7,1,1,./04_processed_bam/CLL111TCL.sorted.marked.rea...,./04_processed_bam/CLL111TCL.sorted.marked.rea...
8,CLL121TX,line_8,1,1,./04_processed_bam/CLL121TX.sorted.marked.real...,./04_processed_bam/CLL121TX.sorted.marked.real...
9,CLL122TX,line_8,1,1,./04_processed_bam/CLL122TX.sorted.marked.real...,./04_processed_bam/CLL122TX.sorted.marked.real...


In [11]:
# Read the four columns used below as text, then discard every row that has a
# missing value in any column.
# The dtype keys must match the CSV header exactly (column names are
# case-sensitive). The first column is "GL_bam_ID"; the previous key
# "GL_BAM_ID" matched no column, so the requested str dtype was never applied.
absolute_annovarlist = pd.read_csv(
    "annovarlist_Maria.csv",
    sep=";",
    dtype={"GL_bam_ID": str, "CL_bam_ID": str, "annovar_path": str, "snplist_path": str},
).dropna(how="any", axis=0)

In [12]:
# Display the annovar list as loaded (rows with missing values already dropped).
absolute_annovarlist

Unnamed: 0,GL_bam_ID,CL_bam_ID,annovar_path,snplist_path
0,GERM_212,CLL_212,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_21...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_2...
1,GERM_221,CLL_221,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_22...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_2...
2,GERM_347,CLL_347,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_34...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_3...
3,GERM_347,CLL702PC,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_34...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_3...
4,GERM_347,CLL702,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_34...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_3...
5,GERM_347,CLL703,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_34...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_3...
6,GERM_C25,CLL642,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_C2...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_C...
7,GERM_C25,CLL_C25,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_C2...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_C...
8,GERM_C25,CLLC752,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_C2...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_C...
9,GERM_C25,CLLC755,/limcr-ngs/Marc/mouse_seq_data/annovar/GERM_C2...,/limcr-ngs/Marc/mouse_seq_data/snplists/GERM_C...


In [13]:
relative_annovarlist = absolute_annovarlist.copy()

# Reduce each path to its file name and prefix it with the relative directory
# that belongs to that column.
for path_col, relative_dir in (("annovar_path", "./07_annovar/"), ("snplist_path", "./snplists/")):
    file_names = relative_annovarlist[path_col].apply(lambda full_path: full_path.split("/")[-1])
    relative_annovarlist[path_col] = relative_dir + file_names

# Apply the exact-match ID renaming to both ID columns. The IDs embedded in the
# file names of the two path columns are not touched here.
for id_col in ("GL_bam_ID", "CL_bam_ID"):
    relative_annovarlist[id_col] = relative_annovarlist[id_col].apply(exact_ID_replacer)

In [14]:
# Intermediate check: the ID columns are renamed at this point, while the file
# names inside annovar_path / snplist_path still carry the old IDs.
relative_annovarlist

Unnamed: 0,GL_bam_ID,CL_bam_ID,annovar_path,snplist_path
0,GERM_212,CLL_212,./07_annovar/GERM_212vsCLL_212.VarScan.snp_ind...,./snplists/GERM_212vsCLL_212.xlsx
1,GERM_221,CLL_221,./07_annovar/GERM_221vsCLL_221.VarScan.snp_ind...,./snplists/GERM_221vsCLL_221.xlsx
2,GERM_347,CLL_347,./07_annovar/GERM_347vsCLL_347.VarScan.snp_ind...,./snplists/GERM_347vsCLL_347.xlsx
3,GERM_347,CLL5,./07_annovar/GERM_347vsCLL702PC.VarScan.snp_in...,./snplists/GERM_347vsCLL702PC.xlsx
4,GERM_347,CLL702,./07_annovar/GERM_347vsCLL702.VarScan.snp_inde...,./snplists/GERM_347vsCLL702.xlsx
5,GERM_347,CLL703,./07_annovar/GERM_347vsCLL703.VarScan.snp_inde...,./snplists/GERM_347vsCLL703.xlsx
6,GERM_C25,CLL642,./07_annovar/GERM_C25vsCLL642.VarScan.snp_inde...,./snplists/GERM_C25vsCLL642.xlsx
7,GERM_C25,CLL_C25,./07_annovar/GERM_C25vsCLL_C25.VarScan.snp_ind...,./snplists/GERM_C25vsCLL_C25.xlsx
8,GERM_C25,CLL13,./07_annovar/GERM_C25vsCLLC752.VarScan.snp_ind...,./snplists/GERM_C25vsCLLC752.xlsx
9,GERM_C25,CLL14,./07_annovar/GERM_C25vsCLLC755.VarScan.snp_ind...,./snplists/GERM_C25vsCLLC755.xlsx


In [15]:
def exact_path_replacer(path):
    """Rename both IDs in a ``<dir>/<ID1>vs<ID2>.<suffix>`` path via ``ID_renamer``.

    The file name (text after the last "/") is cut at its first "." and the
    leading part is split on "vs" into two IDs. Each ID that is a key of
    ``ID_renamer`` is swapped for its mapped value; everything else in the
    path (directory, "vs", suffix) is reproduced unchanged.

    The file name is rebuilt from its parts rather than edited with
    ``str.replace`` on the whole path. ``str.replace`` would also rewrite the
    other ID whenever one ID is a substring of the other (e.g. "CLL659"
    inside "CLL659LNP").

    Each ID is looked up once, so renames are not chained.

    Parameters
    ----------
    path : str
        Path whose file name has the form ``<ID1>vs<ID2>[.<suffix>]``.

    Returns
    -------
    str
        The path with both IDs renamed where applicable.

    Raises
    ------
    ValueError
        If ``path`` is None, or if the leading part of the file name does not
        split into exactly two IDs on "vs".
    """
    if path is None:
        raise ValueError("Path is None!")

    # rpartition / partition keep the separators, so the path can be rebuilt
    # exactly even when it has no "/" or no ".".
    directory, slash, filename = path.rpartition("/")
    ID_vs_ID, dot, suffix = filename.partition(".")

    IDs = ID_vs_ID.split("vs")
    if len(IDs) != 2:
        raise ValueError(f"Expected a '<ID1>vs<ID2>' file name, got: {path!r}")
    ID1, ID2 = IDs

    renamed = ID_renamer.get(ID1, ID1) + "vs" + ID_renamer.get(ID2, ID2)
    return directory + slash + renamed + dot + suffix


# One pass per column; every ID is resolved with a single dictionary lookup.
for path_col in ["annovar_path", "snplist_path"]:
    relative_annovarlist[path_col] = relative_annovarlist[path_col].apply(exact_path_replacer)

In [16]:
# Check the result: the file names in annovar_path / snplist_path should now
# use the same renamed IDs as the GL_bam_ID / CL_bam_ID columns.
relative_annovarlist

Unnamed: 0,GL_bam_ID,CL_bam_ID,annovar_path,snplist_path
0,GERM_212,CLL_212,./07_annovar/GERM_212vsCLL_212.VarScan.snp_ind...,./snplists/GERM_212vsCLL_212.xlsx
1,GERM_221,CLL_221,./07_annovar/GERM_221vsCLL_221.VarScan.snp_ind...,./snplists/GERM_221vsCLL_221.xlsx
2,GERM_347,CLL_347,./07_annovar/GERM_347vsCLL_347.VarScan.snp_ind...,./snplists/GERM_347vsCLL_347.xlsx
3,GERM_347,CLL5,./07_annovar/GERM_347vsCLL5.VarScan.snp_indel....,./snplists/GERM_347vsCLL5.xlsx
4,GERM_347,CLL702,./07_annovar/GERM_347vsCLL702.VarScan.snp_inde...,./snplists/GERM_347vsCLL702.xlsx
5,GERM_347,CLL703,./07_annovar/GERM_347vsCLL703.VarScan.snp_inde...,./snplists/GERM_347vsCLL703.xlsx
6,GERM_C25,CLL642,./07_annovar/GERM_C25vsCLL642.VarScan.snp_inde...,./snplists/GERM_C25vsCLL642.xlsx
7,GERM_C25,CLL_C25,./07_annovar/GERM_C25vsCLL_C25.VarScan.snp_ind...,./snplists/GERM_C25vsCLL_C25.xlsx
8,GERM_C25,CLL13,./07_annovar/GERM_C25vsCLL13.VarScan.snp_indel...,./snplists/GERM_C25vsCLL13.xlsx
9,GERM_C25,CLL14,./07_annovar/GERM_C25vsCLL14.VarScan.snp_indel...,./snplists/GERM_C25vsCLL14.xlsx


In [17]:
# Write the relative BAM list as a semicolon-separated file. The row index is
# written as well, because index=False is not passed.
relative_bamlist.to_csv("../input_data/in_facility/relative_bamlist.csv", sep=";")

In [18]:
# Write the relative annovar list as a semicolon-separated file. The row index
# is written as well, because index=False is not passed.
relative_annovarlist.to_csv("../input_data/in_facility/relative_annovarlist.csv", sep=";")

In [19]:
# Load the semicolon-separated BAM list from bamlist_kotani_old.csv and set
# flowcell_ID to 1 for every row.
kotani_absolute_bamlist = pd.read_csv("bamlist_kotani_old.csv", sep=";").assign(flowcell_ID=1)

In [20]:
def path_fixer_kotani(path):
    """Return ``path`` reduced to its file name and placed under ``./04_processed_bam/``.

    Everything up to and including the last "/" of ``path`` is discarded; a
    path without any "/" is treated as a bare file name.
    """
    _, _, basename = path.rpartition("/")
    return f"./04_processed_bam/{basename}"

# Rewrite both path columns so that they point into ./04_processed_bam/.
for kotani_path_col in ("bam_path", "bai_path"):
    kotani_absolute_bamlist[kotani_path_col] = kotani_absolute_bamlist[kotani_path_col].map(path_fixer_kotani)

In [21]:
# Write the Kotani BAM list (with relative paths) to disk.
# NOTE(review): unlike the two in_facility exports, no sep=";" is passed here,
# so this file is comma-separated -- confirm that downstream readers expect that.
kotani_absolute_bamlist.to_csv("../input_data/kotani/relative_bamlist.csv")

In [22]:
# Display the Kotani BAM list in the state that was exported in the previous cell.
kotani_absolute_bamlist

Unnamed: 0,bam_ID,line_ID,flowcell_ID,library_ID,bam_path,bai_path
0,B1-4-9_tail,B,1,1,./04_processed_bam/B1-4-9_tail.realigned.recal...,./04_processed_bam/B1-4-9_tail.realigned.recal...
1,A1-2-2_tail,A,1,1,./04_processed_bam/A1-2-2_tail.realigned.recal...,./04_processed_bam/A1-2-2_tail.realigned.recal...
2,D1-2-2_tail,D,1,1,./04_processed_bam/D1-2-2_tail.realigned.recal...,./04_processed_bam/D1-2-2_tail.realigned.recal...
3,B1-4-1_tail,B,1,1,./04_processed_bam/B1-4-1_tail.realigned.recal...,./04_processed_bam/B1-4-1_tail.realigned.recal...
4,B1-4-5_BM,B,1,1,./04_processed_bam/B1-4-5_BM.realigned.recalBa...,./04_processed_bam/B1-4-5_BM.realigned.recalBa...
...,...,...,...,...,...,...
79,A1-3-2_tail,A,1,1,./04_processed_bam/A1-3-2_tail.realigned.recal...,./04_processed_bam/A1-3-2_tail.realigned.recal...
80,B1-3-4_tail,B,1,1,./04_processed_bam/B1-3-4_tail.realigned.recal...,./04_processed_bam/B1-3-4_tail.realigned.recal...
81,A1-2-3_BM,A,1,1,./04_processed_bam/A1-2-3_BM.realigned.recalBa...,./04_processed_bam/A1-2-3_BM.realigned.recalBa...
82,B1-1-1_BM,B,1,1,./04_processed_bam/B1-1-1_BM.realigned.recalBa...,./04_processed_bam/B1-1-1_BM.realigned.recalBa...
