In [1]:
from SigProfilerExtractor import sigpro as sig
import pandas as pd
from Bio.Seq import Seq

# One-time setup, left disabled: uncomment to install the GRCh38 reference
# through SigProfilerMatrixGenerator.
#from SigProfilerMatrixGenerator import install as genInstall
#genInstall.install("GRCh38")

# Example "matrix" input shipped with SigProfilerExtractor; the same value is
# kept under both names used in this notebook.
data = path_to_example_table = sig.importdata("matrix")
# `data` can be supplied as the "project" (input data) argument of sig.sigProfilerExtractor.

In [10]:
from os import rename


# Complement table for IUPAC nucleotide codes (both cases); characters that are
# not listed are passed through unchanged by str.translate.
_COMPLEMENT = str.maketrans(
    "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
    "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
)


def convert_mutyper_mutation(mutation):
    """Convert a MuTyper 3-mer mutation to a SigProfiler mutation.

    The input is a 7-character label ``"XYZ>XWZ"`` (ancestral 3-mer, ``>``,
    derived 3-mer). The output is ``"X[Y>W]Z"``, where the reference base
    ``Y`` is a pyrimidine: when the centre base of the ancestral 3-mer is a
    purine (``A`` or ``G``), both 3-mers are reverse-complemented first.

    Args:
        mutation: MuTyper mutation label, e.g. ``"ACA>ATA"``.

    Returns:
        The label in SigProfiler notation as a ``str``, e.g. ``"A[C>T]A"``.

    Raises:
        ValueError: if ``mutation`` is not of the form ``"XYZ>XWZ"``.
    """
    # Fail loudly on anything that is not "3-mer > 3-mer"; the positional
    # slicing below would otherwise silently build a wrong label.
    if len(mutation) != 7 or mutation[3] != ">":
        raise ValueError(
            f"Expected a MuTyper 3-mer mutation like 'ACA>ATA', got {mutation!r}"
        )

    ancestral = mutation[0:3]
    derived = mutation[4:7]

    # Purine-centred contexts are reported on the opposite strand so the
    # reference base becomes T (from A) or C (from G).
    if ancestral[1] in ("A", "G"):
        ancestral = ancestral.translate(_COMPLEMENT)[::-1]
        derived = derived.translate(_COMPLEMENT)[::-1]

    c_base = ancestral[1]
    m_base = derived[1]
    return f"{ancestral[0]}[{c_base}>{m_base}]{ancestral[2]}"

def read_mutyper_file(mutyper_path, tag=None, write=True):
    """Reshape a MuTyper spectra table into a SigProfiler-style matrix file.

    The tab-separated input is read with its ``sample`` column as the index
    and transposed, so each sample becomes a column and each mutation type a
    row. A leading ``Mutation Types`` column is built by passing every row
    label through ``convert_mutyper_mutation``; rows are sorted by it.

    Args:
        mutyper_path: Path of the tab-separated MuTyper spectra file.
        tag: Optional suffix; when given, every sample column is renamed to
            ``<sample>_<tag>``.
        write: When true, the matrix is written tab-separated (without the
            index) to ``<mutyper_path>.cosmic.txt``.

    Returns:
        The output path ``<mutyper_path>.cosmic.txt``. It is returned even
        when ``write`` is false, in which case this call has not created it.
    """
    out_path = f"{mutyper_path}.cosmic.txt"

    # Samples are rows in the input; flip so they become columns.
    spectra = pd.read_csv(mutyper_path, sep="\t").set_index("sample").T
    if tag is not None:
        spectra.columns = [f"{col}_{tag}" for col in spectra.columns]

    # After the transpose, the row labels are the MuTyper mutation strings.
    mutation_types = spectra.index.to_series().apply(convert_mutyper_mutation)
    spectra.insert(0, "Mutation Types", mutation_types)

    matrix = spectra.sort_values("Mutation Types").reset_index(drop=True)
    if write:
        matrix.to_csv(out_path, sep="\t", index=False)
    return out_path

# Convert each stratified MuTyper spectra table, in the same order as before,
# and keep the path of every converted matrix under its own name.
unique, sd, sd_no_igc, igc = [
    read_mutyper_file(spectra_file, tag=column_tag)
    for spectra_file, column_tag in (
        ("../data/mutyper-results-no-igc/spectra/stratify/Unique_spectra.txt", "Unique"),
        ("../data/mutyper-results-no-igc/spectra/stratify/SD_spectra.txt", "SD"),
        ("../data/mutyper-results-no-igc/spectra/stratify/SDnoIGC_spectra.txt", "SD_no_IGC"),
        ("../data/mutyper-results-no-igc/spectra/stratify/SDIGC_spectra.txt", "IGC"),
    )
]

# `mutyper_path` stays bound to the last input that was processed.
mutyper_path = "../data/mutyper-results-no-igc/spectra/stratify/SDIGC_spectra.txt"


In [13]:
# Show the first five rows of the converted SD matrix
# (pass `unique` instead of `sd` to inspect the Unique matrix).
print(pd.read_csv(sd, sep="\t").iloc[:5])


  Mutation Types  HG002_SD  HG00438_SD  HG005_SD  HG00621_SD  HG00673_SD  \
0        A[C>A]A      2959        2980      2871        2788        2819   
1        A[C>A]C      2873        2715      2700        2697        2699   
2        A[C>A]G       875         853       843         881         872   
3        A[C>A]T      1524        1484      1476        1479        1475   
4        A[C>G]A      2812        2749      2634        2659        2732   

   HG00733_SD  HG00735_SD  HG00741_SD  HG01071_SD  ...  HG03579_SD  \
0        2972        3046        2855        2889  ...        2903   
1        2828        2850        2756        2732  ...        2770   
2         844         905         852         842  ...         831   
3        1495        1529        1472        1511  ...        1543   
4        2889        2886        2806        2657  ...        2747   

   NA18906_SD  NA19240_SD  NA20129_SD  NA21309_SD  CHM1_SD  GRCh38_SD  \
0        2922        2930        2862        2807

In [21]:

# Set to True to launch the SigProfilerExtractor runs below; while False the
# loop only prints a small preview of each converted matrix.
run_extraction = False

cosmic_tables = {"Unique":unique, "SD":sd, "SD-no-IGC":sd_no_igc, "IGC":igc}

# Settings shared by every extraction run.
extractor_options = dict(
    reference_genome="GRCh38",
    minimum_signatures=1,
    maximum_signatures=5,
    nmf_replicates=100,
    cpu=80,
)

for model_name, model in cosmic_tables.items():
    print(model_name)
    # First three rows of the first three sample columns (column 0 holds the mutation types).
    preview = pd.read_csv(model, sep="\t").iloc[0:3,1:4]
    print(preview)
    if not run_extraction:
        continue
    sig.sigProfilerExtractor(
        "matrix",
        f"sigprof_results/{model_name}",
        model,
        **extractor_options,
    )

Unique
   HG002_Unique  HG00438_Unique  HG005_Unique
0         55223           55398         55011
1         53046           53193         53041
2         13496           13451         13424
SD
   HG002_SD  HG00438_SD  HG005_SD
0      2959        2980      2871
1      2873        2715      2700
2       875         853       843
SD-no-IGC
   HG002_SD_no_IGC  HG00438_SD_no_IGC  HG005_SD_no_IGC
0             1941               1983             1870
1             1863               1787             1752
2              596                562              567
IGC
   HG002_IGC  HG00438_IGC  HG005_IGC
0       1018          997       1001
1       1010          928        948
2        279          291        276
