In [1]:
import glob
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from plotActivity import plotActivity

import warnings
warnings.filterwarnings("ignore")

### All our mutagen contribution in different spectra are storaged in contribution.csv tables
do not forget to unzip raw_ouput.zip file

In [2]:
contr_data_total = pd.DataFrame()

for contr_solo in glob.glob('./data/signal/output/raw_output/*/*/sbs_signatures/1/contributions.csv'):
    
    contr_df = pd.read_csv(contr_solo)
    contr_df['Spectra_Type'] = contr_solo.split('/')[5]
    contr_df['Group_Type'] = contr_solo.split('/')[6]
    contr_data_total = pd.concat([contr_data_total, contr_df], axis=0)

contr_data_total['Contribution'] = np.round(contr_data_total['Contribution'], 3)
contr_data_total

Unnamed: 0,Signature,Contribution,Spectra_Type,Group_Type
0,ΔOGG1,0.000,high,Mammalia__Ts only
1,ΔEXO1,0.000,high,Mammalia__Ts only
2,ΔMSH6,0.000,high,Mammalia__Ts only
3,ΔPMS1,0.000,high,Mammalia__Ts only
4,ΔPMS2,597.180,high,Mammalia__Ts only
...,...,...,...,...
5,ΔRNF168,0.000,diff,Amphibia__Ts only
6,ΔUNG,397.184,diff,Amphibia__Ts only
7,ΔMLH1,0.000,diff,Amphibia__Ts only
8,ΔMSH2,0.000,diff,Amphibia__Ts only


In [3]:
contr_per = contr_data_total[contr_data_total['Contribution'] != 0].reset_index(drop=True) # delete 0 assigned signatures
contr_per['Percentage'] = contr_per.groupby(['Spectra_Type', 'Group_Type'])['Contribution'].apply(lambda x: round(100 * x / x.sum(), 3)) # count per for each signature in type of spectrum
contr_per = contr_per[['Spectra_Type', 'Group_Type', 'Signature', 'Percentage']]

contr_per

Unnamed: 0,Spectra_Type,Group_Type,Signature,Percentage
0,high,Mammalia__Ts only,ΔPMS2,49.232
1,high,Mammalia__Ts only,ΔUNG,50.768
2,high,Actinopteri__Ts only,ΔPMS2,38.196
3,high,Actinopteri__Ts only,ΔUNG,61.804
4,high,Lepidosauria__Ts only,ΔPMS2,40.991
...,...,...,...,...
75,diff,Amphibia__Ts & Tv,ΔUNG,56.481
76,diff,Amphibia__Ts & Tv,Unassigned,2.724
77,diff,Amphibia__Ts only,ΔPMS2,17.440
78,diff,Amphibia__Ts only,ΔUNG,70.050


In [4]:
contr_per['Signature'].unique()

array(['ΔPMS2', 'ΔUNG', 'Unassigned', 'ΔRNF168', 'ΔMSH6'], dtype=object)

PMS2, MSH6 - MMR ; UNG - BER; ΔRNF168 - DSB

In [5]:
set_order = ['high-Actinopteri', 'low-Actinopteri', 'diff-Actinopteri', 
             'high-Amphibia', 'low-Amphibia', 'diff-Amphibia',
             'high-Lepidosauria', 'low-Lepidosauria', 'diff-Lepidosauria', 
             'high-Mammalia', 'low-Mammalia', 'diff-Mammalia',
             'high-Aves', 'low-Aves', 'diff-Aves']

cat_type = CategoricalDtype(categories=set_order, ordered=True)

OUTDIR = "./data/signal/output/"
pt_del = r'__Ts & Tv|__Ts' # pattern to delete 

sig_plot_df = contr_per.copy()
sig_plot_df['Samples'] = sig_plot_df['Spectra_Type'] + '_' + sig_plot_df['Group_Type'].str.replace(' only', '')
sig_plot_df = sig_plot_df.drop(['Group_Type', 'Spectra_Type'], 1)
sig_plot_df = sig_plot_df.pivot(index='Samples', columns='Signature', values='Percentage').reset_index()
sig_plot_df = sig_plot_df.fillna(0)

sig_plot_df['Samples_Sort'] = sig_plot_df['Samples'].str.replace(pt_del, '', regex=True).str.replace('_', '-')
sig_plot_df['WithTv'] = sig_plot_df.Samples.str.contains('Tv')

sig_plot_df['Samples_Sort'] = sig_plot_df['Samples_Sort'].astype(cat_type)
sig_plot_df = sig_plot_df.sort_values(['Samples_Sort', 'WithTv']).drop(['Samples_Sort', 'WithTv'], 1)
sig_plot_df = sig_plot_df.set_index('Samples').astype(int)

sig_ts_only = sig_plot_df.loc[~sig_plot_df.index.str.contains('Tv')]
sig_ts_only.index = sig_ts_only.index.str.replace('__Ts', '').str.replace('_', '-')

sig_plot_df.to_csv(OUTDIR + 'Assignment_Solution_Signal.txt', sep='\t')
sig_ts_only.to_csv(OUTDIR + 'Assignment_Solution_Signal_Ts.txt', sep='\t')

sig_plot_df.head()


Signature,Unassigned,ΔMSH6,ΔPMS2,ΔRNF168,ΔUNG
Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high_Actinopteri__Ts,0,0,38,0,61
high_Actinopteri__Ts & Tv,5,0,36,0,57
low_Actinopteri__Ts,0,0,69,0,30
low_Actinopteri__Ts & Tv,2,0,58,14,24
diff_Actinopteri__Ts,10,0,0,0,89


In [6]:
## order of signal Unnasign, MSH, PMS, RNF, UNG
custom_colors = ['#4f31e4','#797979','#c3c3c3','#e1a80c','#ff0000']

outpath = f"{OUTDIR}/total.pdf"

plotActivity(
    OUTDIR + 'Assignment_Solution_Signal.txt', outpath, 
    bin_size=30, 
    custom_colors=custom_colors,
    delimiter_step=6, delimiter_size=2
)

Colors replaced
['#4f31e4', '#797979', '#c3c3c3', '#e1a80c', '#ff0000']


In [7]:
custom_colors = ['#4f31e4','#c3c3c3','#ff0000']

outpath = f"{OUTDIR}/Ts_only.pdf"

plotActivity(
    OUTDIR + 'Assignment_Solution_Signal_Ts.txt', outpath, 
    bin_size=30, 
    custom_colors=custom_colors,
    delimiter_step=3, delimiter_size=1,
    rename=True
)

Colors replaced
['#4f31e4', '#c3c3c3', '#ff0000']
