In [1]:
import os
import sys
from collections import defaultdict
from functools import reduce, partial

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Input locations of the raw Signal signature tables (relative to the notebook).
# Directory holding the per-file environment signature tables.
path_to_env = "../data/external/signal_env/"
# Single CSV with the reference cancer signatures.
path_to_cancer = (
    "../data/external/signal_cancer/"
    "Degasperi, 2020_substitution_reference_cancer_signatures.csv"
)

In [4]:
# Load the reference cancer signatures table and preview its first two rows.
df_can = pd.read_csv(filepath_or_buffer=path_to_cancer)
df_can.head(n=2)

Unnamed: 0,Signature,A[C>A]A,A[C>A]C,A[C>A]G,A[C>A]T,C[C>A]A,C[C>A]C,C[C>A]G,C[C>A]T,G[C>A]A,...,C[T>G]G,C[T>G]T,G[T>G]A,G[T>G]C,G[T>G]G,G[T>G]T,T[T>G]A,T[T>G]C,T[T>G]G,T[T>G]T
0,RefSig 1,0.014425,0.01036,0.001777,0.006919,0.007879,0.006093,0.001494,0.00749,0.010094,...,0.002305,0.004384,0.00115,0.001121,0.002445,0.002846,0.002392,0.002223,0.002443,0.007562
1,RefSig 2,0.006003,0.004071,0.001034,0.004914,0.005428,0.003421,0.000969,0.003906,0.004089,...,0.001351,0.001888,0.000213,0.000377,0.000896,0.000947,0.001549,0.001016,0.001184,0.003057


Transpose the Signal **cancer** signatures table so that each row is a mutation type and each column is a signature

In [22]:
# Reshape the cancer table: one row per mutation type ("Type"), one column per
# signature, rows sorted by mutation type.
# Signature names become column labels, with spaces replaced by underscores.
columns = df_can["Signature"].str.replace(" ", "_").values
# Drop the string column *before* transposing: transposing a frame that mixes
# strings and floats upcasts every value to object dtype.
df_can_mod = df_can.drop(columns="Signature").T
df_can_mod.columns = columns
# Build the final frame without in-place mutation so the cell stays idempotent.
df_can_mod = (
    df_can_mod
    .rename_axis(index="Type")
    .reset_index()
    .sort_values("Type")
)
df_can_mod.head()

Unnamed: 0,Type,RefSig_1,RefSig_2,RefSig_3,RefSig_4,RefSig_5,RefSig_MMR1,RefSig_MMR2,RefSig_7,RefSig_8,...,RefSig_N3,RefSig_N4,RefSig_N5,RefSig_N6,RefSig_N7,RefSig_N8,RefSig_N9,RefSig_N10,RefSig_N11,RefSig_N12
0,A[C>A]A,0.014425,0.006003,0.015534,0.046829,0.01974,0.001285,0.005883,0.006263,0.032921,...,3.2e-05,0.006371,0.011394,0.014383,0.017836,0.021711,0.024486,0.018063,0.012342,0.010892
1,A[C>A]C,0.01036,0.004071,0.01266,0.029543,0.013059,0.002134,0.004158,0.002922,0.029357,...,0.000158,0.012162,0.017159,0.024441,1.4e-05,0.018174,0.006367,0.00485,0.008841,0.00304
2,A[C>A]G,0.001777,0.001034,0.002916,0.008774,0.004065,0.000391,0.000721,0.000426,0.003438,...,1e-06,0.00095,0.001105,0.004101,0.000983,0.006046,0.000636,0.003283,0.003049,0.000195
3,A[C>A]T,0.006919,0.004914,0.014087,0.029801,0.011391,0.003651,0.005277,0.00443,0.03191,...,0.003335,0.008824,0.00827,0.012185,0.00898,0.006306,0.008538,0.00792,0.005989,0.002121
16,A[C>G]A,0.003699,0.002596,0.02853,0.011423,0.007313,0.002916,0.007239,0.004232,0.012002,...,0.000645,0.008364,0.0,0.006887,0.009658,0.005835,0.02247,0.008094,0.00157,0.007276


In [23]:
# Write the reshaped cancer signatures as a tab-separated file without the row index.
# `index` is documented as a boolean; pass False explicitly instead of relying
# on None being falsy.
df_can_mod.to_csv("../data/signal_cancer.txt", sep="\t", index=False)

Concatenate the Signal **environment** signatures from all files into a single table

In [47]:
# Combine every environment signature file into one wide table:
# a "Type" column with the mutation label, then the signature columns of each file.
data_env = []
morder = None  # reference mutation order, taken from the first file read
# os.listdir() returns entries in arbitrary order; sort so that the column
# order of the result is reproducible between runs and machines.
for fn in sorted(os.listdir(path_to_env)):
    fp = os.path.join(path_to_env, fn)
    # Skip hidden entries and sub-directories, which are not signature tables.
    if fn.startswith(".") or not os.path.isfile(fp):
        continue

    _df = pd.read_csv(fp)
    if not _df["substitution"].is_unique:
        raise ValueError("Duplicated substitution labels in {!r}".format(fp))
    _df = _df.set_index("substitution")

    if morder is None:
        morder = _df.index.values
        data_env.append(pd.DataFrame({"substitution": morder}))
    elif len(_df) != len(morder) or set(_df.index) != set(morder):
        # Fail loudly: silently stopping here would drop this file and every
        # file after it from the output.
        raise ValueError(
            "Substitution labels in {!r} do not match the first file".format(fp)
        )

    # Align rows on the mutation label, so a file listing the same mutations in
    # a different order is still merged correctly.
    data_env.append(_df.reindex(morder).reset_index(drop=True))

if not data_env:
    raise ValueError("No signature files found in {!r}".format(path_to_env))

df_env = pd.concat(data_env, axis=1)
df_env = df_env.rename(columns={"substitution": "Type"}).sort_values("Type")
df_env.head()

Unnamed: 0,Type,Dimethyl sulfate 0.078 mM,"Dibenzo[a,l]pyrene 0.0313 µM + rat S9",Carboplatin 5 µM,N-Nitrosopyrrolidine 50 mM,"1,2-Dimethylhydrazine 11.6 mM + rat S9","Dibenzo[a,l]pyrene-diol epoxide 0.000625 µM",Methyleugenol 1.25 mM,Semustine 150 µM,"Dibenz[a,h]anthracene 75 µM + rat S9",...,Propylene oxide 10 mM,AZD 7762 (Chk1/2 inhibitor) 1.625 µM,Aflatoxin B1 0.25 µM + rat S9,Aristolochic acid II 37.5 µM,PhIP 4 µM + rat S9,Mechlorethamine 0.3 µM,5-Methylchrysene 1.6 µM + rat S9,Simulated solar radiation 1.25 J,Cyclophosphamide 18.75 µM + rat S9,Diethyl sulfate 0.938 mM
0,A[C>A]A,0.000462,0.008699,0.001942,0.006856,0.00665,0.018642,0.111458,0.000122,0.032361,...,0.010414,0.000395,0.03794,0.000162,0.032668,0.000134,0.028628,2.2e-05,0.005559,0.014167
1,A[C>A]C,0.003832,0.000375,0.057526,6.8e-05,0.003603,0.006272,0.000306,0.000127,0.016155,...,0.00142,0.0,0.055834,0.000689,0.07384,0.001083,0.02311,3.9e-05,0.012496,0.001431
2,A[C>A]G,0.000685,0.003253,0.000446,0.000208,0.001161,0.004201,0.000901,0.019274,0.019,...,0.00012,0.0,0.040743,0.000684,0.008274,1.6e-05,0.016826,5e-06,0.001332,0.001674
3,A[C>A]T,0.000184,0.009761,0.034462,0.0,0.006191,0.005996,0.004353,0.000153,0.019978,...,0.002265,0.008859,0.004433,0.00111,0.065813,0.003369,0.029308,6.2e-05,0.004327,0.005733
16,A[C>G]A,0.004702,0.000179,0.01187,0.000285,0.00106,0.005934,0.000135,0.002541,0.005322,...,0.000784,0.001033,0.034885,0.001543,0.006536,0.001804,0.004797,9.3e-05,0.001859,0.002986


In [49]:
# Write the combined environment signatures as a tab-separated file without the row index.
# `index` is documented as a boolean; pass False explicitly instead of relying
# on None being falsy.
df_env.to_csv("../data/signal_environment.txt", index=False, sep="\t")