In [62]:
import numpy as np
import pandas as pd
from scipy import spatial

### Load the vertebrate MutSpec and the COSMIC SBS data

In [50]:
# Vertebrate mutational spectra (comma-separated file).
mut_vert = pd.read_csv('../data/MutSpecVertebratescytb.csv')
# COSMIC SBS table (tab-separated file).
cosm_sbs = pd.read_csv('../data/external/cosmic_SBS.txt', sep='\t')

### Reshape the COSMIC data into long format

In [51]:
# Relabel the rows with the values of the "Type" column, drop that column, and
# flip the table so the former columns become rows. reset_index() then exposes
# the former column names as a regular column called "index".
cosm_sbs = (
    cosm_sbs
    .rename(index=cosm_sbs['Type'])
    .drop(columns='Type')
    .T
    .reset_index()
)

In [52]:
# Long format: one row per (signature, mutation label) pair.
# Column order matters downstream (sbs_type, Mut, MutSpec, MutBase,
# AncestorCodon), so the derived columns are appended after the rename.
cosm_sbs = (
    cosm_sbs
    .melt(id_vars="index", value_vars=cosm_sbs.columns[1:], var_name="Mut")
    .rename(columns={'index': 'sbs_type', 'value': 'MutSpec'})
    .assign(
        # "A[C>A]A" -> "C>A": the substitution between the brackets.
        MutBase=lambda df: df['Mut'].str[2:5],
        # "A[C>A]A" -> "ACA": first character + character at position 2 + last character.
        AncestorCodon=lambda df: df['Mut'].str[0] + df['Mut'].str[2] + df['Mut'].str[-1],
    )
)
cosm_sbs.head()

Unnamed: 0,sbs_type,Mut,MutSpec,MutBase,AncestorCodon
0,SBS1,A[C>A]A,0.0008861572,C>A,ACA
1,SBS2,A[C>A]A,5.800168e-07,C>A,ACA
2,SBS3,A[C>A]A,0.02080832,C>A,ACA
3,SBS4,A[C>A]A,0.0421965,C>A,ACA
4,SBS5,A[C>A]A,0.0119976,C>A,ACA


In [53]:
# Restrict every signature to its C>T rows.
cosm_comp = cosm_sbs.loc[cosm_sbs['MutBase'].eq('C>T')]
cosm_comp.head()

Unnamed: 0,sbs_type,Mut,MutSpec,MutBase,AncestorCodon
632,SBS1,A[C>T]A,0.025004,C>T,ACA
633,SBS2,A[C>T]A,6.1e-05,C>T,ACA
634,SBS3,A[C>T]A,0.014206,C>T,ACA
635,SBS4,A[C>T]A,0.008699,C>T,ACA
636,SBS5,A[C>T]A,0.032593,C>T,ACA


### Compute the MutSpec pooled over all vertebrates

In [54]:
# Preview the first rows of the per-species table before pooling.
mut_vert.head(n=5)

Unnamed: 0,Species,Mut,Expected,Observed,RawMutSpec,RawMutSpecSum,MutSpec,MutBase,Context,Class
0,Abbottina_obtusirostris,T[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,TTT,Actinopterygii
1,Abbottina_obtusirostris,G[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,GTT,Actinopterygii
2,Abbottina_obtusirostris,C[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,CTT,Actinopterygii
3,Abbottina_obtusirostris,A[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,ATT,Actinopterygii
4,Abbottina_obtusirostris,T[T>C]T,8.0,0.0,0.0,1.088889,0.0,T>C,TTT,Actinopterygii


In [55]:
# Pool the raw spectra over all species, then rescale so the pooled values
# sum to one. The result keeps only the "Mut" and "MutSpec" columns.
# NOTE: this rebinds `mut_vert`, so the cell cannot be re-run without
# reloading the per-species table first.
mut_vert = (
    mut_vert
    .groupby('Mut')['RawMutSpec']
    .sum()
    .pipe(lambda pooled: pooled / pooled.sum())
    .rename('MutSpec')
    .reset_index()
)


### Save the 192-component MutSpec for all vertebrates

In [151]:
# Persist the pooled spectrum; index=False keeps the row index out of the file.
# NOTE(review): the columns written depend on which cells ran before this one
# (a later cell adds MutBase/AncestorCodon to `mut_vert`) - confirm the intended schema.
mut_vert.to_csv(path_or_buf='../data/MutSpecALLvert.csv', index=False)

In [59]:
# Derive the substitution ("G>A") and the three-letter context ("AGA") from
# labels such as "A[G>A]A", mirroring the columns built for the COSMIC table.
mut_vert = mut_vert.assign(
    MutBase=lambda df: df['Mut'].str[2:5],
    AncestorCodon=lambda df: df['Mut'].str[0] + df['Mut'].str[2] + df['Mut'].str[-1],
)
# Only the G>A rows take part in the first comparison.
mut_to_comp = mut_vert.loc[mut_vert['MutBase'].eq('G>A')]


### Compute the similarity between each SBS signature (C>T) and the vertebrate MutSpec (G>A)

In [141]:
# Cosine similarity (1 - cosine distance) between the C>T part of every
# signature and the vertebrate G>A spectrum. The two vectors are paired by
# row position, not by label, so both frames must list contexts in the same order.
cos_res = pd.DataFrame([
    {
        'SBS': sbs,
        'Similarity': 1 - spatial.distance.cosine(
            cosm_comp.loc[cosm_comp['sbs_type'] == sbs, 'MutSpec'],
            mut_to_comp['MutSpec'],
        ),
    }
    for sbs in cosm_comp['sbs_type'].unique()
])

In [142]:
# Five most similar signatures, in ascending order of similarity.
cos_res.sort_values('Similarity').tail(5)

Unnamed: 0,SBS,Similarity
24,SBS18,0.914264
11,SBS9,0.927928
4,SBS5,0.952137
44,SBS38,0.954477
42,SBS36,0.961308


### Compute the similarity between each SBS signature and the full MutSpec

In [87]:
def gettranslate(mut, reverse=True):
    """Return the complement of ``mut``, optionally with its outer characters swapped.

    Each ``A``/``C``/``G``/``T`` character is replaced by its complementary
    base (A<->T, C<->G); every other character (``[``, ``>``, ``]`` ...) is
    left untouched.

    Parameters
    ----------
    mut : str
        A context such as ``"ACG"``, a substitution such as ``"C>T"`` or a
        full label such as ``"A[C>T]G"``.
    reverse : bool, default True
        When truthy, the first and last characters are exchanged before
        complementing. Only the two outer characters move, so a three-letter
        context becomes its reverse complement and a bracketed label keeps
        its ``[X>Y]`` core in place.

    Returns
    -------
    str
        The complemented (and optionally flank-swapped) string.
    """
    # Strings shorter than two characters have no flanks to exchange; without
    # this guard a single base would be duplicated and "" would raise.
    if reverse and len(mut) > 1:
        mut = mut[-1] + mut[1:-1] + mut[0]
    return mut.translate(str.maketrans("ACGT", "TGCA"))


def transform192_to96(asymmetry_df, df_transform):
    """Collapse a 192-component spectrum into 96 strand-asymmetry ratios.

    For each of the first 96 rows of ``asymmetry_df`` the substitution and its
    context are read, the complementary substitution in the reverse-complement
    context is derived with ``gettranslate``, both are looked up in
    ``df_transform`` and the ratio ``MutSpec(substitution) / MutSpec(reverse)``
    is stored.

    Parameters
    ----------
    asymmetry_df : pandas.DataFrame
        Template listing the 96 components to produce. It is read by
        position: column 3 must hold the substitution (e.g. ``"C>A"``) and
        column 4 the context (e.g. ``"ACA"``). At least 96 rows are required.
    df_transform : pandas.DataFrame
        Spectrum to collapse; needs ``MutBase``, ``AncestorCodon`` and
        ``MutSpec`` columns.

    Returns
    -------
    pandas.DataFrame
        96 rows with ``AncestorCodon``, ``MutBase`` and ``MutSpec`` columns.
        A ratio that is undefined (zero or missing denominator, non-finite
        result) is reported as 0.

    Raises
    ------
    IndexError
        If ``asymmetry_df`` has fewer than 96 rows, or if a required
        substitution/context pair is absent from ``df_transform``.
    """

    def _lookup(mut_base, context):
        # First MutSpec value recorded for this substitution in this context.
        hit = (df_transform.MutBase == mut_base) & (df_transform.AncestorCodon == context)
        return df_transform.loc[hit, "MutSpec"].values[0]

    to_96_comp = []
    for num_subs in range(96):
        # Context and substitution of the component being built.
        anc = asymmetry_df.iloc[num_subs, 4]
        mb = asymmetry_df.iloc[num_subs, 3]
        # The same event as seen on the opposite strand.
        rev_anc = gettranslate(anc, reverse=True)
        rev_mb = gettranslate(mb, reverse=False)
        rev_mutspec = _lookup(rev_mb, rev_anc)
        mutspec = _lookup(mb, anc)
        # Guard the division explicitly: dividing NumPy floats by zero only
        # emits a RuntimeWarning and yields inf/nan, which were zeroed anyway.
        new_96_mut = mutspec / rev_mutspec if rev_mutspec != 0 else 0.0
        to_96_comp.append({'AncestorCodon': anc, 'MutBase': mb, 'MutSpec': new_96_mut})

    mut_96_comp = pd.DataFrame(to_96_comp)
    # Any remaining non-finite ratio (e.g. from NaN inputs) is reported as 0.
    mut_96_comp["MutSpec"] = (
        mut_96_comp["MutSpec"].replace([np.inf, -np.inf], 0).fillna(0)
    )
    return mut_96_comp

In [74]:
# The SBS1 rows serve as the template that lists the components to build.
templ_as = cosm_sbs.loc[cosm_sbs['sbs_type'].eq('SBS1')]
templ_as.head()

Unnamed: 0,sbs_type,Mut,MutSpec,MutBase,AncestorCodon
0,SBS1,A[C>A]A,0.000886,C>A,ACA
79,SBS1,A[C>A]C,0.00228,C>A,ACC
158,SBS1,A[C>A]G,0.000177,C>A,ACG
237,SBS1,A[C>A]T,0.00128,C>A,ACT
316,SBS1,A[C>G]A,0.00186,C>G,ACA


In [90]:
# Collapse the pooled vertebrate spectrum onto the components listed in the template.
mut_96_comp = transform192_to96(templ_as, mut_vert)
mut_96_comp.head(n=5)

  new_96_mut = mutspec/rev_mutspec
  new_96_mut = mutspec/rev_mutspec


Unnamed: 0,AncestorCodon,MutBase,MutSpec
0,ACA,C>A,0.0
1,ACC,C>A,0.0
2,ACG,C>A,0.0
3,ACT,C>A,0.0
4,ACA,C>G,0.0


In [91]:
cos_res = []
for sbs in cosm_sbs.sbs_type.drop_duplicates():
    sbs_to_cosine = cosm_sbs[cosm_sbs.sbs_type == sbs]
    cos = spatial.distance.cosine(sbs_to_cosine['MutSpec'], mut_96_comp['MutSpec'])
    comp_r = {'SBS': sbs, 'Similarity': 1-cos}
    cos_res.append(comp_r)
cos_res = pd.DataFrame(cos_res)

In [93]:
# Five most similar signatures, in ascending order of similarity.
cos_res.sort_values('Similarity').tail(5)

Unnamed: 0,SBS,Similarity
4,SBS5,0.573457
38,SBS32,0.604126
16,SBS11,0.611283
29,SBS23,0.647296
36,SBS30,0.674522
