In [143]:
import pandas as pd
from scipy import spatial
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

### Load the vertebrate MutSpec table and the COSMIC SBS signatures

In [144]:
# Input tables: vertebrate mutational spectra (CSV) and the COSMIC SBS
# signature matrix (tab-separated text).
vert_csv = '../data/MutSpecVertebratescytb.csv'
cosmic_tsv = '../data/external/cosmic_SBS.txt'

mut_vert = pd.read_csv(vert_csv)
cosm_sbs = pd.read_csv(cosmic_tsv, sep='\t')
cosm_sbs.head()

Unnamed: 0,Type,SBS1,SBS2,SBS3,SBS4,SBS5,SBS6,SBS7a,SBS7b,SBS7c,...,SBS86,SBS87,SBS88,SBS89,SBS90,SBS91,SBS92,SBS93,SBS94,SBS95
0,A[C>A]A,0.000886,5.800168e-07,0.020808,0.042196,0.011998,0.000425,6.7e-05,0.002329,0.00483,...,0.002954,0.008973,1e-18,0.032169,0.002202,0.002945,0.011329,0.011573,0.01558,0.038408
1,A[C>A]C,0.00228,0.0001480043,0.016507,0.033297,0.009438,0.000524,0.000179,0.000461,0.00115,...,0.003775,0.004573,1e-18,0.017694,0.000708,0.052997,0.009745,0.008096,0.024746,0.017384
2,A[C>A]G,0.000177,5.230151e-05,0.001751,0.015599,0.00185,5.2e-05,7.1e-05,0.000186,0.000377,...,0.000385,0.006209,1e-18,0.009671,0.000139,0.000204,0.004697,0.001761,0.001574,0.00836
3,A[C>A]T,0.00128,9.780282e-05,0.012205,0.029498,0.006609,0.00018,0.000248,0.00071,0.00196,...,0.003624,0.004957,0.001731102,0.020744,0.001755,0.000131,0.007758,0.008421,0.011076,0.023294
4,A[C>G]A,0.00186,2.230064e-16,0.019708,0.006889,0.010098,0.000471,6.5e-05,9e-06,0.00112,...,0.052516,0.007866,1e-18,0.014817,0.000508,0.000243,0.003056,0.008857,0.007004,0.003617


### Reshape the COSMIC data to long format (one row per signature and context)

In [145]:
# Use the 'Type' labels as the row index (leaving the index unnamed), then flip
# the table so that each row is one SBS signature and each column one context.
# reset_index() exposes the signature names as a regular 'index' column.
by_context = cosm_sbs.set_index('Type').rename_axis(None)
cosm_sbs = by_context.T.reset_index()


In [146]:
# Long format: one row per (signature, substitution context) pair.
context_cols = cosm_sbs.columns[1:]  # every column after the leading 'index'
cosm_sbs = cosm_sbs.melt(id_vars="index", value_vars=context_cols, var_name="Mut")
# Characters 2-4 of an 'N[X>Y]N' label hold the substitution itself, e.g. 'C>T'.
cosm_sbs['MutBase'] = cosm_sbs['Mut'].str[2:5]
# Keep only the C>T rows for the comparison.
cosm_comp = cosm_sbs.loc[cosm_sbs['MutBase'] == 'C>T']

In [147]:
# Replace the generic column names produced by reset_index()/melt().
column_names = {'index': 'sbs_type', 'value': 'MutSpec'}
cosm_comp = cosm_comp.rename(column_names, axis='columns')
cosm_comp.head(5)

Unnamed: 0,sbs_type,Mut,MutSpec,MutBase
632,SBS1,A[C>T]A,0.025004,C>T
633,SBS2,A[C>T]A,6.1e-05,C>T
634,SBS3,A[C>T]A,0.014206,C>T
635,SBS4,A[C>T]A,0.008699,C>T
636,SBS5,A[C>T]A,0.032593,C>T


### Compute the pooled MutSpec for all vertebrates

In [148]:
# Preview the first rows of the vertebrate table before aggregating it.
mut_vert.head(5)

Unnamed: 0,Species,Mut,Expected,Observed,RawMutSpec,RawMutSpecSum,MutSpec,MutBase,Context,Class
0,Abbottina_obtusirostris,T[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,TTT,Actinopterygii
1,Abbottina_obtusirostris,G[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,GTT,Actinopterygii
2,Abbottina_obtusirostris,C[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,CTT,Actinopterygii
3,Abbottina_obtusirostris,A[T>G]T,0.0,0.0,0.0,1.088889,0.0,T>G,ATT,Actinopterygii
4,Abbottina_obtusirostris,T[T>C]T,8.0,0.0,0.0,1.088889,0.0,T>C,TTT,Actinopterygii


In [149]:
# Pool the raw spectra over all rows that share a substitution context.
pooled = mut_vert.groupby('Mut')['RawMutSpec'].sum().reset_index()
# Rescale by the grand total to obtain the relative pooled spectrum.
pooled['MutSpec'] = pooled['RawMutSpec'] / pooled['RawMutSpec'].sum()
# Only the context label and its normalised value are kept.
mut_vert = pooled.drop(columns=['RawMutSpec'])


### Save the 192-component MutSpec for all vertebrates

In [151]:
# Write the aggregated spectrum to CSV, omitting the row index.
mut_vert.to_csv(path_or_buf='../data/MutSpecALLvert.csv', index=False)

In [140]:
# Characters 2-4 of an 'N[X>Y]N' label hold the substitution itself, e.g. 'G>A'.
mut_vert['MutBase'] = mut_vert['Mut'].str[2:5]
# Keep only the G>A rows of the vertebrate spectrum for the comparison.
is_g_to_a = mut_vert['MutBase'].eq('G>A')
mut_to_comp = mut_vert.loc[is_g_to_a]


### Compute the cosine similarity between each COSMIC SBS signature and the vertebrate MutSpec

In [141]:
def _flanks(mut):
    """Return the 5' and 3' neighbours of 'N[X>Y]N' labels, e.g. 'A[C>T]G' -> 'AG'."""
    return mut.str[0] + mut.str[-1]


# Index the vertebrate vector by flanking context so that every signature is
# matched to it by label instead of by row position (the two frames are built
# independently, so identical row order is not guaranteed).
# NOTE(review): flanks are paired as written (A[C>T]G with A[G>A]G), which is
# what the previous positional pairing amounted to when both sides were sorted
# by label. If G>A is meant as the opposite-strand equivalent of C>T, the
# flanks would need reverse-complementing first -- confirm the intended pairing.
target = pd.Series(
    mut_to_comp['MutSpec'].values,
    index=_flanks(mut_to_comp['Mut']).values,
)
if not target.index.is_unique:
    raise ValueError("mut_to_comp contains duplicated flanking contexts")

records = []
# sort=False keeps the signatures in their order of first appearance.
for sbs, sbs_to_cosine in cosm_comp.groupby('sbs_type', sort=False):
    signature = pd.Series(
        sbs_to_cosine['MutSpec'].values,
        index=_flanks(sbs_to_cosine['Mut']).values,
    )
    # Fail loudly instead of comparing vectors that cover different contexts.
    if not signature.index.is_unique or set(signature.index) != set(target.index):
        raise ValueError(
            "Contexts of {} do not match the contexts of mut_to_comp".format(sbs)
        )
    aligned = signature.reindex(target.index)
    # scipy returns the cosine *distance*; similarity is its complement.
    cos = spatial.distance.cosine(aligned.values, target.values)
    records.append({'SBS': sbs, 'Similarity': 1 - cos})

# Explicit columns keep the frame sortable by 'Similarity' even when empty.
cos_res = pd.DataFrame(records, columns=['SBS', 'Similarity'])

In [142]:
# Last five rows after an ascending sort by similarity (best match at the bottom).
ranked = cos_res.sort_values('Similarity')
ranked.iloc[-5:]

Unnamed: 0,SBS,Similarity
24,SBS18,0.914264
11,SBS9,0.927928
4,SBS5,0.952137
44,SBS38,0.954477
42,SBS36,0.961308
