In [1]:
import pandas as pd

### Load table of substitutions obtained from Alya

In [2]:
# Load the space-separated substitution table and preview its first rows.
vert_codons = pd.read_table(
    filepath_or_buffer='../data/VertebratePolymorphisms.MutSpecDataWithNonSyn.txt',
    sep=' ',
)
vert_codons.head()

Unnamed: 0,Species,Gene,AncestralSeqName,DescendantSeqName,Branch,CodonPosition,PreviousAncCodon,AncestorCodon,NextAncCodon,PreviousDesCodon,DescendantCodon,NextDesCodon,AncestralAA,DescendantAA,Subs,MutType,A,T,G,C
1,Abbottina_obtusirostris,CytB,10,7,Internal,241,ACA,TCA,CTA,ACA,TCG,CTA,S,S,A_G,FourFold,504,172,97,248
2,Abbottina_obtusirostris,CytB,8,10,Internal,259,CCT,GCT,AAC,CCT,GCC,AAC,A,A,T_C,FourFold,504,172,97,248
3,Abbottina_obtusirostris,CytB,7,RN_2,External,135,CCA,TGG,GGG,CCA,TGA,GGG,W,W,G_A,Syn,504,172,97,248
4,Abbottina_obtusirostris,CytB,9,RN_3,External,211,GGC,TTA,AAC,GGC,CTA,AAC,L,L,T_C,Syn,504,172,97,248
5,Abbottina_obtusirostris,CytB,10,7,Internal,83,CTT,CAT,GCC,CTT,CAC,GCC,H,H,T_C,Syn,504,172,97,248


In [4]:
def get_sub_pos(seq1, seq2):
    '''Locate the first nucleotide that differs between two codons.
    Args:
        seq1: Ancestral codon
        seq2: Descendant codon
    Returns:
        0-based index of the first mismatching position, or None when
        no position within len(seq1) differs
    '''
    # Lazily scan the positions of seq1; next() stops at the first mismatch
    # and falls back to None when the scan finds nothing.
    mismatches = (idx for idx in range(len(seq1)) if seq1[idx] != seq2[idx])
    return next(mismatches, None)

def get_sub(seq1, seq2):
    '''Mark the substitution between Ancestor and Descendant codons.
    Args:
        seq1: Ancestral codon
        seq2: Descendant codon
    Returns:
        Ancestral codon with the first differing nucleotide rewritten as
        [ancestral>descendant], e.g. 'AAC' vs 'ATC' gives 'A[A>T]C';
        None when get_sub_pos reports no differing position
    '''
    pos = get_sub_pos(seq1, seq2)
    if pos is None:
        # Nothing to mark: the sequences do not differ.
        return None
    # Keep everything before and after the substituted nucleotide unchanged.
    # Slicing makes this independent of the sequence length and of the
    # substitution position; ''.join keeps it working for any sequence of
    # single-character strings, not only str.
    before = ''.join(seq1[:pos])
    after = ''.join(seq1[pos + 1:])
    return before + '[' + seq1[pos] + '>' + seq2[pos] + ']' + after

### Check whether the substitution is in the 3rd codon position and define the type of substitution

Description of types (codes stored in the `TypeOfSubs` column):
- 0 - Non Synonymous
- 1 - Synonymous
- 2 - FourFold Synonymous

In [7]:
# Encode the mutation type: 'NSyn' -> 0, 'Syn' -> 1, any other label -> 2.
MUT_TYPE_CODES = {'NSyn': 0, 'Syn': 1}
vert_codons['TypeOfSubs'] = vert_codons['MutType'].apply(
    lambda mut_type: MUT_TYPE_CODES.get(mut_type, 2)
)

# Flag (1/0) the rows whose substitution sits at index 2, i.e. the 3rd codon position.
vert_codons['3Pos'] = vert_codons.apply(
    lambda row: int(get_sub_pos(row['AncestorCodon'], row['DescendantCodon']) == 2),
    axis=1,
)

In [7]:
def get_comp(row: pd.Series):
    ''' Build the 3 and 5 component mutational spectrum entries for one table row
    Args:
        row - row holding the 'PreviousAncCodon', 'AncestorCodon', 'NextAncCodon'
              and 'DescendantCodon' codons plus the 'Species', 'Gene',
              'TypeOfSubs' and '3Pos' fields
    Returns:
        out - one-row pd.DataFrame with columns Species, Gene, MutType, 3Pos,
              Mut3, Mut5 and Pos (1-based position of the substitution in the
              codon); None when either component contains '-' or '?'
    '''
    upstream, ancestral, downstream, descendant = (
        row[column] for column in
        ('PreviousAncCodon', 'AncestorCodon', 'NextAncCodon', 'DescendantCodon')
    )

    # 0-based index of the substitution inside the codon
    sub_idx = get_sub_pos(ancestral, descendant)

    # Ancestral codon with the substitution written as [X>Y]
    marked_codon = get_sub(ancestral, descendant)

    # Character-level view of: previous codon + marked codon + next codon
    context = list(upstream + marked_codon + downstream)

    # Index of the opening '[': shifted by 3 for the preceding codon
    # (assumes the previous codon is 3 characters long)
    bracket_at = 3 + sub_idx

    # '[X>Y]' spans 5 characters; take 1 (mut3) or 2 (mut5) flanking
    # nucleotides on each side of it
    mut3 = context[bracket_at - 1:bracket_at + 6]
    mut5 = context[bracket_at - 2:bracket_at + 7]

    # Skip components that are not full (contain - or ?)
    if any(symbol in window for symbol in ('-', '?') for window in (mut3, mut5)):
        return None

    columns = {
        'Species': row['Species'],
        'Gene': row['Gene'],
        'MutType': [row['TypeOfSubs']],  # the list gives the frame its single row
        '3Pos': row['3Pos'],
        'Mut3': ''.join(mut3),
        'Mut5': ''.join(mut5),
        'Pos': sub_idx + 1,
    }
    return pd.DataFrame(columns)
    

### Get 3- and 5-component MutSpec (substitution with 1 or 2 flanking nucleotides on each side)

In [8]:
# Compute the spectrum entry for every row, then stack the per-row frames into one table.
per_row_spectra = vert_codons.apply(get_comp, axis=1)
final_df = pd.concat(per_row_spectra.values)

In [12]:
# Write the combined spectrum table to CSV without the index column.
final_df.to_csv(path_or_buf='../data/ObsMutSpec.csv', index=False)