# Notebook to format sample layout by generating all expected mutants

## Import libraries

In [1]:
import pandas as pd

## Specify paths

In [2]:
# NOTE(review): `snakemake` is never imported or defined in this notebook; it is
# presumably injected by Snakemake when the notebook runs as a workflow step -- confirm.
layout_path = snakemake.params.layout  # sample layout, read as CSV further down
frag_seq_path = snakemake.params.seqs  # WT fragment sequences, read as a tab-separated table further down
codon_table_path = snakemake.params.codon_table  # codon table, read as CSV in the next section
output_file = snakemake.output[0]  # destination of the final table of expected variants

## Import codon table

In [3]:
# The first row of the file holds the column names
codon_table = pd.read_csv(codon_table_path, header=0)
# Codons are used exactly as stored in the file (upper case in the preview below);
# a lower-casing step was tried here at some point and is deliberately not applied
codon_table.head(3)

Unnamed: 0,codon,aminoacid,freq,number
0,TTT,F,26.26,76999
1,TTC,F,17.89,52459
2,TTA,L,26.31,77131


In [4]:
# Lookup table: codon -> amino acid, in the row order of the codon table
codon_dic = codon_table.set_index('codon')['aminoacid'].to_dict()

## Define and test functions

In [5]:
def get_alt_codons(seq, codon_dic, mode='NNN'):
    '''
    List every alternative codon at each codon position of a DNA sequence.

    Parameters
    ----------
    seq : str
        DNA sequence, read in steps of three nucleotides from its first character.
    codon_dic : dict
        Codon table; only its keys (the codons) are used here.
    mode : str
        'NNN' (default) accepts every codon of the table,
        'NNK' keeps only the codons whose third nucleotide is G or T.

    Returns
    -------
    pos_l : list of int
        0-based codon (amino acid) positions along seq.
    var_l : list of list of str
        For each position of pos_l, the accepted codons other than the WT codon.

    Raises
    ------
    ValueError
        If mode is neither 'NNN' nor 'NNK'.
    '''
    if mode == 'NNN':
        alt = list(codon_dic)
    elif mode == 'NNK':
        alt = [x for x in codon_dic if x[2] in ('G', 'T')]
    else:
        # Fail loudly: only printing a message left `alt` undefined and the
        # function crashed a few lines later with an unrelated error
        raise ValueError('Please specify a correct mode: either NNN or NNK')

    pos_l = []
    var_l = []

    for i in range(0, len(seq), 3):
        wt_codon = seq[i:i+3]
        pos_l.append(i // 3)  # 0-based position (aa)
        var_l.append([x for x in alt if x != wt_codon])  # possible codons other than WT

    return pos_l, var_l

In [6]:
# Sanity check on a 3-codon sequence (TCT CCT GTT): expect positions [0, 1, 2] and,
# for each position, every codon of the table except the WT codon at that position
print(get_alt_codons('TCTCCTGTT', codon_dic, 'NNN'))

([0, 1, 2], [['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCC', 'TCA', 'TCG', 'CCT', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTT', 'GTC', 'GTA', 'GTG', 'TCT', 'TCC', 'TCA', 'TCG', 'CCC', 'CCA', 'CCG', 'ACT', 'ACC', 'ACA', 'ACG', 'GCT', 'GCC', 'GCA', 'GCG', 'TAT', 'TAC', 'TAA', 'TAG', 'CAT', 'CAC', 'CAA', 'CAG', 'AAT', 'AAC', 'AAA', 'AAG', 'GAT', 'GAC', 'GAA', 'GAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CGT', 'CGC', 'CGA', 'CGG', 'AGT', 'AGC', 'AGA', 'AGG', 'GGT', 'GGC', 'GGA', 'GGG'], ['TTT', 'TTC', 'TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG', 'ATT', 'ATC', 'ATA', 'ATG', 'GTC', 'GTA', 'G

In [7]:
def get_nt_seq(seq, mut_dic):
    '''
    Rebuild a nucleotide sequence after swapping some of its codons.

    mut_dic maps 0-based codon positions to the codon to place there;
    every position absent from mut_dic keeps the codon found in seq.
    '''
    rebuilt = []
    for codon_idx, start in enumerate(range(0, len(seq), 3)):
        # Fall back on the WT codon when no mutation is listed for this position
        rebuilt.append(mut_dic.get(codon_idx, seq[start:start+3]))
    return ''.join(rebuilt)

In [8]:
# Sanity check: replace codon 0 (TCT -> TTC) and codon 2 (GTT -> TTA); codon 1 is left untouched
get_nt_seq('TCTCCTGTT', {0: 'TTC', 2:'TTA'})

'TTCCCTTTA'

In [9]:
def get_aa_seq(seq, codon_dic):
    '''
    Translate a nucleotide sequence into an amino acid string.

    seq is read three nucleotides at a time from its first character and each
    chunk is looked up in codon_dic (codon -> amino acid); a chunk missing
    from the table raises KeyError.
    '''
    residues = []
    for start in range(0, len(seq), 3):
        residues.append(codon_dic[seq[start:start+3]])
    return ''.join(residues)

In [10]:
# Sanity check: translate the mutated sequence produced by the get_nt_seq example above
get_aa_seq('TTCCCTTTA', codon_dic)

'FPL'

In [11]:
def get_Hamming_distances(wt, alt_aaseq, pos, alt_c, alt_aa):
    '''
    Hamming distances, in nucleotides and in amino acids, between a
    single-codon variant and the WT sequence.

    Parameters
    ----------
    wt : str
        WT DNA sequence.
    alt_aaseq : str
        Amino acid sequence indexed at pos to obtain the reference residue
        (the callers in this notebook pass the WT protein sequence).
    pos : int or str
        0-based codon position of the mutation, or 'non-applicable'.
    alt_c : str
        Alternative codon at pos, or 'non-applicable'.
    alt_aa : str
        Amino acid encoded by alt_c, or 'non-applicable'.

    Returns
    -------
    (Nham_nt, Nham_a) : tuple of int
        Number of nucleotides differing between the WT codon and alt_c, and
        0/1 depending on whether alt_aa differs from the reference residue.
        Both are 0 when any of pos, alt_c, alt_aa is 'non-applicable'.
    '''
    # WT rows carry the 'non-applicable' placeholder instead of a mutation
    if 'non-applicable' in [pos, alt_c, alt_aa]:
        return 0, 0

    pos = int(pos)

    # Retrieve WT codon and WT aa
    wtc = wt[pos*3:(pos+1)*3]
    wta = alt_aaseq[pos]

    # Hamming distance in amino acids
    Nham_a = 0 if alt_aa == wta else 1

    # Hamming distance in nucleotides
    Nham_nt = sum(1 for x, y in zip(wtc, alt_c) if x != y)

    return Nham_nt, Nham_a

In [12]:
# Sanity check: the WT codon at position 1 is CCT; CTA differs from it at 2 nucleotides
# and changes the amino acid
get_Hamming_distances('TTCCCTTTA', 'FPL', 1, 'CTA', codon_dic['CTA'])

(2, 1)

In [13]:
# Sanity check: CCC differs from the WT codon CCT at 1 nucleotide and is synonymous
get_Hamming_distances('TTCCCTTTA', 'FPL', 1, 'CCC', codon_dic['CCC'])

(1, 0)

## Import layout and sequences

In [14]:
# Sample layout: one row per sample (see the Sample_name column in the preview below)
layout = pd.read_csv(layout_path)
layout

Unnamed: 0,Sample_name,R1_file,R2_file,N_forward,N_reverse,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint
0,CN_a_r1_F1_T0,CN_a_r1_F1_T0_R1.fastq.gz,CN_a_r1_F1_T0_R2.fastq.gz,TTCATAGCTAATG,GGAACTCTGCCTAAATTTTC,2,71,CN_F1,CN,a,r1,F1,T0
1,CN_a_r2_F1_T0,CN_a_r2_F1_T0_R1.fastq.gz,CN_a_r2_F1_T0_R2.fastq.gz,TTCATAGCTAATG,GGAACTCTGCCTAAATTTTC,2,71,CN_F1,CN,a,r2,F1,T0
2,CN_alp_r1_F1_T0,CN_alp_r1_F1_T0_R1.fastq.gz,CN_alp_r1_F1_T0_R2.fastq.gz,TTCATAGCTAATG,GGAACTCTGCCTAAATTTTC,2,71,CN_F1,CN,alp,r1,F1,T0
3,CN_alp_r2_F1_T0,CN_alp_r2_F1_T0_R1.fastq.gz,CN_alp_r2_F1_T0_R2.fastq.gz,TTCATAGCTAATG,GGAACTCTGCCTAAATTTTC,2,71,CN_F1,CN,alp,r2,F1,T0
4,SC_a_r1_F1_T0,SC_a_r1_F1_T0_R1.fastq.gz,SC_a_r1_F1_T0_R2.fastq.gz,TTCATAGCTAATG,TCCAATCTACCACAGTTTTC,2,68,SC_F1,SC,a,r1,F1,T0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,CN_alp_r2_F3_T3,CN_alp_r2_F3_T3_R1.fastq.gz,CN_alp_r2_F3_T3_R2.fastq.gz,TTACCACCCTATCACCATGT,ACAGCGTGCTGCGTGCTTCA,95,165,CN_F3,CN,alp,r2,F3,T3
92,SC_a_r1_F3_T3,SC_a_r1_F3_T3_R1.fastq.gz,SC_a_r1_F3_T3_R2.fastq.gz,ACACTACTTTGTCACCATGC,ACAGCGTGCTGCGTGCTTCA,92,158,SC_F3,SC,a,r1,F3,T3
93,SC_a_r2_F3_T3,SC_a_r2_F3_T3_R1.fastq.gz,SC_a_r2_F3_T3_R2.fastq.gz,ACACTACTTTGTCACCATGC,ACAGCGTGCTGCGTGCTTCA,92,158,SC_F3,SC,a,r2,F3,T3
94,SC_alp_r1_F3_T3,SC_alp_r1_F3_T3_R1.fastq.gz,SC_alp_r1_F3_T3_R2.fastq.gz,ACACTACTTTGTCACCATGC,ACAGCGTGCTGCGTGCTTCA,92,158,SC_F3,SC,alp,r1,F3,T3


In [15]:
# Tab-separated table holding the WT nucleotide sequence of every mutated fragment
frag_seq = pd.read_csv(frag_seq_path, sep='\t')
# Translate each WT fragment with the codon table
frag_seq['WT_aa'] = [get_aa_seq(wt_nt, codon_dic) for wt_nt in frag_seq['WT_seq']]
frag_seq

Unnamed: 0,Mutated_seq,WT_seq,WT_aa
0,SC_F1,GTTACCGGTGGTATGGCTTCTAAATGGGACCAAAAAGGTATGGACA...,VTGGMASKWDQKGMDIAYEEAALGYKEGGVPIGGCLINNKDGSVLG...
1,SC_F2,GTCTTAGGTAGAGGTCACAACATGAGATTCCAAAAGGGTTCTGCCA...,VLGRGHNMRFQKGSATLHGEISTLENCGRLEGKVYKDTTLYTTLSP...
2,SC_F3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
3,CN_F1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
4,CN_F2,ATTATTTCCAGAGGTCACAACAACAGAGTCCAATTATCTTCTAACG...,IISRGHNNRVQLSSNVRHGEMDCLENLGRVPEGLLSECAMFTTLSP...
5,CN_F3,ATCATGTGTAGTGCCACTTGCATCTTGTACAAGATCAGAACCGTTG...,IMCSATCILYKIRTVVLAENENFLGGEQLLRDNGANVINLDSEEIK...


In [16]:
# Leave out the read-file and N_forward / N_reverse columns, then attach the
# WT sequences of the fragment each sample belongs to
withSeqs = (
    layout
    .drop(columns=['R1_file', 'R2_file', 'N_forward', 'N_reverse'])
    .merge(frag_seq, on='Mutated_seq')
)
withSeqs

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
2,CN_alp_r1_F1_T0,2,71,CN_F1,CN,alp,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
3,CN_alp_r2_F1_T0,2,71,CN_F1,CN,alp,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
4,CN_a_r1_F1_T1,2,71,CN_F1,CN,a,r1,F1,T1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
...,...,...,...,...,...,...,...,...,...,...,...
91,SC_alp_r2_F3_T2,92,158,SC_F3,SC,alp,r2,F3,T2,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
92,SC_a_r1_F3_T3,92,158,SC_F3,SC,a,r1,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
93,SC_a_r2_F3_T3,92,158,SC_F3,SC,a,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
94,SC_alp_r1_F3_T3,92,158,SC_F3,SC,alp,r1,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...


In [17]:
# WT reference rows: the sequence is the WT one, and every column that
# describes a mutation holds the 'non-applicable' placeholder
WTdf = withSeqs.assign(
    nt_seq=withSeqs['WT_seq'],
    WT=True,
    **dict.fromkeys(['pos', 'aa_pos', 'alt_codons', 'alt_aa'], 'non-applicable'),
)
WTdf

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,nt_seq,WT,pos,aa_pos,alt_codons,alt_aa
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
2,CN_alp_r1_F1_T0,2,71,CN_F1,CN,alp,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
3,CN_alp_r2_F1_T0,2,71,CN_F1,CN,alp,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
4,CN_a_r1_F1_T1,2,71,CN_F1,CN,a,r1,F1,T1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,SC_alp_r2_F3_T2,92,158,SC_F3,SC,alp,r2,F3,T2,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,True,non-applicable,non-applicable,non-applicable,non-applicable
92,SC_a_r1_F3_T3,92,158,SC_F3,SC,a,r1,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,True,non-applicable,non-applicable,non-applicable,non-applicable
93,SC_a_r2_F3_T3,92,158,SC_F3,SC,a,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,True,non-applicable,non-applicable,non-applicable,non-applicable
94,SC_alp_r1_F3_T3,92,158,SC_F3,SC,alp,r1,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,True,non-applicable,non-applicable,non-applicable,non-applicable


## Generate expected variants

In [18]:
# For every sample, list the codon positions of its WT sequence and, per position,
# all alternative codons of the table ('NNN' mode). get_alt_codons returns one
# (positions, codons) pair per row; zip(*...) regroups those pairs into two columns.
# Note: this adds the columns to withSeqs in place, after WTdf was copied from it.
withSeqs['pos'], withSeqs['alt_codons'] = zip(*withSeqs.WT_seq.apply(lambda x: get_alt_codons(x, codon_dic, 'NNN')))
withSeqs.head(2)

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT,..."
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT,..."


In [19]:
# Explode both list columns together: one row per (sample, codon position),
# with 'alt_codons' still holding the list of alternatives for that position
singles_compact = withSeqs.explode(['pos','alt_codons'])
singles_compact.head(2)

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,"[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT, ..."
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,1,"[TTT, TTC, TTA, TTG, CTT, CTC, CTA, CTG, ATT, ..."


In [20]:
# Second explode: one row per (sample, codon position, alternative codon).
# The row labels of the source frame are kept, so index values repeat.
singles_df = singles_compact.explode('alt_codons')
singles_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTT
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTC
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTA
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTG
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,CTT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,AGG
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGT
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGC
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGA


In [21]:
# One {codon position: alternative codon} dictionary per variant, i.e. the
# mapping get_nt_seq takes as its second argument. The two columns are zipped
# directly: a row-wise DataFrame.apply builds a Series for every row, which is
# needlessly slow on the fully exploded table.
singles_df['mutations'] = [
    {codon_pos: alt_codon}
    for codon_pos, alt_codon in zip(singles_df['pos'], singles_df['alt_codons'])
]
singles_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons,mutations
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTT,{0: 'TTT'}
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTC,{0: 'TTC'}
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTA,{0: 'TTA'}
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTG,{0: 'TTG'}
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,CTT,{0: 'CTT'}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,AGG,{66: 'AGG'}
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGT,{66: 'GGT'}
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGC,{66: 'GGC'}
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGA,{66: 'GGA'}


In [22]:
# Build the mutated nucleotide sequence of every variant from its WT sequence and
# its {position: codon} mapping. Zipping the two columns avoids a row-wise
# DataFrame.apply over the whole exploded table.
singles_df['nt_seq'] = [
    get_nt_seq(wt_nt, codon_changes)
    for wt_nt, codon_changes in zip(singles_df['WT_seq'], singles_df['mutations'])
]
# The helper column is only needed to build nt_seq; rebind rather than mutate in place
singles_df = singles_df.drop(columns='mutations')
singles_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons,nt_seq
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTT,TTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTC,TTCCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTA,TTACCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTG,TTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,CTT,CTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,AGG,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGT,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGC,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGA,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...


In [23]:
# Amino acid encoded by each alternative codon
singles_df['alt_aa'] = [codon_dic[codon] for codon in singles_df['alt_codons']]
singles_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons,nt_seq,alt_aa
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTT,TTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,F
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTC,TTCCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,F
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTA,TTACCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTG,TTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,CTT,CTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,AGG,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,R
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGT,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGC,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGA,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G


In [24]:
# Offset the within-fragment codon position by the fragment's Pos_start
singles_df['aa_pos'] = singles_df['Pos_start'] + singles_df['pos']
singles_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,pos,alt_codons,nt_seq,alt_aa,aa_pos
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTT,TTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,F,2
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTC,TTCCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,F,2
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTA,TTACCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L,2
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,TTG,TTGCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L,2
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,CTT,CTTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,L,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,AGG,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,R,158
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGT,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G,158
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGC,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G,158
95,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,66,GGA,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,G,158


In [25]:
# Stack the WT reference rows on top of the single-codon variants and renumber the index.
# convert_dtypes() lets pandas pick its best-fitting dtypes for the WT frame first.
# singles_df has no 'WT' column, so that column is left missing for every variant row;
# the Hamming-distance cell below relies on this to tell variants from WT rows.
expected_df = pd.concat([WTdf.convert_dtypes(), singles_df], ignore_index=True)
expected_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,nt_seq,WT,pos,aa_pos,alt_codons,alt_aa
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
2,CN_alp_r1_F1_T0,2,71,CN_F1,CN,alp,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
3,CN_alp_r2_F1_T0,2,71,CN_F1,CN,alp,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
4,CN_a_r1_F1_T1,2,71,CN_F1,CN,a,r1,F1,T1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420427,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,AGG,R
420428,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGT,G
420429,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGC,G
420430,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGA,G


In [26]:
# Translate every expected nucleotide sequence (WT and variants alike)
expected_df['aa_seq'] = [get_aa_seq(nt, codon_dic) for nt in expected_df['nt_seq']]
expected_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,WT_aa,nt_seq,WT,pos,aa_pos,alt_codons,alt_aa,aa_seq
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
2,CN_alp_r1_F1_T0,2,71,CN_F1,CN,alp,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
3,CN_alp_r2_F1_T0,2,71,CN_F1,CN,alp,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
4,CN_a_r1_F1_T1,2,71,CN_F1,CN,a,r1,F1,T1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420427,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,AGG,R,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
420428,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGT,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
420429,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGC,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...
420430,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGA,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...


In [27]:
# 'WT' is True on wild-type rows and missing on variant rows, so isnull() flags
# the single-codon variants: Nham_codons is 0 for WT and 1 for every variant
expected_df['Nham_codons'] = expected_df.WT.isnull().astype(int)
# Row-wise nucleotide and amino acid distances to the WT; zip(*...) splits the
# (Nham_nt, Nham_a) pairs returned for each row into two columns
expected_df['Nham_nt'], expected_df['Nham_aa'] = zip(*expected_df.apply(lambda row: get_Hamming_distances(row.WT_seq, row.WT_aa, row.pos, row.alt_codons, row.alt_aa), axis=1))
expected_df

Unnamed: 0,Sample_name,Pos_start,Pos_stop,Mutated_seq,Species,Mating_type,Replicate,Fragment,Timepoint,WT_seq,...,nt_seq,WT,pos,aa_pos,alt_codons,alt_aa,aa_seq,Nham_codons,Nham_nt,Nham_aa
0,CN_a_r1_F1_T0,2,71,CN_F1,CN,a,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,0,0
1,CN_a_r2_F1_T0,2,71,CN_F1,CN,a,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,0,0
2,CN_alp_r1_F1_T0,2,71,CN_F1,CN,alp,r1,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,0,0
3,CN_alp_r2_F1_T0,2,71,CN_F1,CN,alp,r2,F1,T0,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,0,0
4,CN_a_r1_F1_T1,2,71,CN_F1,CN,a,r1,F1,T1,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,...,TCTCCTGTTGAAGGTTCTCCAGCTAAGCCAGAAGATTACCCACACT...,True,non-applicable,non-applicable,non-applicable,non-applicable,SPVEGSPAKPEDYPHFMSVAHEQALKSLSEGGIPIGAALVHLPTSR...,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420427,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,AGG,R,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,1,3,1
420428,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGT,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,1,2,1
420429,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGC,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,1,2,1
420430,SC_alp_r2_F3_T3,92,158,SC_F3,SC,alp,r2,F3,T3,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,...,GACATGTGTACTGGTGCCATTATTATGTACGGTATCCCAAGATGTG...,,66,158,GGA,G,DMCTGAIIMYGIPRCVVGENVNFKSKGEKYLQTRGHEVVVVDDERC...,1,1,1


In [28]:
# The WT reference columns were only needed to compute the distances above
expected_df = expected_df.drop(columns=['WT_seq', 'WT_aa'])

In [29]:
# Write the table of expected sequences; the row index is written too (pandas default)
expected_df.to_csv(output_file)

In [30]:
# Number of distinct nucleotide and protein sequences expected per species and fragment
(
    expected_df
    .groupby(['Species', 'Fragment'])[['nt_seq', 'aa_seq']]
    .agg('nunique')
    .reset_index()
)

Unnamed: 0,Species,Fragment,nt_seq,aa_seq
0,CN,F1,4411,1401
1,CN,F2,4474,1421
2,CN,F3,4474,1421
3,SC,F1,4222,1341
4,SC,F2,4474,1421
5,SC,F3,4222,1341
