# Notebook for computing the fitness effects of amino acid mutations

## Snakemake input

In [None]:
# Input file provided by the Snakemake rule
ntmut_fit = snakemake.input.ntfit_csv

# Rule parameters
orf1ab_to_nsps = snakemake.params.orf_to_nsps
gene_overlaps = snakemake.params.gene_ov
fitness_pseudocount = snakemake.params.fit_pseudo

# Output file the Snakemake rule expects this notebook to write
output = snakemake.output.aafit_csv

## Import packages

In [1]:
import numpy as np
import pandas as pd
import sys
import os

In [2]:
# Put the parent directory on the import path so that `from modules import ...`
# can be resolved (assumes the `modules` package lives one level up).
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [3]:
from modules import aamutfit

Columns to be exploded

In [6]:
# Columns handed to the aamutfit helpers as the ones to explode
# (per the notes below: one separate row per gene).
explode_cols = ["gene", "clade_founder_aa", "mutant_aa", "codon_site", "aa_mutation"]



Read data, then:

* Exclude mutations in overlapping reading frames specified for exclusion
* Explode dataframe to make a separate line for each gene.
* Drop ORF1a, the reason being that after we exclude overlapping reading frame sites there aren't any ORF1a sites not also in ORF1ab.
* Aggregate the expected and actual counts of all nucleotide mutations that give the same amino-acid mutation, separately for each clade.

In [22]:
# Read in the fitness of nucleotide mutations.
# Load straight from the Snakemake input path instead of from `ntmut_fit` itself:
# re-binding the path variable to the DataFrame made this cell fail when re-run,
# because the second run handed a DataFrame to `pd.read_csv`.
ntmut_fit = pd.read_csv(snakemake.input.ntfit_csv)

In [23]:
# Preview the first rows of the nucleotide-mutation fitness table.
ntmut_fit.head()

Unnamed: 0,cluster,nt_site,nt_mutation,gene,codon_site,aa_mutation,synonymous,noncoding,expected_count,predicted_count,actual_count,tau_squared,f_max,left_conf_int,right_conf_int,f_mean,f_st_dev
0,BA.2,1,A1C,noncoding,noncoding,noncoding,False,True,2.763,6.22933,0,1.133304,-2.573587,1.732517,1.361119,-2.916866,1.615931
1,BA.2,1,A1T,noncoding,noncoding,noncoding,False,True,5.85249,2.605232,0,1.528482,-1.90773,1.98031,1.364161,-2.404429,1.708155
2,BA.2,1,A1G,noncoding,noncoding,noncoding,False,True,22.546,33.485375,0,1.264059,-3.651029,1.574816,1.351596,-3.921,1.502344
3,BA.2,2,T2A,noncoding,noncoding,noncoding,False,True,4.85726,1.214439,0,1.432641,-1.428756,2.124832,1.303251,-2.010018,1.777772
4,BA.2,2,T2C,noncoding,noncoding,noncoding,False,True,19.8496,37.529563,0,1.408045,-3.679257,1.6302,1.296212,-3.980191,1.495832


Get only coding mutations

In [12]:
# Keep only coding mutations, passing the gene-overlap config and the columns to explode.
# NOTE(review): the filtering/exploding itself happens inside aamutfit.get_coding,
# which is not visible in this notebook — confirm the details there.
ntmut_fit_coding = aamutfit.get_coding(ntmut_fit, gene_overlaps, explode_cols)

In [13]:
# Preview the first rows of the coding-only mutation table.
ntmut_fit_coding.head()

Unnamed: 0,cluster,nt_site,nt_mutation,gene,codon_site,aa_mutation,synonymous,noncoding,expected_count,predicted_count,...,tau_squared,f_max,left_conf_int,right_conf_int,f_mean,f_st_dev,is_overlapping,overlap_to_exclude,clade_founder_aa,mutant_aa
795,BA.2,266,A266G,ORF1ab,1,M1V,False,False,22.546,33.016702,...,0.885954,-3.757701,1.551756,1.291045,-4.02732,1.458397,False,False,M,V
796,BA.2,266,A266C,ORF1ab,1,M1L,False,False,2.763,8.187283,...,1.047344,-2.774285,1.699045,1.394591,-3.096819,1.584339,False,False,M,L
797,BA.2,266,A266T,ORF1ab,1,M1L,False,False,5.85249,1.668554,...,1.416492,-0.003785,1.254955,1.175321,-0.058746,1.221781,False,False,M,L
798,BA.2,267,T267C,ORF1ab,1,M1T,False,False,19.8496,41.542965,...,1.41454,-2.914588,1.296379,1.175352,-3.048273,1.241749,False,False,M,T
799,BA.2,267,T267A,ORF1ab,1,M1K,False,False,4.85726,4.71238,...,1.432641,-0.792537,1.223349,1.174102,-0.852807,1.206019,False,False,M,K


Aggregate counts for amino acid mutations

In [14]:
# Aggregate the per-nucleotide-mutation counts into per-amino-acid-mutation counts.
# NOTE(review): grouping keys are chosen inside aamutfit.aggregate_counts — confirm there.
aa_counts = aamutfit.aggregate_counts(ntmut_fit_coding, explode_cols)

Adding naive fitness estimates

In [15]:
# The return value is not captured; a `naive_fitness` column shows up on `aa_counts`
# in the preview that follows, so the helper presumably modifies the frame in place
# — TODO confirm in aamutfit.naive_fitness.
aamutfit.naive_fitness(aa_counts, fitness_pseudocount=fitness_pseudocount)

In [16]:
# Preview the aggregated amino-acid counts, now including the naive fitness column.
aa_counts.head()

Unnamed: 0,cluster,gene,clade_founder_aa,mutant_aa,aa_site,aa_mutation,expected_count,predicted_count,actual_count,tau_squared,naive_fitness
0,BA.2,E,A,A,22,A22A,27.39653,28.863105,10,4.393839,-0.977127
1,BA.2,E,A,A,32,A32A,194.79774,103.35718,472,3.655382,0.883512
2,BA.2,E,A,A,36,A36A,121.9427,146.546479,17,5.260903,-1.945442
3,BA.2,E,A,A,41,A41A,121.9427,203.566927,76,5.492444,-0.470352
4,BA.2,E,A,D,22,A22D,12.6359,12.906202,0,1.567256,-3.268496


Dataframe with refined fitness estimates

In [18]:
# Compute the refined amino-acid fitness estimates from the coding nucleotide mutations.
# NOTE(review): the estimation method lives in aamutfit.aa_fitness — not visible here.
aa_fit = aamutfit.aa_fitness(ntmut_fit_coding, explode_cols)

In addition to the entries for full ORF1ab, we also want to have mutations numbered by the nsp naming.

First, make a data frame that converts the numbering:

In [17]:
# Build the data frame that converts ORF1ab numbering to nsp numbering
# from the mapping supplied through the Snakemake params.
orf1ab_to_nsps_df = aamutfit.map_orf1ab_to_nsps(orf1ab_to_nsps)

Now we add the estimates for the nsp proteins to our dataframes that have ORF1ab. Note that this means mutations in both ORF1ab and an nsp show up **twice** in the data frame with different names, so we add a column to indicate which genes are a subset of ORF1ab:

In [19]:
# Add nsp-numbered entries to both the counts and the fitness frames.
# NOTE(review): both names are re-bound to the helper's result, so re-running this cell
# feeds already-augmented frames back into add_nsps — presumably this would duplicate the
# nsp rows; confirm in aamutfit.add_nsps or re-run from the cells that create these frames.
aa_counts = aamutfit.add_nsps(aa_counts, orf1ab_to_nsps_df)
aa_fit = aamutfit.add_nsps(aa_fit, orf1ab_to_nsps_df)

Merge counts and fitness dataframes and write to file

In [20]:
# Combine the refined fitness estimates with the aggregated counts into one table.
# NOTE(review): join keys/direction are decided inside aamutfit.merge_aa_df — confirm there.
aamut_fitness = aamutfit.merge_aa_df(aa_fit, aa_counts, explode_cols)

In [21]:
# Preview the merged counts + fitness table before writing it out.
aamut_fitness.head()

Unnamed: 0,cluster,gene,clade_founder_aa,mutant_aa,aa_site,aa_mutation,expected_count,predicted_count,actual_count,tau_squared,naive_fitness,subset_of_ORF1ab,delta_fitness,uncertainty
0,BA.2,E,A,A,22,A22A,27.39653,28.863105,10,4.393839,-0.977127,False,-1.238764,0.750704
1,BA.2,E,A,A,32,A32A,194.79774,103.35718,472,3.655382,0.883512,False,0.518985,0.575201
2,BA.2,E,A,A,36,A36A,121.9427,146.546479,17,5.260903,-1.945442,False,-1.777703,0.654703
3,BA.2,E,A,A,41,A41A,121.9427,203.566927,76,5.492444,-0.470352,False,-0.823123,0.624146
4,BA.2,E,A,D,22,A22D,12.6359,12.906202,0,1.567256,-3.268496,False,-3.319033,1.571656


In [None]:
# Write the merged table to the Snakemake output path; index=False leaves the row index
# out of the CSV.
aamut_fitness.to_csv(output, index=False)