# Annotate clade-wise mutation counts

## Inputs from snakemake

In [None]:
# Input file paths supplied by the Snakemake rule
clade_founder, counts, rna_struct = (
    snakemake.input.clade_founder,
    snakemake.input.counts,
    snakemake.input.rna_struct,
)
# Output file paths written by this notebook
founder_csv, counts_csv = (
    snakemake.output.founder_csv,
    snakemake.output.counts_csv,
)

## Import Python modules

In [1]:
import os
import pandas as pd
import numpy as np

## Clade Founders

Read in the clade founder sequences, adding a column that gives each site's sequence context (its 3-mer motif).

In [2]:
# Read the clade founder table (data from jbloomlab/SARS2-mut_fitness/results),
# ordered by clade and then by site so each clade's nucleotides are in site order
founder_df = pd.read_csv(clade_founder).sort_values(['clade', 'site'])

# Build one founder sequence string per clade by joining its 'nt' column
founder_seq_dict = {
    clade: ''.join(clade_rows['nt'])
    for clade, clade_rows in founder_df.groupby('clade')
}

def get_motif(site, clade):
    """Return the 3-mer motif around `site` in the founder sequence of `clade`.

    The founder sequence is looked up in `founder_seq_dict` and sliced so that
    `site` (treated as a 1-based position in that string) is the middle
    character of the returned motif.

    NOTE(review): assumes each founder sequence starts at site 1 and has no
    gaps, so that string index == site - 1 -- confirm against the input table.
    """
    first = site - 2
    return founder_seq_dict[clade][first:first + 3]
# For each row, look up the site's 3-mer motif in its clade's founder sequence.
# The lowest and highest sites are given NaN instead of a motif.
min_and_max_sites = [founder_df['site'].min(), founder_df['site'].max()]
founder_df['motif'] = [
    np.nan if site in min_and_max_sites else get_motif(site, clade)
    for site, clade in zip(founder_df['site'], founder_df['clade'])
]

# Add columns giving the reference codon and motif, taken from clade 19A
ref_cols = (
    founder_df.loc[founder_df['clade'] == '19A', ['site', 'codon', 'motif']]
    .rename(columns={'codon': 'ref_codon', 'motif': 'ref_motif'})
)

# Attach the reference columns to every clade's rows by site, then rename the
# site column to 'nt_site'
founder_df = (
    founder_df
    .merge(ref_cols, on='site', how='left')
    .rename(columns={'site': 'nt_site'})
)

In [None]:
# Write the annotated clade founder table to the founder_csv output path
founder_df.to_csv(founder_csv, index=False)

## Read in and annotate counts data

Read in the dataframe of actual and expected mutation counts, and add columns with metadata.

In [3]:
# Read in data

# Read counts data (from jbloomlab/SARS2-mut_fitness/results) and keep only
# the rows for the 'all' subset.
counts_df = pd.read_csv(counts)
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments below do not raise SettingWithCopyWarning.
counts_df = counts_df.query("subset == 'all'").copy()

# Add metadata: split each nt_mutation string (e.g. 'A1C') into the character
# before the digits (wt_nt) and the character after them (mut_nt), then join
# the two into a two-letter mutation type (e.g. 'AC').
counts_df[['wt_nt', 'mut_nt']] = counts_df['nt_mutation'].str.extract(r'(\w)\d+(\w)')
counts_df['mut_type'] = counts_df['wt_nt'] + counts_df['mut_nt']

def get_mut_class(row):
    """Classify one mutation row.

    Checks are applied in this order and the first match wins:
    `row['synonymous']` -> 'synonymous'; `row['noncoding']` -> 'noncoding';
    a '*' in `row['mutant_aa']` -> 'nonsense'; `row['mutant_aa']` differing
    from `row['clade_founder_aa']` -> 'nonsynonymous'.

    Raises:
        ValueError: if none of the above applies; the arguments are the
            mutant and clade-founder amino acids.
    """
    if row['synonymous']:
        return 'synonymous'
    if row['noncoding']:
        return 'noncoding'

    mutant_aa = row['mutant_aa']
    if '*' in mutant_aa:
        return 'nonsense'

    founder_aa = row['clade_founder_aa']
    if mutant_aa != founder_aa:
        return 'nonsynonymous'

    # Same amino acid but not flagged synonymous: inconsistent input row
    raise ValueError(mutant_aa, founder_aa)

# Classify every row with get_mut_class
counts_df['mut_class'] = counts_df.apply(get_mut_class, axis=1)

# Add column indicating if clade is pre-Omicron or Omicron.
# Any clade that is not in the list below is labelled 'omicron'.
pre_omicron_clades = [
    '20A', '20B', '20C', '20E', '20G', '20H',
    '20I', '20J', '21C', '21I', '21J',
]
counts_df['pre_omicron_or_omicron'] = (
    counts_df['clade']
    .isin(pre_omicron_clades)
    .map({True: 'pre_omicron', False: 'omicron'})
)

# Add column indicating if a site is before the light switch boundary
def light_switch(mut, site, lb1=13467, lb2=21562):
    """Return whether `site` lies before the boundary for mutation type `mut`.

    Args:
        mut: two-letter mutation type, e.g. "CT".
        site: nucleotide site.
        lb1: boundary applied to "CT" mutations.
        lb2: boundary applied to "AT", "CG" and "GC" mutations.

    Returns:
        True if `mut` is one of the types above and `site` is strictly below
        its boundary; False otherwise, including for every other mutation type.
    """
    if mut in ("AT", "CG", "GC"):
        return bool(site < lb2)
    if mut == "CT":
        return bool(site < lb1)
    return False

# Evaluate light_switch for each row's mutation type and site
counts_df['nt_site_before_boundary'] = [
    light_switch(mut_type, nt_site)
    for mut_type, nt_site in zip(counts_df['mut_type'], counts_df['nt_site'])
]

# Add column indicating whether RNA sites from the Lan, 2022, Nature Comm. structure
# are predicted to be paired, using code from Hensel, 2023, biorxiv
#filename = '../data/lan_2022/41467_2022_28603_MOESM11_ESM.txt'
with open(rna_struct) as f:
    lines = [line.rstrip().split() for line in f]

# Skip the first line, then keep the 1st and 5th whitespace-separated fields.
# NOTE(review): presumably field 1 is the site and field 5 its pairing partner,
# with 0 meaning unpaired -- confirm against the structure file format.
paired = np.array([[int(fields[0]), int(fields[4])] for fields in lines[1:]])
paired_dict = dict(zip(paired[:, 0], paired[:, 1]))


def assign_ss_pred(site):
    """Return 'nd' if `site` is not in `paired_dict`, 'unpaired' if its entry
    is 0, and 'paired' otherwise."""
    partner = paired_dict.get(site)
    if partner is None:
        return 'nd'
    return 'unpaired' if partner == 0 else 'paired'


counts_df['ss_prediction'] = counts_df['nt_site'].map(assign_ss_pred)
# 1/0 indicator of whether the site is predicted to be unpaired
counts_df['unpaired'] = (counts_df['ss_prediction'] == 'unpaired').astype('int64')

# Add columns giving a site's motif relative to the clade founder
# and the reference sequence
founder_motifs = founder_df[['nt_site', 'clade', 'motif', 'ref_motif']]
counts_df = pd.merge(
    counts_df,
    founder_motifs,
    how='left',
    on=['nt_site', 'clade'],
)

In [4]:
# Assign motif to genome edges
# Site 1: the motif is the site's own wild-type nucleotide between a fixed
# leading "A" and trailing "T"
is_first_site = counts_df.nt_site == 1
nt_1 = counts_df.loc[is_first_site, 'wt_nt'].unique()
for first_nt in nt_1:
    first_site_rows = is_first_site & (counts_df.wt_nt == first_nt)
    counts_df.loc[first_site_rows, 'motif'] = "A" + first_nt + "T"

# Site 29903: fixed motif
is_last_site = counts_df.nt_site == 29903
counts_df.loc[is_last_site, 'motif'] = 'AAA'

In [5]:
# Preview the first rows of the annotated counts table
counts_df.head()

Unnamed: 0,clade,subset,nt_site,nt_mutation,exclude,masked_in_usher,expected_count,actual_count,clade_founder_nt,gene,...,wt_nt,mut_nt,mut_type,mut_class,pre_omicron_or_omicron,nt_site_before_boundary,ss_prediction,unpaired,motif,ref_motif
0,20A,all,1,A1C,True,True,0.96873,0,A,noncoding,...,A,C,AC,noncoding,pre_omicron,False,unpaired,1,AAT,
1,20A,all,1,A1G,True,True,3.6091,0,A,noncoding,...,A,G,AG,noncoding,pre_omicron,False,unpaired,1,AAT,
2,20A,all,1,A1T,True,True,1.2782,0,A,noncoding,...,A,T,AT,noncoding,pre_omicron,True,unpaired,1,AAT,
3,20A,all,2,T2A,True,True,0.90342,0,T,noncoding,...,T,A,TA,noncoding,pre_omicron,False,unpaired,1,ATT,ATT
4,20A,all,2,T2C,True,True,3.5889,0,T,noncoding,...,T,C,TC,noncoding,pre_omicron,False,unpaired,1,ATT,ATT


In [None]:
# Save to file
# Drop the 'subset' column (every remaining row is from the 'all' subset).
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError because the column has already been removed.
counts_df = counts_df.drop(columns=['subset'], errors='ignore')

# Always write the output, as is done for founder_csv. Skipping the write when
# the file already exists would silently leave a stale file from an earlier
# run in place of the freshly computed table.
counts_df.to_csv(counts_csv, index=False)