# Make a dataframe with curated mutation counts

## Import Python modules

In [None]:
import os
import pandas as pd
import numpy as np
import random

## Identify sites that are conserved in all clade founders

Read in clade founder sequences, add a column giving a site's sequence context, then make a list of sites where the site, its codon, and its 3mer sequence context are conserved across all founders and identical to the Wuhan-Hu-1 reference sequence (ignoring the codon requirement for nonidentical sites).

**NOTE**: our GitHub repository does not track the directory defined by the variable `fitness_results_dir`. This directory contains results of cloning the [Bloom and Neher repository](https://github.com/jbloomlab/SARS2-mut-fitness) for estimating fitness effects and running the pipeline in that repository on an UShER tree with all sequences in GISAID as of 2024-04-24.

However, we do track the files that this notebook generates, including the curated site-specific mutation counts in the file `results/curated_mut_counts.csv`.

In [None]:
# Load the per-clade founder nucleotides, ordered by clade and then by site
fitness_results_dir = '../SARS2-mut-fitness/results_gisaid_2024-04-24'
founder_nts_file = os.path.join(
    fitness_results_dir, 'clade_founder_nts/clade_founder_nts.csv'
)
founder_df = pd.read_csv(founder_nts_file).sort_values(['clade', 'site'])

# Join each clade's nucleotides (already in site order) into a single string
founder_seq_dict = {
    clade: ''.join(clade_rows['nt'])
    for (clade, clade_rows) in founder_df.groupby('clade')
}

# For each row, get the site's 3mer motif in the corresponding founder sequence
def get_motif(site, clade):
    """Return the three nucleotides around `site` in the founder of `clade`.

    The sequence is looked up in `founder_seq_dict` and sliced from string
    index `site - 2` up to (not including) `site + 1`.
    NOTE(review): this is the site plus its two neighbors only if sites are
    numbered from 1 without gaps -- confirm against the input file.
    """
    start = site - 2
    return founder_seq_dict[clade][start:start + 3]
# The first and last sites lack a neighbor on one side, so they get no motif
min_and_max_sites = [founder_df['site'].min(), founder_df['site'].max()]

def _motif_or_nan(row):
    """Return the row's 3mer motif, or NaN when the row is the first or last site."""
    if row['site'] in min_and_max_sites:
        return np.nan
    return get_motif(row['site'], row['clade'])

founder_df['motif'] = founder_df.apply(_motif_or_nan, axis=1)

# Look up each site's codon and motif in the 19A founder, which is used as the
# reference here, and attach them to every clade's row for that site
ref_df = (
    founder_df
    .loc[founder_df['clade'] == '19A', ['site', 'codon', 'motif']]
    .rename(columns={'codon': 'ref_codon', 'motif': 'ref_motif'})
)
founder_df = founder_df.merge(ref_df, on='site', how='left')

# A site is conserved when every clade founder has the same codon and motif as
# the reference at that site. Keep only the rows that match the reference, then
# retain the sites that still have one row for each clade. Rows with a missing
# codon or motif never match, because a comparison with NaN is False.
matches_ref = (
    founder_df['codon'].eq(founder_df['ref_codon']) &
    founder_df['motif'].eq(founder_df['ref_motif'])
)
data = founder_df.loc[matches_ref]
site_counts = data['site'].value_counts()
nclades = founder_df['clade'].nunique(dropna=False)
conserved_sites = site_counts.loc[site_counts == nclades].index
founder_df['same_context_all_founders'] = founder_df['site'].isin(conserved_sites)
# Duplicate `site` under the column name that the counts dataframe uses
founder_df['nt_site'] = founder_df['site']

print('Number of sites in genome:', founder_df['site'].nunique(dropna=False))
print('Number of conserved sites:', len(conserved_sites))

## Read in and curate counts data

Read in dataframe on actual and expected counts, and add columns with metadata.

In [None]:
# Load the table of expected vs. actual mutation counts
counts_file = os.path.join(
    fitness_results_dir,
    'expected_vs_actual_mut_counts/expected_vs_actual_mut_counts.csv'
)
counts_df = pd.read_csv(counts_file)

# Add metadata: split each mutation string into the character before the site
# number and the character after it, then join the two into a mutation type
nt_pairs = counts_df['nt_mutation'].str.extract(r'(\w)\d+(\w)')
counts_df['wt_nt'] = nt_pairs[0]
counts_df['mut_nt'] = nt_pairs[1]
counts_df['mut_type'] = counts_df['wt_nt'] + counts_df['mut_nt']

def get_mut_class(row):
    """Classify a mutation as synonymous, noncoding, nonsense, or nonsynonymous.

    The checks run in that order and the first one that applies wins.

    Args:
        row: mapping (e.g. a dataframe row) with the keys `synonymous`,
            `noncoding`, `mutant_aa`, and `clade_founder_aa`.

    Returns:
        One of 'synonymous', 'noncoding', 'nonsense', or 'nonsynonymous'.

    Raises:
        ValueError: if no class applies, i.e. the mutant and founder amino
            acids are equal although the row is flagged as neither synonymous
            nor noncoding. The two amino acids are passed as the error args.
    """
    if row['synonymous']:
        return 'synonymous'
    if row['noncoding']:
        return 'noncoding'

    # Amino acids are only inspected once the two flags above are ruled out
    mutant_aa = row['mutant_aa']
    if '*' in mutant_aa:
        return 'nonsense'

    founder_aa = row['clade_founder_aa']
    if mutant_aa != founder_aa:
        return 'nonsynonymous'

    raise ValueError(mutant_aa, founder_aa)

counts_df['mut_class'] = counts_df.apply(get_mut_class, axis=1)

# Label each row by whether its clade is pre-Omicron or Omicron.
# NOTE(review): every clade missing from this list is labeled 'omicron' --
# confirm that the list covers all pre-Omicron clades present in the data
pre_omicron_clades = [
    '20A', '20B', '20C', '20E', '20G', '20H', '20I', '20J', '21C', '21I', '21J'
]
counts_df['pre_omicron_or_omicron'] = (
    counts_df['clade']
    .isin(pre_omicron_clades)
    .map({True: 'pre_omicron', False: 'omicron'})
)

# Flag sites that lie before nucleotide site 21,555
counts_df['nt_site_before_21555'] = counts_df['nt_site'].lt(21555)

# Add column indicating whether RNA sites from the Lan, 2022, Nature Comm. structure
# are predicted to be paired, using code from Hensel, 2023, biorxiv
filename = '../data/lan_2022/41467_2022_28603_MOESM11_ESM.txt'
with open(filename) as f:
    # Split every line into whitespace-delimited fields
    lines = [line.rstrip().split() for line in f]

# Skip the first line and keep fields 0 and 4 of each record as integers. The
# lookup below is keyed by field 0 and treats a 0 in field 4 as "unpaired".
paired = np.array([[int(fields[0]), int(fields[4])] for fields in lines[1:]])
site_column, pairing_column = paired[:, 0], paired[:, 1]
paired_dict = dict(zip(site_column, pairing_column))
def assign_ss_pred(site):
    """Return the secondary-structure label for `site` based on `paired_dict`.

    Returns 'nd' when the site has no entry, 'unpaired' when its entry equals
    0, and 'paired' for any other entry.
    """
    if site in paired_dict:
        return 'unpaired' if paired_dict[site] == 0 else 'paired'
    return 'nd'
counts_df['ss_prediction'] = counts_df['nt_site'].apply(assign_ss_pred)

# Attach each site's motif in its own clade founder and in the reference
# sequence; the left join keeps every row of the counts dataframe
motif_cols = ['nt_site', 'clade', 'motif', 'ref_motif']
counts_df = counts_df.merge(
    founder_df[motif_cols], how='left', on=['nt_site', 'clade']
)

Create a dataframe with curated counts. We curate the data in the following ways:
* only analyze sites that pass the above conservation criteria
* ignore sites that are annotated as being masked in any clade of the UShER tree (`masked_in_usher == True`), are annotated for exclusion (`exclude == True`), or were identified to be highly homoplastic by De Maio et al. (https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473)

Then, subset the dataframe to one row for each possible mutation, including the following columns:
* `actual_count`: gives the mutation's count for `subset == all` from the above dataframe of counts
* additional columns give actual counts for subsets of the data, such as geographical subsets (England vs. USA) or phylogenetic subsets (pre-Omicron vs. Omicron)

In [None]:
# Collect every site that is flagged as masked or excluded in at least one row
# of the counts dataframe, i.e. in any clade of the UShER tree
masked_or_excluded = (
    counts_df['masked_in_usher'].eq(True) | counts_df['exclude'].eq(True)
)
sites_to_ignore = list(counts_df.loc[masked_or_excluded, 'nt_site'].unique())

# Homoplastic sites from De Maio et al., which we will also ignore
sites_to_ignore.extend([
    187, 1059, 2094, 3037, 3130, 6990, 8022, 10323, 10741, 11074, 13408,
    14786, 19684, 20148, 21137, 24034, 24378, 25563, 26144, 26461, 26681, 28077,
    28826, 28854, 29700, 4050, 13402, 11083, 15324, 21575
])

# Aggregate counts across clades. Every column that is neither a count-like
# quantity nor a clade label is used as a grouping key, so each mutation's
# metadata is carried through while its numeric columns are summed.
ignore_cols = [
    'expected_count', 'actual_count', 'count_terminal', 'count_non_terminal', 'mean_log_size',
    'clade', 'pre_omicron_or_omicron'
]
groupby_cols = [
    col for col in counts_df.columns.values
    if col not in ignore_cols
]

# Rows at curated sites: conserved in all clade founders and not in the ignore
# list. The mask is identical for every subset, so it is computed only once.
at_curated_site = (
    counts_df['nt_site'].isin(conserved_sites) &
    ~counts_df['nt_site'].isin(sites_to_ignore)
)

def _aggregate_curated_counts(row_mask):
    """Sum the numeric columns per mutation over the selected curated rows.

    Args:
        row_mask: boolean Series aligned with `counts_df` that selects which
            rows (e.g. which subset or group of clades) to aggregate.

    Returns:
        Dataframe with one row per combination of `groupby_cols`, holding
        those columns plus the summed numeric columns.

    Raises:
        AssertionError: if a mutation ends up with more than one row, e.g.
            because one of the grouping columns differs between clades.
    """
    aggregated = (
        counts_df[at_curated_site & row_mask]
        .groupby(groupby_cols, as_index=False)
        .agg('sum', numeric_only=True)
    )
    assert not aggregated['nt_mutation'].duplicated(keep=False).any()
    return aggregated

def _add_subset_counts(curated_df, subset, row_mask):
    """Return `curated_df` with an added `actual_count_{subset}` column.

    Args:
        curated_df: dataframe with one row per mutation (`nt_mutation`).
        subset: label used in the name of the new column.
        row_mask: boolean Series aligned with `counts_df` selecting the rows
            whose actual counts are summed into the new column.

    Returns:
        A new dataframe with the same rows as `curated_df` plus the new column.

    Raises:
        AssertionError: if the subset does not cover exactly the mutations
            that are present in `curated_df`.
    """
    subset_data = _aggregate_curated_counts(row_mask)
    assert len(subset_data) == len(curated_df)
    count_col = f'actual_count_{subset}'
    merged = curated_df.merge(
        subset_data.rename(columns={'actual_count': count_col})[
            ['nt_mutation', count_col]
        ],
        on='nt_mutation', validate='one_to_one'
    )
    # An inner merge silently drops mutations that are missing from either
    # side, so equal lengths beforehand are not enough: check nothing was lost
    assert len(merged) == len(curated_df)
    return merged

# ... all clades for "all" subset
is_all_subset = counts_df['subset'] == 'all'
curated_counts_df = _aggregate_curated_counts(is_all_subset)
# A sum of `mean_log_size` across clades is not kept in the curated output
del curated_counts_df['mean_log_size']

# ... England or USA, and merge counts column with above dataframe
for subset in ['England', 'USA']:
    curated_counts_df = _add_subset_counts(
        curated_counts_df, subset, counts_df['subset'] == subset
    )

# ... pre-Omicron or Omicron clades, and merge counts column with above dataframe
for subset in ['pre_omicron', 'omicron']:
    curated_counts_df = _add_subset_counts(
        curated_counts_df, subset,
        is_all_subset & (counts_df['pre_omicron_or_omicron'] == subset)
    )

# Sanity checks before saving: every curated row has the reference motif, and
# each mutation appears exactly once
assert not (curated_counts_df['motif'] != curated_counts_df['ref_motif']).any()
assert curated_counts_df['nt_mutation'].is_unique
curated_counts_df = curated_counts_df.drop(
    columns=['subset', 'exclude', 'masked_in_usher']
)

# Save curated counts to an output file. An existing file is left untouched, so
# it has to be deleted first in order to regenerate the output.
outfile = '../results/curated_mut_counts.csv'
if not os.path.isfile(outfile):
    curated_counts_df.to_csv(outfile, index=False)

curated_counts_df.head()

Summary statistics of mutations in the dataset

In [None]:
# Count distinct mutations before and after curation
n_muts_full = counts_df['nt_mutation'].nunique(dropna=False)
n_muts_curated = curated_counts_df['nt_mutation'].nunique(dropna=False)
print('Number of unique muts:')
print('In the full dataset:', n_muts_full)
print('In the curated dataset:', n_muts_curated)

In [None]:
# Tally curated mutations by class; the tally is shown as the cell's output
print('Number of unique curated mutations per category:')
mut_class_counts = curated_counts_df['mut_class'].value_counts()
mut_class_counts

In [None]:
# Column-wise totals of the overall, pre-Omicron, and Omicron actual counts
print('Total number of actual counts in the full dataset or specific subsets:')
count_cols = ['actual_count', 'actual_count_pre_omicron', 'actual_count_omicron']
curated_counts_df[count_cols].sum()

## Create train and test splits using data on synonymous mutations

In [None]:
# Get data for synonymous mutations
is_synonymous = curated_counts_df['synonymous'].eq(True)
splits_df = curated_counts_df.loc[is_synonymous].copy()

# Size the training and testing sets for an 80/20 split, rounding the training
# size down so that the two sizes always add up to the total
ntotal = len(splits_df)
ntrain = int(ntotal * 0.8)
ntest = ntotal - ntrain
print('Total number of synonymous mutations in data:', ntotal)
print('Number in each training set:', ntrain)
print('Number in each test set:', ntest)

# Generate 10 random train/test splits. One list of labels is reshuffled in
# place for each split and stored as a new column; the fixed seed makes the
# sequence of shuffles reproducible.
split_list = ['train'] * ntrain + ['test'] * ntest
assert len(split_list) == ntotal
nsplits = 10
random.seed(1)
for i in range(nsplits):
    random.shuffle(split_list)
    splits_df[f'split_{i}'] = split_list

# Write dataframe to an output file, unless that file already exists
outfile = '../results/syn_mut_train_test_splits.csv'
if not os.path.isfile(outfile):
    splits_df.to_csv(outfile, index=False)

splits_df

## Use the dataframe with counts at all sites to make a list of gene boundaries

In [None]:
# Get gene boundaries: the smallest and largest nucleotide site that carries
# each value of the `gene` column
gene_boundaries_df = counts_df.groupby('gene', as_index=False).agg(
    min_site = ('nt_site', 'min'),
    max_site = ('nt_site', 'max'),
)

# Rename the ORF1ab annotations to ORF1a and ORF1b. The result is assigned back
# rather than calling `replace(..., inplace=True)` on the selected column: that
# chained in-place form raises a FutureWarning in recent pandas and has no
# effect under copy-on-write, which would leave the names unchanged and cause
# the 'ORF1a;ORF1ab' entry to be removed by the ';' filter below.
gene_boundaries_df['gene'] = gene_boundaries_df['gene'].replace({
    'ORF1a;ORF1ab': 'ORF1a',
    'ORF1ab': 'ORF1b',
})

# Drop the remaining entries whose name still contains ';' as well as the
# noncoding entry, then order the genes by their first site. The index is reset
# after sorting so that it follows the displayed order.
gene_boundaries_df = (
    gene_boundaries_df[
        ~(gene_boundaries_df['gene'].str.contains(';')) &
        ~(gene_boundaries_df['gene'].isin(['noncoding']))
    ]
    .sort_values('min_site')
    .reset_index(drop=True)
)

# Save list to file, unless that file already exists
outfile = '../results/gene_boundaries.csv'
if not os.path.isfile(outfile):
    gene_boundaries_df.to_csv(outfile, index=False)

gene_boundaries_df

## Compute the total number of actual counts in a given subtree


In [None]:
# Sum actual counts per subtree over the "all" subset, leaving out ignored sites
subset_is_all = counts_df['subset'].eq('all')
site_is_kept = ~counts_df['nt_site'].isin(sites_to_ignore)
(
    counts_df.loc[subset_is_all & site_is_kept]
    .groupby(['pre_omicron_or_omicron'], as_index=False)['actual_count']
    .sum()
)