# Define pre-post Omicron curated counts datasets

## Inputs from Snakemake

In [None]:
# Paths are taken from the `snakemake` object (not defined in this notebook;
# presumably injected by Snakemake when it executes the notebook).
counts_df_file = snakemake.input.mut_counts  # input: mutation counts table, read with pd.read_csv
clade_founder = snakemake.input.clade_founder  # input: clade founder table, read with pd.read_csv
outfile_pre_omicron = snakemake.output.pre_omicron  # output: CSV of curated pre-Omicron counts
outfile_omicron = snakemake.output.omicron  # output: CSV of curated Omicron counts

## Read in Python modules

In [None]:
import os
import pandas as pd

## Read-in counts and clade founders

In [None]:
# Read in the per-clade mutation counts table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
counts_df = pd.read_csv(counts_df_file, low_memory=False)

In [None]:
# Read in the clade founder table. Downstream cells use its `clade`, `nt_site`,
# `codon`, `ref_codon`, `motif` and `ref_motif` columns.
founder_df = pd.read_csv(clade_founder)

## Curate counts data

Determine conserved sites

In [None]:
# Find sites whose codon and motif are conserved across all clade founders:
# keep only founder rows that match the reference codon and motif, count the
# surviving rows per site, and retain the sites whose row count equals the
# number of clades.
matches_reference = (
    founder_df['codon'].eq(founder_df['ref_codon'])
    & founder_df['motif'].eq(founder_df['ref_motif'])
)
data = founder_df.loc[matches_reference]
site_counts = data['nt_site'].value_counts()
# dropna=False keeps a missing clade label as its own value, like len(unique())
nclades = founder_df['clade'].nunique(dropna=False)
conserved_sites = site_counts.loc[site_counts.eq(nclades)].index

Ignore sites that are annotated as being masked in any clade of the UShER tree (`masked_in_usher == True`), are annotated for exclusion (`exclude == True`), or were identified to be highly homoplastic by De Maio et al. (https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473).

In [None]:
# Collect every site that is flagged as masked or excluded in at least one row
# of the counts table (i.e. in any clade of the UShER tree)
flagged_rows = (
    counts_df['masked_in_usher'].eq(True)
    | counts_df['exclude'].eq(True)
)
sites_to_ignore = list(counts_df.loc[flagged_rows, 'nt_site'].unique())

# Homoplastic sites from De Maio et al., which are ignored as well
homoplastic_sites = [
    187, 1059, 2094, 3037, 3130, 6990, 8022, 10323, 10741, 11074, 13408,
    14786, 19684, 20148, 21137, 24034, 24378, 25563, 26144, 26461, 26681, 28077,
    28826, 28854, 29700, 4050, 13402, 11083, 15324, 21575
]
sites_to_ignore.extend(homoplastic_sites)

# Keep a row only if its site is conserved and is not on the ignore list
is_conserved = counts_df['nt_site'].isin(conserved_sites)
is_ignored = counts_df['nt_site'].isin(sites_to_ignore)
curated_counts_df = counts_df[is_conserved & ~is_ignored]

curated_counts_df.head()

Create two dataframes of curated counts:

* Cluster of pre-Omicron clades
* Cluster of Omicron clades

Mutation counts are aggregated across clades belonging to the same cluster.

In [None]:
# Partition the curated counts into the pre-Omicron and Omicron clusters
curated_pre_omicron = curated_counts_df.query('pre_omicron_or_omicron == "pre_omicron"')
curated_omicron = curated_counts_df.query('pre_omicron_or_omicron == "omicron"')

# Sanity check: every retained row must have the reference motif
assert not (curated_pre_omicron['motif'] != curated_pre_omicron['ref_motif']).any()
assert not (curated_omicron['motif'] != curated_omicron['ref_motif']).any()

# Columns that are per-clade quantities or clade labels. They are not used as
# grouping keys; the numeric ones are summed, the non-numeric ones are dropped.
ignore_cols = [
    'expected_count', 'actual_count', 'count_terminal', 'count_non_terminal', 'mean_log_size',
    'clade', 'pre_omicron_or_omicron'
]
# Every other column identifies a mutation and becomes a grouping key
groupby_cols = [
    col for col in curated_counts_df.columns
    if col not in ignore_cols
]

# Sum the numeric count columns over all clades in each cluster.
# NOTE(review): groupby defaults to dropna=True, so a row with a missing value
# in any grouping column is silently left out -- confirm the key columns have no NaN.
curated_pre_omicron = (
    curated_pre_omicron
    .groupby(groupby_cols, as_index=False)
    .sum(numeric_only=True)
)
curated_omicron = (
    curated_omicron
    .groupby(groupby_cols, as_index=False)
    .sum(numeric_only=True)
)



In [None]:
# Check there are no duplicate n.t. mutations
assert curated_pre_omicron['nt_mutation'].is_unique
assert curated_omicron['nt_mutation'].is_unique

Summary statistics of mutations in dataset

In [None]:

# Count distinct nucleotide mutations before and after curation
# (dropna=False counts a missing value as distinct, like len(unique()))
n_muts_full = counts_df['nt_mutation'].nunique(dropna=False)
n_muts_pre_omicron = curated_pre_omicron['nt_mutation'].nunique(dropna=False)
n_muts_omicron = curated_omicron['nt_mutation'].nunique(dropna=False)

print('Number of unique muts:')
print('In the full dataset:', n_muts_full)
print('In the curated pre-omicron dataset:', n_muts_pre_omicron)
print('In the curated omicron dataset:', n_muts_omicron)

In [None]:
# Tally the curated pre-Omicron mutations by mutation class
print('Number of curated mutations pre-omicron per category:')
pre_omicron_class_counts = curated_pre_omicron['mut_class'].value_counts()
pre_omicron_class_counts

In [None]:
# Tally the curated Omicron mutations by mutation class
print('Number of curated mutations omicron per category:')
omicron_class_counts = curated_omicron['mut_class'].value_counts()
omicron_class_counts

Save curated counts dataframes

In [None]:
# Drop columns for site exclusions and masking
# The site-filtering flags are no longer needed once the sites have been filtered
site_flag_cols = ['exclude', 'masked_in_usher']
curated_pre_omicron = curated_pre_omicron.drop(columns=site_flag_cols)
curated_omicron = curated_omicron.drop(columns=site_flag_cols)

In [None]:
# Write curated dataframes to file
# Always (re)write both outputs so the files on disk reflect the current run.
# Skipping the write when a file already exists would silently leave stale
# results in place for downstream steps.
curated_pre_omicron.to_csv(outfile_pre_omicron, index=False)
curated_omicron.to_csv(outfile_omicron, index=False)
