# Define pre-post Omicron curated counts datasets

## Inputs from Snakemake

In [None]:
# Path of the mutation-counts CSV, taken from the rule's `mut_counts` input
counts_df_file = snakemake.input.mut_counts
# Clade-founder input is currently disabled
# clade_founder = snakemake.input.clade_founder
# Path the curated counts CSV is written to, taken from the rule's `outfile` output
outfile = snakemake.output.outfile


## Read in Python modules

In [None]:
import os
import pandas as pd

## Read in counts and clade founders

In [None]:
# Load the mutation counts table. low_memory=False has pandas parse the file
# in a single pass, avoiding mixed-dtype inference across chunks.
counts_df = pd.read_csv(
    counts_df_file,
    low_memory=False,
)

In [None]:
# Read in clade founder (disabled: this cell currently loads nothing)
# founder_df = pd.read_csv(clade_founder)

## Curate counts data

Determine conserved sites

In [None]:
# Disabled approach, kept for reference: identify sites where the codon and motif
# are conserved across all clade founders by subsetting data to entries with
# identical codons/motifs to reference, then identifying sites that still have
# entries for all clades
# data = founder_df[
#     (founder_df['codon'] == founder_df['ref_codon']) &
#     (founder_df['motif'] == founder_df['ref_motif'])
# ]
# site_counts = data['nt_site'].value_counts()
# nclades = len(founder_df['clade'].unique())
# conserved_sites = site_counts[site_counts == nclades].index
# Active definition: every distinct non-null `nt_site` value present in counts_df.
# NOTE(review): with the founder-based filter above commented out, this line does
# not itself test conservation -- confirm the counts table is restricted upstream.
conserved_sites = counts_df['nt_site'].value_counts().index

Ignore sites that are annotated as being masked in any clade of the UShER tree (`masked_in_usher == True`), are annotated for exclusion (`exclude == True`), or were identified as highly homoplastic by De Maio et al. (https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473)

In [None]:
# Collect every site that is flagged as masked or excluded in at least one row
# (i.e. in any clade); such sites are dropped for all clades.
is_flagged = counts_df['masked_in_usher'].eq(True) | counts_df['exclude'].eq(True)
sites_to_ignore = list(counts_df.loc[is_flagged, 'nt_site'].unique())

# Keep rows whose site is in the conserved set and was never flagged
in_conserved_set = counts_df['nt_site'].isin(conserved_sites)
in_ignored_set = counts_df['nt_site'].isin(sites_to_ignore)
curated_counts_df = counts_df[in_conserved_set & ~in_ignored_set]
curated_counts_df.head()

Create a dataframe of curated counts:

Mutation counts are aggregated across clades belonging to the same cluster

In [None]:
# Sanity check: no retained row may have a motif that differs from the reference motif
assert not (curated_counts_df['motif'] != curated_counts_df['ref_motif']).any()

# Columns that are NOT used as grouping keys: per-clade count/size columns
# (summed below) and the clade labels themselves (non-numeric, so dropped by the sum).
ignore_cols = [
    'expected_count', 'actual_count', 'count_terminal', 'count_non_terminal', 'mean_log_size',
    'clade', 'pre_omicron_or_omicron'
]

# Every remaining column, in original order, becomes a grouping key
groupby_cols = curated_counts_df.columns[
    ~curated_counts_df.columns.isin(ignore_cols)
].tolist()

# Collapse rows that share all key columns, summing the numeric non-key columns.
# NOTE(review): groupby drops rows with NaN in any key column by default --
# confirm that none of the key columns contain missing values.
curated = (
    curated_counts_df
    .groupby(groupby_cols, as_index=False)
    .sum(numeric_only=True)
)

In [None]:
# After aggregation each nucleotide mutation must appear in exactly one row
assert not curated['nt_mutation'].duplicated().any()

Summary statistics of mutations in dataset

In [None]:
# Compare the number of distinct nucleotide mutations before and after curation
print('Number of unique muts:')
n_muts_full = counts_df['nt_mutation'].nunique(dropna=False)
print('In the full dataset:', n_muts_full)
n_muts_curated = curated['nt_mutation'].nunique(dropna=False)
print('In the curated dataset:', n_muts_curated)

In [None]:
# Tally curated mutations by their `mut_class` label; the tally is the cell's output
print('Number of curated mutations per category:')
per_class_counts = curated['mut_class'].value_counts()
per_class_counts

Save curated counts dataframes

In [None]:
# The exclusion/masking flags were only needed for filtering; leave them out of the output
flag_columns = ['exclude', 'masked_in_usher']
curated = curated.drop(columns=flag_columns)

In [None]:
# Write the curated dataframe to file, always overwriting any existing copy.
# The previous `os.path.isfile` guard silently skipped the write when the file
# already existed, so re-running the notebook could leave a stale CSV that no
# longer matched the inputs.
# NOTE(review): Snakemake is expected to clear a rule's declared outputs before
# running the job, which would make the guard redundant there -- confirm.
curated.to_csv(outfile, index=False)