# Define pre-post Omicron curated counts datasets

## Read in Python modules

In [1]:
import os
import pandas as pd

## Read in and curate counts data

In [2]:
# Load the per-clade mutation counts table.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
counts_df = pd.read_csv(
    '../results/mut_counts_by_clade.csv',
    low_memory=False,
)

Ignore sites that are annotated as being masked in any clade of the UShER tree (`masked_in_usher == True`), are annotated for exclusion (`exclude == True`), or were identified as highly homoplastic by De Maio et al. (https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473).

In [3]:
# Collect every nucleotide site that is flagged as masked or excluded in at
# least one row (i.e. in at least one clade) of the counts table
flagged_rows = counts_df['masked_in_usher'].eq(True) | counts_df['exclude'].eq(True)
sites_to_ignore = list(counts_df.loc[flagged_rows, 'nt_site'].unique())

# Add the homoplastic sites reported by De Maio et al. to the ignore list
sites_to_ignore.extend([
    187, 1059, 2094, 3037, 3130, 6990, 8022, 10323, 10741, 11074, 13408,
    14786, 19684, 20148, 21137, 24034, 24378, 25563, 26144, 26461, 26681, 28077,
    28826, 28854, 29700, 4050, 13402, 11083, 15324, 21575
])

# Keep only the rows whose site is not on the ignore list
keep_rows = ~counts_df['nt_site'].isin(sites_to_ignore)
curated_counts_df = counts_df[keep_rows]

# Report how many retained rows have a motif that differs from the reference
# motif (nothing is written to disk in this cell)
n_motif_mismatch = (curated_counts_df['motif'] != curated_counts_df['ref_motif']).sum()
print(n_motif_mismatch)

curated_counts_df.head()

7698


Unnamed: 0,clade,nt_site,nt_mutation,exclude,masked_in_usher,expected_count,actual_count,clade_founder_nt,gene,clade_founder_codon,...,wt_nt,mut_nt,mut_type,mut_class,pre_omicron_or_omicron,nt_site_before_boundary,ss_prediction,unpaired,motif,ref_motif
795,20A,266,A266C,False,False,0.96873,0,A,ORF1a;ORF1ab,ATG;ATG,...,A,C,AC,nonsynonymous,pre_omicron,False,paired,0,GAT,GAT
796,20A,266,A266G,False,False,3.6091,0,A,ORF1a;ORF1ab,ATG;ATG,...,A,G,AG,nonsynonymous,pre_omicron,False,paired,0,GAT,GAT
797,20A,266,A266T,False,False,1.2782,0,A,ORF1a;ORF1ab,ATG;ATG,...,A,T,AT,nonsynonymous,pre_omicron,True,paired,0,GAT,GAT
798,20A,267,T267A,False,False,0.90342,0,T,ORF1a;ORF1ab,ATG;ATG,...,T,A,TA,nonsynonymous,pre_omicron,False,paired,0,ATG,ATG
799,20A,267,T267C,False,False,3.5889,0,T,ORF1a;ORF1ab,ATG;ATG,...,T,C,TC,nonsynonymous,pre_omicron,False,paired,0,ATG,ATG


Create two dataframes of curated counts:

* Cluster of pre-Omicron clades
* Cluster of Omicron clades

Sites are retained if the corresponding codon and motif are conserved. Mutation counts are then aggregated across clades belonging to the same cluster.

In [4]:
# Partition the curated rows into the pre-Omicron and Omicron clade clusters
cluster_label = curated_counts_df['pre_omicron_or_omicron']
counts_pre_omicron = curated_counts_df[cluster_label == 'pre_omicron']
counts_omicron = curated_counts_df[cluster_label == 'omicron']

# Within each cluster, keep only rows whose motif equals the reference motif
curated_pre_omicron = counts_pre_omicron[
    counts_pre_omicron['motif'] == counts_pre_omicron['ref_motif']
]
curated_omicron = counts_omicron[
    counts_omicron['motif'] == counts_omicron['ref_motif']
]
for conserved_df in (curated_pre_omicron, curated_omicron):
    # Sanity check: no motif mismatch survives the filter
    assert not (conserved_df['motif'] != conserved_df['ref_motif']).any()

# Columns that are excluded from the grouping keys: the per-clade
# measurements and the clade / cluster labels
ignore_cols = [
    'expected_count',
    'actual_count',
    'count_terminal',
    'count_non_terminal',
    'mean_log_size',
    'clade',
    'pre_omicron_or_omicron',
]
# Every remaining column is used as a grouping key
groupby_cols = [
    name for name in curated_counts_df.columns
    if name not in ignore_cols
]

# Collapse the clades of each cluster: rows sharing all grouping keys are
# merged and their numeric non-key columns are summed; non-numeric non-key
# columns are dropped by numeric_only=True
curated_pre_omicron = (
    curated_pre_omicron
    .groupby(groupby_cols, as_index=False)
    .agg('sum', numeric_only=True)
)
curated_omicron = (
    curated_omicron
    .groupby(groupby_cols, as_index=False)
    .agg('sum', numeric_only=True)
)



In [5]:
# A given nt_mutation can still appear on several rows (different clade
# founder codons). Keep one row per nt_mutation: drop_duplicates retains the
# first occurrence, and the counts on the discarded rows are not added to it.
curated_pre_omicron = curated_pre_omicron.drop_duplicates(subset='nt_mutation')
curated_omicron = curated_omicron.drop_duplicates(subset='nt_mutation')

In [6]:
# Sanity check: each nt_mutation now occurs on exactly one row per dataset
for deduplicated_df in (curated_pre_omicron, curated_omicron):
    assert not deduplicated_df['nt_mutation'].duplicated(keep=False).any()

Summary statistics of mutations in dataset

In [7]:

# Report the number of distinct nt_mutation values in each dataset
print('Number of unique muts:')
unique_mut_report = [
    ('In the full dataset:', counts_df),
    ('In the curated pre-omicron dataset:', curated_pre_omicron),
    ('In the curated omicron dataset:', curated_omicron),
]
for label, frame in unique_mut_report:
    # dropna=False counts a missing value as its own entry, like len(unique())
    print(label, frame['nt_mutation'].nunique(dropna=False))

Number of unique muts:
In the full dataset: 90621
In the curated pre-omicron dataset: 86721
In the curated omicron dataset: 86556


In [8]:
# Tally the curated pre-Omicron mutations by mutation class
print('Number of curated mutations pre-omicron per category:')
pre_omicron_class_counts = curated_pre_omicron['mut_class'].value_counts()
pre_omicron_class_counts

Number of curated mutations pre-omicron per category:


mut_class
nonsynonymous    63376
synonymous       18743
nonsense          4080
noncoding          522
Name: count, dtype: int64

In [9]:
# Tally the curated Omicron mutations by mutation class
print('Number of curated mutations omicron per category:')
omicron_class_counts = curated_omicron['mut_class'].value_counts()
omicron_class_counts

Number of curated mutations omicron per category:


mut_class
nonsynonymous    63261
synonymous       18690
nonsense          4086
noncoding          519
Name: count, dtype: int64

In [10]:
# Remove the curation flag columns before export: every site with either flag
# set to True was already filtered out when building curated_counts_df.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run; the in-place version raised a KeyError on a second
# execution because the columns were already gone.
curation_flag_cols = ['exclude', 'masked_in_usher']
curated_pre_omicron = curated_pre_omicron.drop(columns=curation_flag_cols, errors='ignore')
curated_omicron = curated_omicron.drop(columns=curation_flag_cols, errors='ignore')

In [11]:
# Output locations for the two curated datasets
outfile_pre_om = '../results/curated/curated_mut_counts_pre_omicron.csv'
outfile_om = '../results/curated/curated_mut_counts_omicron.csv'

for outfile, curated_df in [
    (outfile_pre_om, curated_pre_omicron),
    (outfile_om, curated_omicron),
]:
    # Make sure the target directory exists so to_csv cannot fail on a
    # fresh checkout
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    if os.path.isfile(outfile):
        # Existing files are deliberately left untouched; say so explicitly so
        # that a stale output is not mistaken for a freshly written one
        print(f'{outfile} already exists; not overwriting')
    else:
        curated_df.to_csv(outfile, index=False)
