# Notebook for defining predicted counts master tables

## Snakemake inputs and outputs

In [None]:
# Inputs from the Snakemake rule; passed to load.load_synonymous_muts further down.
pre_omicron_counts, omicron_counts = (
    snakemake.input.pre_omicron_counts,
    snakemake.input.omicron_counts,
)
# Output targets from the Snakemake rule; used as the to_csv destinations at the end.
pre_omicron_ms, omicron_ms = (
    snakemake.output.pre_omicron_ms,
    snakemake.output.omicron_ms,
)

## Import packages

In [1]:
import numpy as np
import pandas as pd
import sys
import os

In [2]:
# Put the parent directory on sys.path (once), presumably so that the
# project-local `modules` package imported in the next cell can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from modules import rates
from modules import load

## Load training dataframes

In [4]:
# Load the pre-Omicron counts table from the Snakemake-provided input.
# NOTE(review): presumably restricted to synonymous mutations, per the loader's name — confirm in modules.load.
counts_pre_omicron = load.load_synonymous_muts(pre_omicron_counts)

In [5]:
# Load the Omicron counts table from the Snakemake-provided input.
# NOTE(review): presumably restricted to synonymous mutations, per the loader's name — confirm in modules.load.
counts_omicron = load.load_synonymous_muts(omicron_counts)

## Initialize rates objects

In [6]:
# One independent Rates object per dataset (pre-Omicron first, then Omicron).
rate_pre_om, rate_om = rates.Rates(), rates.Rates()

## Populate rates and add predicted counts

In [7]:
# Fit each Rates object on its own counts table, pre-Omicron first.
for _rate_obj, _counts_df in (
    (rate_pre_om, counts_pre_omicron),
    (rate_om, counts_omicron),
):
    _rate_obj.populate_rates(_counts_df)

In [14]:
# Store the result of genome_composition as the 'cond_count' column of the pre-Omicron rates table.
# NOTE(review): exact meaning of the values is defined in modules.rates — confirm there.
rate_pre_om.rates["cond_count"] = rate_pre_om.genome_composition(counts_pre_omicron)

In [15]:
# Store the result of genome_composition as the 'cond_count' column of the Omicron rates table.
# NOTE(review): exact meaning of the values is defined in modules.rates — confirm there.
rate_om.rates["cond_count"] = rate_om.genome_composition(counts_omicron)

In [16]:
# Preview the first rows of the pre-Omicron rates table after adding 'cond_count'.
rate_pre_om.rates.head()

Unnamed: 0,mut_type,motif,unpaired,nt_site_before_boundary,rate,predicted_count,condition,residual,cond_count
0,AC,AAA,0,False,0.110585,10.517707,"(AC, AAA, 0, False)",0.0,0
1,AC,AAA,1,False,0.128033,12.177177,"(AC, AAA, 1, False)",0.0,0
2,AC,AAC,0,False,0.09745,9.268445,"(AC, AAC, 0, False)",0.0,0
3,AC,AAC,1,False,0.112919,10.739754,"(AC, AAC, 1, False)",0.0,0
4,AC,AAG,0,False,0.189517,18.024974,"(AC, AAG, 0, False)",0.0,21


In [17]:
# Preview the first rows of the Omicron rates table after adding 'cond_count'.
rate_om.rates.head()

Unnamed: 0,mut_type,motif,unpaired,nt_site_before_boundary,rate,predicted_count,condition,cond_count
0,AC,AAA,0,False,0.084546,11.307477,"(AC, AAA, 0, False)",0
1,AC,AAA,1,False,0.09878,13.211244,"(AC, AAA, 1, False)",0
2,AC,AAC,0,False,0.072427,9.686594,"(AC, AAC, 0, False)",0
3,AC,AAC,1,False,0.084707,11.329019,"(AC, AAC, 1, False)",0
4,AC,AAG,0,False,0.145828,19.503485,"(AC, AAG, 0, False)",21


## Computing residual variance

In [18]:
# Add a 'predicted_count' column to the pre-Omicron counts table (this mutates counts_pre_omicron).
# The column is read by the residual calculation in the next cell.
counts_pre_omicron['predicted_count'] = rate_pre_om.predicted_counts_by_clade(counts_pre_omicron)

In [19]:
# Squared difference between log(actual + 0.5) and log(predicted + 0.5) for every row.
# The 0.5 pseudocount keeps the logarithm finite when a count is zero.
sq_log_resid_pre_omicron = (
    np.log(counts_pre_omicron["actual_count"] + 0.5)
    - np.log(counts_pre_omicron["predicted_count"] + 0.5)
) ** 2

# Mean of that squared log residual per mutation type, as a Series indexed by mut_type.
# Vectorised replacement for groupby.apply(lambda ..., include_groups=False): it does not
# need the pandas >= 2.2 `include_groups` keyword and avoids a Python call per group.
tau_pre_omicron = sq_log_resid_pre_omicron.groupby(counts_pre_omicron["mut_type"]).mean()

In [20]:
# Called for its side effect: the 'residual' column of rate_pre_om.rates holds non-zero values
# in the preview after this call (compare the earlier preview, where it is 0.0).
# NOTE(review): how tau_pre_omicron is combined with the counts is defined in modules.rates — confirm there.
rate_pre_om.residual_variance(counts_pre_omicron, tau_pre_omicron)

In [21]:
# Preview the pre-Omicron rates table again to check the populated 'residual' column.
rate_pre_om.rates.head()

Unnamed: 0,mut_type,motif,unpaired,nt_site_before_boundary,rate,predicted_count,condition,residual,cond_count
0,AC,AAA,0,False,0.110585,10.517707,"(AC, AAA, 0, False)",0.588548,0
1,AC,AAA,1,False,0.128033,12.177177,"(AC, AAA, 1, False)",0.588548,0
2,AC,AAC,0,False,0.09745,9.268445,"(AC, AAC, 0, False)",0.588548,0
3,AC,AAC,1,False,0.112919,10.739754,"(AC, AAC, 1, False)",0.588548,0
4,AC,AAG,0,False,0.189517,18.024974,"(AC, AAG, 0, False)",0.506125,21


In [22]:
# Add a 'predicted_count' column to the Omicron counts table (this mutates counts_omicron).
# The column is read by the residual calculation in the next cell.
counts_omicron['predicted_count'] = rate_om.predicted_counts_by_clade(counts_omicron)

In [23]:
# Squared difference between log(actual + 0.5) and log(predicted + 0.5) for every row.
# The 0.5 pseudocount keeps the logarithm finite when a count is zero.
sq_log_resid_omicron = (
    np.log(counts_omicron["actual_count"] + 0.5)
    - np.log(counts_omicron["predicted_count"] + 0.5)
) ** 2

# Mean of that squared log residual per mutation type, as a Series indexed by mut_type.
# Vectorised replacement for groupby.apply(lambda ..., include_groups=False): it does not
# need the pandas >= 2.2 `include_groups` keyword and avoids a Python call per group.
tau_omicron = sq_log_resid_omicron.groupby(counts_omicron["mut_type"]).mean()

In [24]:
# Called for its side effect: rate_om.rates has a 'residual' column in the preview after this call
# (the earlier Omicron preview has no such column).
# NOTE(review): how tau_omicron is combined with the counts is defined in modules.rates — confirm there.
rate_om.residual_variance(counts_omicron, tau_omicron)

In [25]:
# Preview the Omicron rates table again to check the new 'residual' column.
rate_om.rates.head()

Unnamed: 0,mut_type,motif,unpaired,nt_site_before_boundary,rate,predicted_count,condition,cond_count,residual
0,AC,AAA,0,False,0.084546,11.307477,"(AC, AAA, 0, False)",0,0.642931
1,AC,AAA,1,False,0.09878,13.211244,"(AC, AAA, 1, False)",0,0.642931
2,AC,AAC,0,False,0.072427,9.686594,"(AC, AAC, 0, False)",0,0.642931
3,AC,AAC,1,False,0.084707,11.329019,"(AC, AAC, 1, False)",0,0.642931
4,AC,AAG,0,False,0.145828,19.503485,"(AC, AAG, 0, False)",21,0.874491


## Formatting master tables

### Adding lightswitch boundaries

In [26]:
# Start both tables with an integer 'nt_site_boundary' column of zeros;
# the following cells overwrite it for selected mutation types.
for _rate_obj in (rate_pre_om, rate_om):
    _rate_obj.rates['nt_site_boundary'] = np.zeros(len(_rate_obj.rates), dtype=int)

In [27]:
# CT rows get boundary site 13467; AT, GC and CG rows share boundary site 21562.
# All other mutation types keep the value 0 set in the previous cell.
pre_om_is_ct = rate_pre_om.rates.mut_type == 'CT'
pre_om_is_at_gc_cg = rate_pre_om.rates.mut_type.isin(['AT', 'GC', 'CG'])
rate_pre_om.rates.loc[pre_om_is_ct, 'nt_site_boundary'] = 13467
rate_pre_om.rates.loc[pre_om_is_at_gc_cg, 'nt_site_boundary'] = 21562

In [28]:
# CT rows get boundary site 13467; AT, GC and CG rows share boundary site 21562.
# All other mutation types keep the value 0 set earlier.
om_is_ct = rate_om.rates.mut_type == 'CT'
om_is_at_gc_cg = rate_om.rates.mut_type.isin(['AT', 'GC', 'CG'])
rate_om.rates.loc[om_is_ct, 'nt_site_boundary'] = 13467
rate_om.rates.loc[om_is_at_gc_cg, 'nt_site_boundary'] = 21562

## Save master tables

In [29]:
# Columns written to the master tables, in output order.
cols = [
    'mut_type',
    'motif',
    'unpaired',
    'nt_site_boundary',
    'nt_site_before_boundary',
    'rate',
    'predicted_count',
    'residual',
]

In [30]:
# Preview the pre-Omicron table restricted to the columns that will be saved.
rate_pre_om.rates[cols].head()

Unnamed: 0,mut_type,motif,unpaired,nt_site_boundary,nt_site_before_boundary,rate,predicted_count,residual
0,AC,AAA,0,0,False,0.110585,10.517707,0.588548
1,AC,AAA,1,0,False,0.128033,12.177177,0.588548
2,AC,AAC,0,0,False,0.09745,9.268445,0.588548
3,AC,AAC,1,0,False,0.112919,10.739754,0.588548
4,AC,AAG,0,0,False,0.189517,18.024974,0.506125


In [31]:
# Remove the helper 'condition' column in place. errors='ignore' keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column has already been dropped.
rate_pre_om.rates.drop(columns=['condition'], inplace=True, errors='ignore')
# Write only the master-table columns to the Snakemake output, without the index.
rate_pre_om.rates[cols].to_csv(pre_omicron_ms, index=False)

In [32]:
# Remove the helper 'condition' column in place. errors='ignore' keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column has already been dropped.
rate_om.rates.drop(columns=['condition'], inplace=True, errors='ignore')
# Write only the master-table columns to the Snakemake output, without the index.
rate_om.rates[cols].to_csv(omicron_ms, index=False)