# Notebook for defining predicted counts master tables

## Snakemake input and output

In [None]:
# `snakemake` is not defined anywhere in this notebook; presumably Snakemake
# injects it when the notebook is executed as part of a rule -- confirm
# against the workflow definition.
_smk_in, _smk_out = snakemake.input, snakemake.output

# Count tables that are loaded below.
pre_omicron_counts = _smk_in.pre_omicron_counts
omicron_counts = _smk_in.omicron_counts

# Destinations of the master tables written at the end of the notebook.
pre_omicron_ms = _smk_out.pre_omicron_ms
omicron_ms = _smk_out.omicron_ms

## Import packages

In [None]:
import numpy as np
import pandas as pd
import sys
import os

In [None]:
# Make the parent directory importable so that `from modules import ...`
# resolves; the path is appended only once, even if the cell is re-run.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from modules import rates
from modules import load

## Load training dataframes

In [None]:
# Load the pre-Omicron counts through the project's `load` module.
# The result is used as a DataFrame below (column assignment, groupby on "mut_type").
counts_pre_omicron = load.load_synonymous_muts(pre_omicron_counts)

In [None]:
# Load the Omicron counts through the project's `load` module.
# The result is used as a DataFrame below (column assignment, groupby on "mut_type").
counts_omicron = load.load_synonymous_muts(omicron_counts)

## Initialize rates objects

In [None]:
# Two independent Rates objects: one for the pre-Omicron counts and one for
# the Omicron counts. Each is populated from its own table below.
rate_pre_om, rate_om = rates.Rates(), rates.Rates()

## Populate rates and add predicted counts

In [None]:
# Populate each Rates object from its own counts table. The return values are
# discarded, so populate_rates presumably stores its result on the object
# (the `.rates` table is read in the following cells) -- confirm in modules.rates.
rate_pre_om.populate_rates(counts_pre_omicron)
rate_om.populate_rates(counts_omicron)

In [None]:
# Store the output of genome_composition as the "cond_count" column of the
# pre-Omicron rates table.
# NOTE(review): the meaning of "cond_count" is defined in modules.rates -- confirm there.
rate_pre_om.rates["cond_count"] = rate_pre_om.genome_composition(counts_pre_omicron)

In [None]:
# Store the output of genome_composition as the "cond_count" column of the
# Omicron rates table.
# NOTE(review): the meaning of "cond_count" is defined in modules.rates -- confirm there.
rate_om.rates["cond_count"] = rate_om.genome_composition(counts_omicron)

In [None]:
# Preview the first rows of the pre-Omicron rates table (display only).
rate_pre_om.rates.head()

In [None]:
# Preview the first rows of the Omicron rates table (display only).
rate_om.rates.head()

## Computing residual variance

In [None]:
# Add the output of predicted_counts_by_clade as a new "predicted_count" column;
# it is compared with "actual_count" in the next cell.
counts_pre_omicron['predicted_count'] = rate_pre_om.predicted_counts_by_clade(counts_pre_omicron)

In [None]:
# For every mutation type: mean squared difference between
# log(actual_count + 0.5) and log(predicted_count + 0.5). The 0.5 offset keeps
# the logarithm finite when a count is zero.
tau_pre_omicron = counts_pre_omicron.groupby("mut_type").apply(
    lambda grp: np.mean(
        np.square(
            np.log(grp["actual_count"] + 0.5) - np.log(grp["predicted_count"] + 0.5)
        )
    ),
    include_groups=False,
)

In [None]:
# Hand the counts and the per-mutation-type tau values to the Rates object.
# The return value is discarded, so residual_variance presumably updates
# rate_pre_om.rates in place -- confirm in modules.rates.
rate_pre_om.residual_variance(counts_pre_omicron, tau_pre_omicron)

In [None]:
# Preview the pre-Omicron rates table after the residual-variance step (display only).
rate_pre_om.rates.head()

In [None]:
# Add the output of predicted_counts_by_clade as a new "predicted_count" column;
# it is compared with "actual_count" in the next cell.
counts_omicron['predicted_count'] = rate_om.predicted_counts_by_clade(counts_omicron)

In [None]:
# For every mutation type: mean squared difference between
# log(actual_count + 0.5) and log(predicted_count + 0.5). The 0.5 offset keeps
# the logarithm finite when a count is zero.
tau_omicron = counts_omicron.groupby("mut_type").apply(
    lambda grp: np.mean(
        np.square(
            np.log(grp["actual_count"] + 0.5) - np.log(grp["predicted_count"] + 0.5)
        )
    ),
    include_groups=False,
)

In [None]:
# Hand the counts and the per-mutation-type tau values to the Rates object.
# The return value is discarded, so residual_variance presumably updates
# rate_om.rates in place -- confirm in modules.rates.
rate_om.residual_variance(counts_omicron, tau_omicron)

In [None]:
# Preview the Omicron rates table after the residual-variance step (display only).
rate_om.rates.head()

## Formatting master tables

### Adding lightswitch boundaries

In [None]:
# Start every row with a boundary of 0; the rows of selected mutation types
# are overwritten in the following cells.
rate_pre_om.rates['nt_site_boundary'] = np.zeros(len(rate_pre_om.rates), dtype=int)
rate_om.rates['nt_site_boundary'] = np.zeros(len(rate_om.rates), dtype=int)

In [None]:
# Rows with mut_type 'CT' get boundary 13467; rows with mut_type 'AT', 'GC'
# or 'CG' get boundary 21562. All other rows keep their current value.
rate_pre_om.rates.loc[rate_pre_om.rates['mut_type'] == 'CT', 'nt_site_boundary'] = 13467
rate_pre_om.rates.loc[
    rate_pre_om.rates['mut_type'].isin(['AT', 'GC', 'CG']), 'nt_site_boundary'
] = 21562

In [None]:
# Rows with mut_type 'CT' get boundary 13467; rows with mut_type 'AT', 'GC'
# or 'CG' get boundary 21562. All other rows keep their current value.
rate_om.rates.loc[rate_om.rates['mut_type'] == 'CT', 'nt_site_boundary'] = 13467
rate_om.rates.loc[
    rate_om.rates['mut_type'].isin(['AT', 'GC', 'CG']), 'nt_site_boundary'
] = 21562

## Save master tables

In [None]:
# Columns written to the master tables, in output order.
cols = [
    'mut_type',
    'motif',
    'unpaired',
    'nt_site_boundary',
    'nt_site_before_boundary',
    'rate',
    'predicted_count',
    'residual',
]

In [None]:
# Preview the export columns of the pre-Omicron table (display only).
# This raises KeyError if any name in `cols` is missing from the table.
rate_pre_om.rates[cols].head()

In [None]:
# Drop the 'condition' column from the in-memory table, then write the
# master-table columns to the pre-Omicron output path without the index.
# 'condition' is not in `cols`, so the drop never changes the CSV itself.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and the in-place drop would otherwise raise KeyError.
rate_pre_om.rates.drop(columns=['condition'], inplace=True, errors='ignore')
rate_pre_om.rates[cols].to_csv(pre_omicron_ms, index=False)

In [None]:
# Drop the 'condition' column from the in-memory table, then write the
# master-table columns to the Omicron output path without the index.
# 'condition' is not in `cols`, so the drop never changes the CSV itself.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and the in-place drop would otherwise raise KeyError.
rate_om.rates.drop(columns=['condition'], inplace=True, errors='ignore')
rate_om.rates[cols].to_csv(omicron_ms, index=False)