# Notebook for defining predicted counts master tables

## Snakemake input and output

In [None]:
# `snakemake` is not defined anywhere in this notebook; it is expected to be
# supplied by the Snakemake runtime when the notebook runs as a workflow step.
# `counts` is later handed to load.load_synonymous_muts, and `ms` is the
# destination the master table is written to with to_csv.
counts = snakemake.input.counts
ms = snakemake.output.ms

## Import packages

In [1]:
import numpy as np
import sys
import os

In [None]:
# Put the parent directory on the import path so the `modules` package
# imported below can be found; skip the append if it is already listed.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from modules import rates
from modules import load

## Load training dataframes

In [None]:
# Replace the Snakemake input handle held in `counts` with whatever the loader
# returns for it (used as a table with mut_type / actual_count columns below).
# NOTE(review): `counts` is rebound here, so re-running this cell without first
# re-running the Snakemake cell feeds the loaded table back into the loader.
counts = load.load_synonymous_muts(counts)

In [None]:
# Display the loaded counts table as the cell output for a quick visual check.
counts


## Initialize the rates object

In [None]:
# Create the Rates object (from modules.rates) that the remaining cells fill in
# and finally export through its `rates` table.
rate = rates.Rates()

## Populate rates and add predicted counts

In [None]:
# Feed the counts table to the Rates object. The return value is not used;
# presumably this fills `rate.rates`, which the following cells read - confirm
# against modules/rates.
rate.populate_rates(counts)

In [None]:
# Store the result of Rates.genome_composition(counts) as the `cond_count`
# column of the rates table.
rate.rates["cond_count"] = rate.genome_composition(counts)

In [None]:
# Preview the first 16 rows of the rates table, now including `cond_count`.
rate.rates.head(16)

## Computing residual variance

In [None]:
# Add a `predicted_count` column to the counts table, computed by
# Rates.predicted_counts_by_clade; it is compared with `actual_count` below.
counts['predicted_count'] = rate.predicted_counts_by_clade(counts)

In [None]:
def _mean_squared_log_residual(group):
    """Return the mean squared difference of log(count + 0.5), actual vs predicted."""
    # The 0.5 pseudocount keeps the logarithm finite for zero counts.
    log_actual = np.log(group.actual_count + 0.5)
    log_predicted = np.log(group.predicted_count + 0.5)
    return np.mean((log_actual - log_predicted) ** 2)


# One value per mutation type, indexed by `mut_type`.
tau = counts.groupby("mut_type").apply(
    _mean_squared_log_residual, include_groups=False
)

In [None]:
# Hand the counts table and the per-mutation-type `tau` to the Rates object.
# NOTE(review): the return value is discarded, so this presumably updates
# `rate` in place - confirm against modules/rates.
rate.residual_variance(counts, tau)

In [None]:
# Preview the first rows of the rates table after the residual-variance step.
rate.rates.head()

## Formatting master tables

### Adding lightswitch boundaries

In [None]:
# Start every row of the rates table with an integer boundary of 0.
rate.rates['nt_site_boundary'] = np.zeros(len(rate.rates), dtype=int)

In [None]:
# Disabled: per-mutation-type boundary positions (13467 for CT; 21562 for AT,
# GC and CG). While these stay commented out, every row keeps the boundary of 0
# assigned in the previous cell.
# rate.rates.loc[rate.rates.mut_type == 'CT', 'nt_site_boundary'] = int(13467)
# rate.rates.loc[(rate.rates.mut_type == 'AT') | (rate.rates.mut_type == 'GC') | (rate.rates.mut_type == 'CG'), 'nt_site_boundary'] = int(21562)

## Save master tables

In [None]:
# Columns, in output order, that make up the saved master table.
cols = [
    'mut_type',
    'motif',
    'unpaired',
    'nt_site_boundary',
    'nt_site_before_boundary',
    'rate',
    'predicted_count',
    'residual',
]

In [None]:
# Preview the columns that will be exported; selecting `cols` also fails early
# with a KeyError if any of them is missing from the rates table.
rate.rates[cols].head()

In [None]:
# `condition` is not listed in `cols`, so dropping it does not change the CSV.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the in-place drop has already removed the column.
rate.rates.drop(columns=['condition'], inplace=True, errors='ignore')
# Write only the master-table columns, without the index, to the Snakemake
# output path.
rate.rates[cols].to_csv(ms, index=False)