# Phase 3: Setting up the BEAST xmls

```
Parameters
-------------
save_dir: str  
    Path to directory for saving outputs in.

cache_dir: str 
    Path to directory for cached objects in.

template_xml_path: str
    Path to template BEAST xml.

use_initial_tree:  bool, default True
    Whether an initial tree is to be used. If not, no initial tree is used in generating the BEAST 2 xml
    and BEAST 2 will generate its own.

rt_dimensions: int, default=None
    Number of Rt dimensions (time periods).

rt_changes: dict {'unit': 'days', 'weeks' or 'years', 'every': int/float, 'end': 'oldest tip'}, optional
    Instructions for setting rt_change date, going backwards from youngest sample in fasta_file.
    rt_changes["end"] currently only supports "oldest tip".
    If given rt_dimensions must equal None.

metadata_path: str
       Path to csv or tsv containing metadata.

collection_date_field: str
    Name of the field in the metadata file containing the collection dates of sequences. Dates should be in the format YYYY-MM-DD.

initial_tree_path: str
    Path to the initial tree. Should be a .nwk file.

fasta_file: str
    Path to fasta file containing sequences.

origin_start_addition: float
    This + initial temporal tree height is used as the starting value of the origin.
    We recommend using an estimate of the infection period for the pathogen being studied. **Value should be in years.**
    Origin prior will be uniform:
        Lower value: time in years from oldest to youngest sequence in fasta_file
        Start value: origin_start_addition + initial temporal tree height
        Upper value: initial temporal tree height + origin_upper_addition.

origin_upper_addition: float/int
    This + initial temporal tree height is used as upper value of origin prior. **Value should be in years.**
    Origin prior will be uniform:
        Lower value: time in years from oldest to youngest sequence in fasta_file
        Start value: origin_start_addition + initial temporal tree height
        Upper value:  initial temporal tree height + origin_upper_addition.

origin_prior: dict {'lower': float, 'upper': float, 'start': float}, optional
       Details of the origin prior. Assumed to be uniformly distributed.

log_file_basename: str, optional
    If provided .tree, .log and .state files from running BEAST 2 will have this name prefixed by 'run-{number}-',
    number being that of the chain.

chain_length: int
    Length of the MCMC chain (number of steps) for BEAST runs.

trace_log_every: int
    How often to save a log file during BEAST runs.

tree_log_every: int
    How often to save a tree file during BEAST runs.

screen_log_every: int
    How often to output to screen during BEAST runs.

store_state_every: int 
    How often to store MCMC state during BEAST runs.
```


In [None]:
# --- Locations of inputs and outputs ---------------------------------------
# Paths left as None are searched for in save_dir further down.
save_dir = 'runs_of_pipeline/2025-02-05'
template_xml_path = 'template_beast_xmls/BDSKY_serial_COVID-19_template.xml'
fasta_file = None
metadata_path = None
collection_date_field = 'date'

# --- Initial tree ----------------------------------------------------------
use_initial_tree = True
initial_tree_path = None

# --- Rt time periods (give either rt_dimensions or rt_changes) --------------
rt_dimensions = None
rt_changes = None

# --- Origin prior (values in years) ----------------------------------------
origin_prior = None
origin_start_addition = 10 / 365.25  # 10 days expressed in years
origin_upper_addition = None

# --- BEAST 2 run and logging settings --------------------------------------
log_file_basename = None
chain_length = 10_000_000
trace_log_every = 10_000
tree_log_every = 10_000
screen_log_every = 10_000
store_state_every = 10_000

Import packages. 

In [None]:
import json
import pandas as pd
import os
from pandas.tseries.offsets import DateOffset
from beast_pype.beast_xml_gen import gen_bdsky_serial_xml
from beast_pype.date_utilities import date_to_decimal
from copy import deepcopy
from Bio import SeqIO

### Search for files in save_dir if not provided

In [None]:
# Resolve every input path that was not supplied by looking for the
# conventionally named file inside save_dir. Raises FileNotFoundError when a
# required file cannot be located, and AssertionError when the tree settings
# contradict each other.

# Fail fast on contradictory settings before touching the file system.
if not use_initial_tree and initial_tree_path is not None:
    raise AssertionError('use_initial_tree is False but you have provided an initial_tree_path?')

if use_initial_tree and initial_tree_path is None:
    # Candidates are tried in order of preference; the first one that exists wins.
    for tree_name in ('down_sampled_time.nwk', 'full_time.nwk', 'iqtree.treefile'):
        candidate_path = f'{save_dir}/{tree_name}'
        if os.path.exists(candidate_path):
            initial_tree_path = candidate_path
            break
    else:
        raise FileNotFoundError(f'Initial tree file not found. initial_tree_path has not been provided and none of the files down_sampled_time.nwk, full_time.nwk or iqtree.treefile can be found in save_dir ({save_dir}).')

if metadata_path is None:
    metadata_path = f'{save_dir}/metadata.csv'
    if not os.path.exists(metadata_path):
        raise FileNotFoundError(f'Metadata file not found. metadata_path has not been provided and the file metadata.csv can not be found in save_dir ({save_dir}).')

if fasta_file is None:
    # 'sequences.fasta_file' is the name this notebook has always looked for and
    # is therefore still tried first.
    # NOTE(review): that extension looks like an artefact of renaming the
    # variable to fasta_file, so the conventional 'sequences.fasta' is accepted
    # as a fallback -- confirm which name the earlier phases actually write.
    for fasta_name in ('sequences.fasta_file', 'sequences.fasta'):
        candidate_path = f'{save_dir}/{fasta_name}'
        if os.path.exists(candidate_path):
            fasta_file = candidate_path
            break
    else:
        raise FileNotFoundError(f'Fasta file not found. fasta_file has not been provided and neither sequences.fasta_file nor sequences.fasta can be found in save_dir ({save_dir}).')
    

## IQtree fix

IQ-TREE does not accept the character '/' in sequence names, so such names appear with '_' in its tree file. The code below restores the original names in the initial tree.

In [None]:
if use_initial_tree:
    # Map each sequence id with '/' replaced by '_' back to the id as it
    # appears in the fasta file, so the tree can be made to match the fasta.
    correction_dict = {seq_record.id.replace('/', '_'): seq_record.id
                       for seq_record in SeqIO.parse(fasta_file, "fasta")}
    tree_file = initial_tree_path
    # Context managers guarantee both handles are closed, even on error.
    with open(tree_file) as fh:
        original_tree = fh.read()

    # NOTE(review): this is plain substring replacement; an id whose '_' form
    # occurs inside a different, longer tip label would also be rewritten --
    # confirm the ids in use cannot collide this way.
    tree = original_tree
    for changed, original in correction_dict.items():
        tree = tree.replace(changed, original)

    # Only rewrite the file when a name was actually restored.
    if tree != original_tree:
        with open(tree_file, 'w') as oh:
            oh.write(tree)

## Generate the $R_t$ change dates.  

Back every 4 weeks from the youngest tip. For VOIs stop at the youngest tip out of the oldest tips for each sample. For DR back an extra 4 weeks. 

In [None]:
# Build rt_change_dates: a list of dates stepping backwards from the youngest
# collection date in the metadata, or None when rt_changes is not given.
# Raises ValueError for an unsupported rt_changes["end"] or a non-positive
# rt_changes["every"].

# Treat the string placeholders '' and 'None' as "not provided".
if rt_changes in ['', 'None']:
    rt_changes = None

if rt_changes is not None:
    if rt_changes['end'] != 'oldest tip':
        raise ValueError('rt_changes["end"] currently only supports "oldest tip".')
    # A zero or negative step would never move past `end`, so the loop below
    # would not terminate.
    if rt_changes['every'] <= 0:
        raise ValueError(f'rt_changes["every"] must be greater than 0. Value given is {rt_changes["every"]}.')

    # metadata_path may be a tsv as well as a csv; without the right separator
    # the collection date column would not be found in a tsv file.
    metadata_sep = '\t' if metadata_path.endswith('.tsv') else ','
    metadata = pd.read_csv(metadata_path, parse_dates=[collection_date_field], sep=metadata_sep)
    youngest_tip = metadata[collection_date_field].max()
    end = metadata[collection_date_field].min()

    offsets = {rt_changes['unit']: rt_changes['every']}
    step_back = DateOffset(**offsets)
    rt_change_dates = []
    date_to_append = youngest_tip
    # Each pass steps back once and records the date; the last recorded date
    # is therefore the first one at or before the oldest tip.
    while date_to_append > end:
        date_to_append = date_to_append - step_back
        rt_change_dates.append(date_to_append)
else:
    rt_change_dates = None

## Actually Generating the BEAST2 xmls.

In [None]:
# Keyword arguments passed to the generator whether or not an initial tree is used.
xml_kwargs = {
    'template_path': template_xml_path,
    'sequences_path': fasta_file,
    'metadata_path': metadata_path,
    'output_path': f"{save_dir}/beast.xml",
    'origin_prior': origin_prior,
    'rt_dimensions': rt_dimensions,
    'rt_change_dates': rt_change_dates,
    'log_file_basename': log_file_basename,
    'chain_length': chain_length,
    'trace_log_every': trace_log_every,
    'tree_log_every': tree_log_every,
    'screen_log_every': screen_log_every,
    'store_state_every': store_state_every,
}

# The tree path and the two origin additions are only passed along when an
# initial tree is in use.
if use_initial_tree:
    xml_kwargs['initial_tree_path'] = initial_tree_path
    xml_kwargs['origin_upper_height_addition'] = origin_upper_addition
    xml_kwargs['origin_start_addition'] = origin_start_addition

gen_bdsky_serial_xml(**xml_kwargs)

### Add Information to pipeline_run_info

In [None]:
# Add this phase's results to the existing pipeline_run_info.json in save_dir.
# Raises TypeError when metadata_path is neither a .csv nor a .tsv file.
run_info_path = save_dir + "/pipeline_run_info.json"
with open(run_info_path, "r") as file:
    pipeline_run_info = json.load(file)

if metadata_path.endswith('.tsv'):
    delimiter = '\t'
elif metadata_path.endswith('.csv'):
    delimiter = ','
else:
    raise TypeError(
        f"metadata_path must be a csv or tsv file, ending with the appropriate file extension. Value given is {metadata_path}")
metadata = pd.read_csv(metadata_path, parse_dates=[collection_date_field], sep=delimiter)
youngest_tip = metadata[collection_date_field].max()
youngest_tip = date_to_decimal(youngest_tip)

# Update the loaded dictionary in place; rebinding it to a new dict here would
# wipe everything already recorded in the file when it is written back.
pipeline_run_info['youngest tip'] = youngest_tip
if rt_change_dates is not None:
    pipeline_run_info['Rt change dates'] = [str(value.date()) for value in rt_change_dates]
if rt_dimensions is not None:
    pipeline_run_info['Rt dimensions'] = rt_dimensions

# The with-statement closes the file, so no explicit close() is needed.
with open(run_info_path, 'w') as fp:
    json.dump(pipeline_run_info, fp, sort_keys=True, indent=4)