# Workflow BDSKY-serial Strain Surveillance Notebook


This workflow notebook is for comparing different lineages/variants of a pathogen against each other.


```
Parameters
-------------
overall_save_dir: str
    Path to where you are saving all the runs of this pipeline.

specific_run_save_dir: str, optional
    Subdirectory of overall_save_dir in which you wish to save the outputs from this pipeline.
    If None, 'None' or an empty string, a timestamp of format 'YYYY-MM-DD_hour-min-sec' is used instead.

cache_dir: str
    Name to use for cache directory. Saved within overall_save_dir/specific_run_save_dir but deleted at the end of this workflow notebook.

 metadata_db: str
       Path to csv or tsv containing metadata.

separation_field: str
    Field in metadata on which to separate data and sequences. These separated data sets form an xml_set (all the data that goes into a BEAST 2 xml).

root_strain_names: list of strs
    IDs of sequences to be used as root.

sample_id_field: str
    Name of field in metadata_db containing sequence IDs.

collection_date_field: str
    Name of field in metadata_db containing collection dates of sequences. Should be format YYYY-MM-DD.

lineage_field: str
    Name of field in metadata_db containing lineage sequences belong to.

metadata_dtypes: str
    Optional; can be an empty string, None or 'None'. Path to json defining pandas data types for metadata_db.

data_filter: str
    Optional; can be an empty string, None or 'None'. Additional filter applied to metadata_db when selecting
    sequences and metadata to be used in the pipeline. Must conform to [pandas documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html), see further [example](https://www.slingacademy.com/article/pandas-working-with-the-dataframe-query-method-5-examples/).

sequence_db: str
    Path to fasta file containing sequences.

partition: str
    Name of partition to use when calling `sbatch`.

down_sample_to: int
    If the number of sequences in a fasta file is above this value, the number of sequences is cut to this number via downsampling.

origin_start_addition: float
    Suggested infection period of pathogen. **Should be in years.** This + initial MLE tree height is used as starting value of origin.

origin_upper_addition: float/int
    This + initial MLE tree height is used as upper value of origin prior.

sample_id_field: str
    Name of field in metadata_db containing sequence IDs.

collection_date_field: str
    Name of field in metadata_db containing collection dates of sequences. Should be format YYYY-MM-DD.

template_xml_path: str
    Path to template BEAST xml.

chain_length: int
    Length of the MCMC chain (number of steps) to use for BEAST runs. See number_of_chains for the number of repeated runs.

trace_log_every: int
    How often to save a log file during BEAST runs.

tree_log_every: int
    How often to save a tree file during BEAST runs.

screen_log_every: int
    How often to output to screen during BEAST runs.

store_state_every: int 
    How often to store MCMC state during BEAST runs.

number_of_chains: int
    Number of chains to use (repeated runs to do) when running BEAST.

seeds: list of ints
    Seeds to use when running BEAST.

partition: str
    Name of partition to use when calling `sbatch`.

beast_ram: str
    RAM to use for each run of beast see sbatch documentation.

beast_threads: int
    Threads/CPUs to use for each run of beast see https://www.beast2.org/2021/03/31/command-line-options.html.

burnin_percentage: int
    Percentage burn-in to use.

```

In [None]:
# --- Output location (used directly by this notebook) ---
overall_save_dir = "../test_runs_of_pipeline"
specific_run_save_dir = None  # None, '' or 'None' -> a timestamp is used instead

# --- Shared by several phases ---
sample_id_field = "strain"      # forwarded to Phase 1 and Phase 2ii
collection_date_field = "date"  # forwarded to Phases 1, 2ii, 3 and the Phase 5 report
partition = None                # forwarded to Phase 2i and Phase 4

# --- Phase 1: data gathering ---
metadata_db = None
data_filter = None
# NOTE(review): the next three are described as Phase 1 inputs but are not
# forwarded explicitly by the cells of this notebook - confirm how they are used.
separation_field = "country"
root_strain_names = None
metadata_dtypes = None

# --- Source sequences ---
# NOTE(review): not referenced by name in the cells of this notebook - confirm usage.
sequence_db = None

# --- Phase 2ii: down sampling ---
down_sample_to = None

# --- Phase 3: BEAST xml generation ---
template_xml_path = "../template_beast_xmls/BDSKY_serial_COVID-19_template.xml"
origin_upper_addition = 2
origin_start_addition = 10 / 365.25  # 10 days expressed in years
chain_length = 500_000
trace_log_every = 500
tree_log_every = 500
screen_log_every = 500
store_state_every = 500

# --- Phase 4: running BEAST ---
number_of_chains = 4
seeds = None  # None -> seeds are generated later in this notebook
beast_ram = "32G"
beast_threads = 6

# --- Diagnostics ---
# NOTE(review): not referenced by the cells of this notebook - confirm usage.
burnin_percentage = 10

Import packages, etc.

In [None]:
from beast_pype.workflow import check_file_for_phrase
from beast_pype.mcmc_diagnostics import gen_xml_set_diag_notebook
from papermill.parameterize import parameterize_notebook
from papermill.iorw import load_notebook_node, write_ipynb
import json
import os
from datetime import datetime
import papermill as pm
from time import perf_counter
import pandas as pd
from numpy.random import randint 
import shutil
import importlib.resources as importlib_resources

### Creating Folders and Subfolders 

In [None]:
# Create the top-level output directory (and any missing parents).
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the directory between the check and the call.
os.makedirs(overall_save_dir, exist_ok=True)

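If no run-specific sub-directory name was given, use a timestamp (`YYYY-MM-DD_hour-min-sec`) instead. `save_dir` is then set to that sub-directory of `overall_save_dir`, and `cache_dir` to a `cache` folder inside `save_dir`.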

In [None]:
# Fall back to a timestamp when no run-specific sub-directory name was supplied.
if specific_run_save_dir in (None, '', 'None'):
    specific_run_save_dir = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Directory holding this run's outputs, and the scratch directory nested in it.
save_dir = '/'.join((overall_save_dir, specific_run_save_dir))
cache_dir = '/'.join((save_dir, 'cache'))

If save_dir and cache_dir do not exist, create them.

In [None]:
# Create the run directory and its cache directory.
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the directory between the check and the call.
for folder in (save_dir, cache_dir):
    os.makedirs(folder, exist_ok=True)

### Placing Common Parameters in a Dictionary 

In [None]:
# Parameters spread into the parameter dictionaries of Phases 1, 2i and 4.
common_params = dict(save_dir=save_dir, cache_dir=cache_dir)

### Creating a record for runtimes

This list of dictionaries will be turned into a pandas DataFrame and saved as a CSV at the end of this notebook.

In [None]:
# One dict per timed phase, with keys 'Phase', 'Sample', 'Chain' and 'Runtime'.
runtime_records = list()

### Set path to workflow modules

In [None]:
# importlib.resources.path() returns a context manager, not a path, so
# formatting its result into an f-string does not yield a usable directory.
# files() returns a path-like object that can be joined with '/' and rendered
# as a string when building the notebook paths used below.
workflow_modules = importlib_resources.files('beast_pype') / 'workflow_modules'

## Phase 1: Data Gathering

### Placing Phase 1 Parameters in a Dictionary

In [None]:
# Start the Phase 1 timer; the runtime is recorded once Phase 1 has been executed.
phase_1_start = perf_counter()

# Parameters handed to the Phase 1 notebook, referenced directly rather than
# through eval() so a missing name is visible at a glance.
phase_1_params = {
    **common_params,
    'metadata_db': metadata_db,
    'data_filter': data_filter,
    'sample_id_field': sample_id_field,
    'collection_date_field': collection_date_field,
    # The parameters cell defines `sequence_db` but no `fasta_file`, so looking
    # up `fasta_file` by name raised NameError. Use an injected `fasta_file`
    # if one exists, otherwise fall back to `sequence_db`.
    # NOTE(review): assumes the Phase 1 notebook expects the source FASTA path
    # under 'fasta_file' - confirm against that notebook's parameters.
    'fasta_file': globals().get('fasta_file', sequence_db),
}

#### Running Phase 1i.

In [None]:
#papermill_description=Phase-1-Metadata_and_Sequence-Separation.ipynb
# Execute the Phase 1 notebook and keep the executed copy in save_dir.
phase_1i_notebook = 'Phase-1-Metadata_and_Sequence-Separation.ipynb'
phase_1i_log = pm.execute_notebook(
    input_path=f'{workflow_modules}/{phase_1i_notebook}',
    output_path='/'.join((save_dir, phase_1i_notebook)),
    parameters=phase_1_params,
    progress_bar=True,
    nest_asyncio=True,
)

# Record how long Phase 1 took, measured from phase_1_start.
phase_1i_runtime = perf_counter() - phase_1_start
runtime_records.append(
    dict(Phase=phase_1i_notebook, Sample=None, Chain=None, Runtime=phase_1i_runtime)
)

## Phase 2: Data Pre-Processing
### Phase 2i: Building an IQ Tree tree.
#### Placing Phase 2i Parameters in a Dictionary

In [None]:
# Start the Phase 2i timer; it is stopped after the wait on the IQ-TREE outputs.
phase_2i_start = perf_counter()

# Common parameters plus the FASTA file name and the sbatch partition.
phase_2i_params = dict(
    common_params,
    fasta_file='sequences.fasta',
    partition=partition,
)

#### Running Phase 2i.

In [None]:
#papermill_description=Phase-2i-IQTree.ipynb

# Execute the Phase 2i notebook and keep the executed copy in save_dir.
phase_2i_notebook = 'Phase-2i-IQTree.ipynb'
phase_2i_log = pm.execute_notebook(
    input_path=f'{workflow_modules}/{phase_2i_notebook}',
    output_path='/'.join((save_dir, phase_2i_notebook)),
    parameters=phase_2i_params,
    progress_bar=True,
    nest_asyncio=True,
)


### Wait for IQtrees to be Built

In [None]:
# Load the run information from save_dir. The `with` block closes the file,
# so no explicit close() is needed afterwards.
with open(save_dir + '/pipeline_run_info.json', 'r') as file:
    pipeline_run_info = json.load(file)
xml_set_directories = pipeline_run_info["xml set directories"]

# Check the IQ-TREE output file of every xml set directory.
# NOTE(review): check_file_for_phrase presumably blocks until the output
# reports completion - confirm in beast_pype.workflow.
for directory in xml_set_directories.values():
    check_file_for_phrase(directory + '/iqtree.out')

# Phase 2i runtime covers the notebook execution and the wait above.
runtime_records.append({
    'Phase': 'Phase-2i-IQTree.ipynb',
    'Sample': None,
    'Chain': None,
    'Runtime': perf_counter() - phase_2i_start
})

### Phase 2ii: TreeTime & Down Sampling

#### Placing Phase 2ii Parameters in a Dictionary

In [None]:
# Start the Phase 2ii timer; the parameter dictionary itself is built per
# xml set directory inside the loop of the next cell.
phase_2ii_start = perf_counter()


#### Running Phase 2ii.

In [None]:
#papermill_description=Phase-2ii-TreeTime-and-Down-Sampling.ipynb
# Reload the run information. The `with` block closes the file, so no
# explicit close() is needed afterwards.
with open(save_dir + "/pipeline_run_info.json", "r") as file:
    pipeline_run_info = json.load(file)
xml_set_directories = pipeline_run_info['xml set directories']

# Run Phase 2ii once per xml set directory, saving the executed notebook there.
for sub_dir in xml_set_directories.values(): # This loop could and should be parallelised
    # Names are referenced directly instead of through eval().
    phase_2ii_params = {
        'save_dir': sub_dir,
        'sample_id_field': sample_id_field,
        'collection_date_field': collection_date_field,
        'down_sample_to': down_sample_to,
    }
    phase_2ii_log = pm.execute_notebook(input_path=f'{workflow_modules}/Phase-2ii-TreeTime-and-Down-Sampling.ipynb',
                                      output_path=f'{sub_dir}/Phase-2ii-TreeTime-and-Down-Sampling.ipynb',
                                      parameters=phase_2ii_params,
                                      progress_bar=True,
                                      nest_asyncio=True
                                     )

# A single runtime entry covers all xml sets.
runtime_records.append({
    'Phase': 'Phase-2ii-TreeTime-and-Down-Sampling.ipynb',
    'Sample': None,
    'Chain': None,
    'Runtime': perf_counter() - phase_2ii_start
})

## Phase 3 Generating BEAST xmls

### Running Phase 3

In [None]:
phase_3_start = perf_counter()

# Parameters assigned in this notebook's parameters cell. They are referenced
# directly instead of through eval().
phase_3_defined_params = {
    'template_xml_path': template_xml_path,
    'collection_date_field': collection_date_field,
    'origin_start_addition': origin_start_addition,
    'origin_upper_addition': origin_upper_addition,
    'chain_length': chain_length,
    'trace_log_every': trace_log_every,
    'tree_log_every': tree_log_every,
    'screen_log_every': screen_log_every,
    'store_state_every': store_state_every,
}

# These names are not assigned anywhere in this notebook, so looking them up
# unconditionally raised NameError. They are forwarded only when they exist
# (e.g. injected as extra parameters).
# NOTE(review): when omitted, the Phase 3 notebook presumably falls back to
# its own defaults - confirm against that notebook's parameters cell.
phase_3_optional_names = (
    'use_initial_tree',
    'initial_tree_path',
    'rt_dimensions',
    'log_file_basename',
    'origin_prior',
)
phase_3_optional_params = {
    name: globals()[name] for name in phase_3_optional_names if name in globals()
}

# Run Phase 3 once per xml set directory, saving the executed notebook there.
for sub_dir in xml_set_directories.values(): # This loop could and should be parallelised
    phase_3_params = {
        'save_dir': sub_dir,
        **phase_3_defined_params,
        **phase_3_optional_params,
    }
    phase_3_log = pm.execute_notebook(input_path=f'{workflow_modules}/Phase-3-Gen-BDSKY-xml.ipynb',
                                      output_path=f'{sub_dir}/Phase-3-Gen-BDSKY-xml.ipynb',
                                      parameters=phase_3_params,
                                      progress_bar=True,
                                      nest_asyncio=True
                                     )

# A single runtime entry covers all xml sets.
runtime_records.append({
    'Phase': 'Phase-3-Gen-BDSKY-xml.ipynb',
    'Sample': None,
    'Chain': None,
    'Runtime': perf_counter() - phase_3_start
})

## Phase 4 Running BEAST

BEAST's random number generator can select the same seed for multiple runs if they are launched close together in time (such as programmatically). Therefore let's use numpy to generate our seeds.

In [None]:
# Generate one seed per BEAST run (chains x xml sets) when none were supplied.
if seeds is None:
    from numpy.random import choice

    number_of_seeds = number_of_chains * len(xml_set_directories)
    # randint() samples with replacement and could hand two runs the same
    # seed, which is what this cell is meant to prevent. Sampling without
    # replacement guarantees distinct seeds. choice(n) draws from 0..n-1, so
    # adding 1 keeps the original range of 1..999999.
    seeds = (choice(int(1e6) - 1, size=number_of_seeds, replace=False) + 1).tolist()

Record seeds in pipeline_run_info json

In [None]:
# Add the seeds to pipeline_run_info.json. Each `with` block closes its file,
# so no explicit close() calls are needed.
with open(save_dir + "/pipeline_run_info.json", "r") as file:
    pipeline_run_info = json.load(file)
pipeline_run_info['seeds'] = seeds
with open(save_dir +'/pipeline_run_info.json', 'w') as fp:
    json.dump(pipeline_run_info, fp, sort_keys=True, indent=4)

### Placing Phase 4 Parameters in a Dictionary

In [None]:
# Common parameters plus the BEAST run settings handed to the Phase 4 notebook.
phase_4_params = dict(
    common_params,
    number_of_chains=number_of_chains,
    seeds=seeds,
    partition=partition,
    beast_threads=beast_threads,
    beast_ram=beast_ram,
)

### Running Phase 4.

In [None]:
#papermill_description=Phase-4-Running-BEAST.ipynb
# The description above previously read "Phase5-Running-BEAST.ipynb", which did
# not match the Phase 4 notebook executed here.
phase_4_log = pm.execute_notebook(input_path=f'{workflow_modules}/Phase-4-Running-BEAST.ipynb',
                                  output_path=save_dir + '/Phase-4-Running-BEAST.ipynb',
                                  parameters=phase_4_params,
                                  progress_bar=True,
                                  nest_asyncio=True
                                 )

## Add Slurm Job IDs and Names to pipeline_run_info.json.

In [None]:
# Copy the entries from the cached slurm_job_ids.txt (one per line) into
# pipeline_run_info.json. Each `with` block closes its file, so no explicit
# close() calls are needed.
with open(save_dir + "/pipeline_run_info.json", "r") as file:
    pipeline_run_info = json.load(file)
with open(cache_dir +'/slurm_job_ids.txt', 'r') as fp:
    entries = fp.read().splitlines()
pipeline_run_info['slurm job IDs'] = entries
with open(save_dir +'/pipeline_run_info.json', 'w') as fp:
    json.dump(pipeline_run_info, fp, sort_keys=True, indent=4)

## Phase 5: Diagnosing Outputs and Generate Report

Currently this has to be performed manually. That being said, the code cell below will parameterize a copy of the notebook ready to run. See below for its location.

In [None]:
# Generate the diagnostics notebook in save_dir, excluding the cache directory.
gen_xml_set_diag_notebook(save_dir, directories_to_exclude = [cache_dir])

# importlib.resources.path() returns a context manager, which does not support
# the '/' operator, so building the template path from it raised TypeError.
# files() returns a path-like object that can be joined with '/'.
report_template = (importlib_resources.files('beast_pype')
                   / 'report_templates'
                   / 'COVID-Surveillance-Report.ipynb')
phase_5_params = {'save_dir': None,
                  'report_template': str(report_template),
                  'add_unreported_fields': False,
                  'collection_date_field':collection_date_field
                 }

# Inject the parameters into the generated notebook and write it back in place.
phase_5_notebook_path = f'{save_dir}/Phase-5-Diagnosing_XML_sets_and_Generate_Report.ipynb'
phase_5_notebook = load_notebook_node(phase_5_notebook_path)
phase_5_notebook = parameterize_notebook(phase_5_notebook, phase_5_params)
write_ipynb(phase_5_notebook, phase_5_notebook_path)
print(f'Phase 5 notebook is ready for manual use at: \n{phase_5_notebook_path}')

## Recording Runtimes

Converting to pandas DataFrame and saving as CSV.

In [None]:
# Tabulate the recorded runtimes and write them, without the index, to save_dir.
runtimes_csv = '/'.join((save_dir, 'runtimes.csv'))
runtime_df = pd.DataFrame.from_records(runtime_records)
runtime_df.to_csv(runtimes_csv, index=False)

### Delete Cache directory

In [None]:
# Remove the cache directory and everything in it. slurm_job_ids.txt, read
# from it earlier, has already been copied into pipeline_run_info.json.
shutil.rmtree(cache_dir)