<a id="0"></a> <br>
 # Table of Contents  
1. [Packages and variables](#1)
1. [Files](#2)
1. [Initial population object](#3)
1. [Branching process](#4)
    1. [Parallel processing](#5)
1. [Shuffle](#6)
    1. [Parallel processing](#7)
    1. [Large object running](#8)
    1. [Local running](#9)
1. [Back projection: application](#10)
    1. [Parallel processing](#11)
    1. [Large object running](#12)
    1. [Local running](#13)
1. [Back projection: reformatting](#14)
    1. [Parallel processing](#15)
    1. [Large object running](#16)
    1. [Local running](#17)
1. [Inference](#18)

<a id="1"></a> 
##  packages and global variables

In [49]:
# import modules and global variables

from imp import reload
import sys, os
import shutil
import posixpath as ppath
from copy import deepcopy
import time
print('python version %s' % sys.version)

import numpy as np
print('numpy version %s' % np.__version__)

import scipy as sp
import scipy.stats as st
print('scipy version %s' % sp.__version__)

import pandas as pd
print('pandas version %s' % pd.__version__)

import matplotlib
from matplotlib import cm
import matplotlib.pyplot as plt
print('matplotlib version %s' % matplotlib.__version__)

import sklearn as sk
from sklearn.metrics import roc_auc_score
print('scikit-learn version %s' % sk.__version__)

# global variables
# replace the placeholder strings below with your own directories / accounts

USER_NAME         = 'YOUR_LOCAL_PATH'

# local directories (os.path, so separators follow the local operating system)
SIM_DIR           = os.path.join(USER_NAME, 'paper-epi-backprojection/src/simulations')
DATA_SIM_DIR      = os.path.join(SIM_DIR, 'data-sim')
SCRIPT_DIR        = os.path.join(SIM_DIR, 'simulation-scripts')
DATA_FILES        = os.path.join(DATA_SIM_DIR, 'data-npz')
STOC_LOC          = os.path.join(DATA_SIM_DIR, 'stoc-format')
BP_LOC_DIR        = os.path.join(DATA_SIM_DIR, 'back-projected-data-sim')
BP_LOC_DIR_WRITE  = os.path.join(DATA_SIM_DIR, 'back-projected-data-sim-write')

# cluster directory setup (posixpath, so separators stay '/' whatever the local OS is)
DATA_DATE         = 'RUN_DATE_HERE'
SSH_USER          = 'YOUR_CLUSTER_NAME'
# the scp commands printed by this notebook use f'{SSH_HOME}{SSH_DATA}' as the
# destination, so this value has to end with ':' (e.g. 'user@host:')
SSH_HOME          = 'YOUR_SSH_SERVER'
# address interpolated into the '#SBATCH --mail-user' line of every generated job script
USER_EMAIL        = 'YOUR_EMAIL'
SSH_DATA          = ppath.join('YOUR_CLUSTER_DIRECTORY', SSH_USER, 'simulations', DATA_DATE)
BP_PROJ_DIR       = ppath.join(SSH_DATA, 'back-projected-func-sim10000')
BP_PROJ_DIR_WRITE = ppath.join(SSH_DATA, 'back-projected-func-write-sim10000')
SCRATCH           = ppath.join(SSH_DATA, 'scratch')

python version 3.9.7 (default, Sep 16 2021, 08:50:36) 
[Clang 10.0.0 ]
numpy version 1.20.3
scipy version 1.7.1
pandas version 1.3.4
matplotlib version 3.4.3


### make directories

In [None]:
# Print the shell commands that create the run directory and the per-simulation
# back-projection write folders on the cluster.  The commands are chained with
# '&&' so the chain stops at the first failure; the last line has no '&&'.

N_WRITE_DIRS = 10000  # one df_<i> folder per simulation

print(f'mkdir -p {SSH_DATA} &&')

# Make back projection write folders on directory

#print(f'mkdir {BP_PROJ_DIR} &&')
print(f'mkdir {BP_PROJ_DIR_WRITE} &&')

# create write sub directories
# Paths are built from BP_PROJ_DIR_WRITE (defined in the globals cell) instead
# of a name that is only assigned in a later cell, so this cell runs on a
# fresh kernel.
dir_list = [ppath.join(BP_PROJ_DIR_WRITE, f'df_{i}') for i in range(N_WRITE_DIRS)]

print(' &&\n'.join(f'mkdir {dir_path}' for dir_path in dir_list))

<a id="2"></a>
## file naming

In [24]:
# base names for naming

sim_name = 'sim-samp100-T100-sim100-redo'
inf_name = 'inf-samp100-T100-sim100-redo'

# initial population archive read by the branching script (-i)
init_file = os.path.join(SIM_DIR, "initial_sel_pop.npz")

# scripts and out files

# scripts

sim_script = os.path.join(SCRIPT_DIR, 'branching.py')
inf_script = os.path.join(SCRIPT_DIR, 'epi-infer-multiple.py')

# file containing the stochastic branching process simulation
# (local copy uses os.path; the cluster copy uses posixpath like the other
# SSH_* paths so it keeps '/' separators on any local OS)

sim_out       = os.path.join(SIM_DIR, sim_name + '-stochastic')
sim_clust     = ppath.join(SSH_DATA, sim_name + '-stochastic')
sim_npz       = sim_out + '.npz'
sim_npz_clust = sim_clust + '.npz'

# file containing the inference from the branching process

inf_out    = os.path.join(SIM_DIR, inf_name + '-stochastic')

# shuffling files

sim_shuff_out = os.path.join(SIM_DIR, sim_name + '-shuffled')
inf_shuff_out = os.path.join(SIM_DIR, inf_name + '-shuffled')

# back-projection files

bp_script = 'back_projection_ems.py'
incubation_file = 'incubation_period_sars2.csv'
smoothing_file = 'weights_binom.csv'

# local copies of the back-projection script and its two input tables
bp_script_path = os.path.join(SIM_DIR, bp_script)
inc_path = os.path.join(SIM_DIR, incubation_file)
smooth_path = os.path.join(SIM_DIR, smoothing_file)

### Transfer files to the cluster

In [None]:
# Print one scp command per local file that the cluster jobs need; every
# command except the last ends with '&&' so the lines can be pasted as a chain.
transfer_paths = [sim_script, inf_script, bp_script_path, inc_path, smooth_path]
last_idx = len(transfer_paths) - 1

for idx, local_path in enumerate(transfer_paths):
    chain = ' &&' if idx < last_idx else ''
    print(f'scp {local_path} {SSH_HOME}{SSH_DATA}{chain}')

<a id="3"></a> 
## initial population object

<div class="alert alert-block alert-success">
<b>ALREADY RUN</b> can edit parameters for other initial files
</div>

In [None]:
# make initial population

# the starting population sizes for 6 variants
init_pop = [1000, 1000, 1000, 1000,  1000,  1000]
# the initial sequences where numbers specify location on the genome of a mutation
init_seqs = [[0],  [1],  [2],  [3],   [4],   [5]]
# the selection coefficients for the mutations, in order of site numbers
init_sel = [0,    0, 0.03, 0.03, -0.03, -0.03]

# write the three arrays into one compressed archive under the keys
# 'counts', 'sequences' and 'selection'

init_file = os.path.join(SIM_DIR, "initial_sel_pop.npz")
# context manager closes the handle even if savez_compressed raises
with open(init_file, mode='wb') as f:
    np.savez_compressed(f, counts=init_pop,
                        sequences=init_seqs, selection=init_sel)


<a id="4"></a>
## run branching process

<div class="alert alert-block alert-success">
<b>ALREADY RUN</b>
</div>

In [25]:
# Build (without executing) the %run line for the branching-process script so the
# fully expanded command can be inspected in the next cell.
command = f"%run {sim_script} -o {sim_out} --simulations 100 --pop_limit 10000 --sample 100 --mu 0 -T 100 -i {init_file}"

In [26]:
command

'%run /Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/simulation-scripts/branching.py -o /Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/sim-samp100-T100-sim100-redo-stochastic --simulations 100 --pop_limit 10000 --sample 100 --mu 0 -T 100 -i /Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/initial_sel_pop.npz'

In [28]:
# your global variables here
# Runs branching.py passing only the -o option.
# NOTE(review): the -o target is named initial_sel_pop.npz, the same file name that
# other cells pass to -i as the initial population -- confirm this is intended.
%run {SCRIPT_DIR}/branching.py -o {DATA_FILES}/initial_sel_pop.npz

  return array(a, dtype, copy=False, order=order, subok=True)


In [None]:
#%%time

# run branching process script for 100 simulations
# -o: output base name (sim_out), -i: initial population archive (init_file);
# the remaining flags are passed to branching.py unchanged

%run {sim_script} -o {sim_out} --simulations 100 --pop_limit 10000 --sample 100 --mu 0 -T 100 \
-i {init_file}

<a id="5"></a>
### run branching process, parallel

In [None]:
%%time

# run branching process for 1000 simulations, 10 times

init_file = os.path.join(SIM_DIR, "initial_sel_pop.npz")

sim_script = os.path.join(SCRIPT_DIR, 'branching.py')

sim_name = 'sim-samp100-T100-sim1000'

sims = []

for i in range(10):
    sim_out_i = os.path.join(SIM_DIR, sim_name + f'-stochastic-{i}')
    sims.append(sim_out_i)
    
for i in sims:
    %run {sim_script} -o {i} --simulations 1000 --pop_limit 10000 --sample 100 --mu 0 -T 100 \
    -i {init_file}

### load branching simulation for checks
#### optional

In [15]:
def _load_sim_npz(npz_path):
    """Read one simulation archive and return its five arrays.

    Returns a tuple (times, mutant_sites_all, simulations, full_nVec, full_sVec),
    taken from the archive keys of the same names.  The archive is opened with
    allow_pickle=True, so only load files you produced yourself.
    """
    keys = ('times', 'mutant_sites_all', 'simulations', 'full_nVec', 'full_sVec')
    with np.load(npz_path, allow_pickle=True) as archive:
        return tuple(archive[key] for key in keys)


# stochastic branching output
(times, mutant_sites_all, simulations,
 full_nVec, full_sVec) = _load_sim_npz('sim-samp100-T100-sim10000-stochastic.npz')

# shuffled output
(times_sh, mutant_sites_all_sh, simulations_sh,
 full_nVec_sh, full_sVec_sh) = _load_sim_npz('sim-samp100-T100-sim10000-shuffled.npz')

# back-projected output
(times_bp, mutant_sites_all_bp, simulations_bp,
 full_nVec_bp, full_sVec_bp) = _load_sim_npz('sim-samp100-T100-sim10000-backprojected.npz')

<a id="6"></a>
## shuffle up times
the script will create an npz file and split up alleles for each simulation into subfolders for backprojection

<a id="7"></a>
### parallel running

In [None]:
# Print the mkdir commands for the ten shuffle output directories
# (back-projected-sim1000-0 ... back-projected-sim1000-9) on the cluster.

back_dir_str       = 'back-projected-sim1000'

# One entry per run index.  Each directory is listed exactly once: a repeated
# 'mkdir X &&' fails on the already-existing directory and stops the chain.
dirs_list = [f'{back_dir_str}-{i}' for i in range(10)]

# '&&' between commands, none after the last one
print(' &&\n'.join(f'mkdir {ppath.join(SSH_DATA, name)}' for name in dirs_list))

In [None]:
# parallel
# Writes a SLURM array job (tasks 0-9) that runs stochastic-shuffle.py once per
# simulation archive, then prints the scp / sbatch commands to paste in a shell.

script_file            = 'stochastic-shuffle.py'
script_path            = ppath.join(SIM_DIR, script_file)
script_path_clust      = ppath.join(SSH_DATA, script_file)
sim_read_str       = 'sim-samp100-T100-sim1000-stochastic'
back_dir_str       = 'back-projected-sim1000'

job_file         = 'job-shuff-1000-array.sh'
# PROJ_DIRS matches the '<back_dir_str>-<index>' directories.  The '[0-9]'
# keeps the glob from also matching '<back_dir_str>-write-<index>', which would
# shift the array-index -> directory mapping.
job_str          = f"""#!/bin/bash -l
#!/bin/bash
#SBATCH --job-name=stochastic-shuffle
#SBATCH --output=stoc-shuff-%a.out
#SBATCH --error=stoc-shuff-%a.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=16G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time=04:00:00
#SBATCH --array=0-9

module unload miniconda2
module load miniconda3
conda activate sars-env

export PYTHONUNBUFFERED=1

# scripts takes the file path of the directory as input parameter

npz_files=({SSH_DATA}/{sim_read_str}-*.npz) # Create a Bash array with all npz files
npz_base=({sim_read_str}-*)
PROJ_DIRS=({SSH_DATA}/{back_dir_str}-[0-9]*/)

# Feed the script with the data directory and file names corresponding to the task ID in the array

python {script_file} -npz_file "${{npz_files[$SLURM_ARRAY_TASK_ID]}}" -npz_base_name "${{npz_base[$SLURM_ARRAY_TASK_ID]}}" -PROJ_DIR "${{PROJ_DIRS[$SLURM_ARRAY_TASK_ID]}}"
"""

job_path = os.path.join(SIM_DIR, job_file)

# context manager closes the job file even if the write fails
with open(job_path, mode='w') as f:
    f.write('%s\n' % job_str)

sim_name = 'sim-samp100-T100-sim1000'

# local archives produced by the parallel branching runs, one per run index
sims = [os.path.join(SIM_DIR, sim_name + f'-stochastic-{i}.npz') for i in range(10)]

# transfer npz file
for npz_path in sims:
    print(f'scp {npz_path} {SSH_HOME}{SSH_DATA} &&')

# transfer script (left disabled, as in the original cell; enable if
# stochastic-shuffle.py is not on the cluster yet)

#print(f'scp {script_path} {SSH_HOME}{SSH_DATA} &&')

# transfer job file

print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

print('')

# commands to execute on the cluster

print('sbatch %s' % job_file)
print('')


<a id="8"></a>
### large object running, 10000 simulations

In [None]:
# Writes a single SLURM job that runs stochastic-shuffle.py on one large
# simulation archive, then prints the scp / sbatch commands to paste in a shell.
# NOTE(review): sim_npz_clust and sim_name come from earlier cells, and
# sim_name is reassigned by several of them -- confirm both point at the
# intended 10000-simulation run before generating the job.

script_file            = 'stochastic-shuffle.py'
script_path            = ppath.join(SIM_DIR, script_file)
script_path_clust      = ppath.join(SSH_DATA, script_file)


job_file         = 'job-shuff-10000.sh'
job_str          = f"""#!/bin/bash -l
#!/bin/bash
#SBATCH --job-name=stochastic-shuffle
#SBATCH --output=stoc-shuff.out
#SBATCH --error=stoc-shuff.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=64G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time=2-0

module unload miniconda2
module load miniconda3
conda activate sars-env

export PYTHONUNBUFFERED=1

python {script_file} -npz_file {sim_npz_clust} -npz_base_name {sim_name} -PROJ_DIR {BP_PROJ_DIR}
"""

job_path = os.path.join(SIM_DIR, job_file)

# context manager closes the job file even if the write fails
with open(job_path, mode='w') as f:
    f.write('%s\n' % job_str)


# transfer npz file

print(f'scp {sim_npz} {SSH_HOME}{SSH_DATA} &&')

# transfer script

print(f'scp {script_path} {SSH_HOME}{SSH_DATA} &&')

# transfer job file

print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

print('')

# commands to execute on the cluster

print('sbatch %s' % job_file)
print('')


<a id="9"></a>
### local running
Slow for large objects: not optimized for more than 1000 simulation files.

To do: speed up the code for larger objects.

In [50]:
# Local run of the stochastic-reformat script on the simulation archive (sim_npz),
# with STOC_LOC passed as -PROJ_DIR.  The captured output below shows it saving
# one dataframe per variant index into df_<n>_ref sub-directories.
%run stochastic-reformat -npz_file {sim_npz} -npz_base_name {sim_name} -PROJ_DIR {STOC_LOC}

Simulation: 0	Time taken: 124.504ms
Simulation: 1	Time taken: 116.282ms
Simulation: 2	Time taken: 115.541ms
Simulation: 3	Time taken: 120.890ms
Simulation: 4	Time taken: 123.110ms
Simulation: 5	Time taken: 124.987ms
Simulation: 6	Time taken: 129.596ms
Simulation: 7	Time taken: 129.969ms
Simulation: 8	Time taken: 131.553ms
Simulation: 9	Time taken: 138.648ms
Simulation: 10	Time taken: 134.168ms
Simulation: 11	Time taken: 137.995ms
Simulation: 12	Time taken: 136.160ms
Simulation: 13	Time taken: 141.598ms
Simulation: 14	Time taken: 141.017ms
Simulation: 15	Time taken: 144.093ms
Simulation: 16	Time taken: 145.589ms
Simulation: 17	Time taken: 146.257ms
Simulation: 18	Time taken: 142.678ms
Simulation: 19	Time taken: 148.905ms
Simulation: 20	Time taken: 150.880ms
Simulation: 21	Time taken: 150.084ms
Simulation: 22	Time taken: 154.483ms
Simulation: 23	Time taken: 156.113ms
Simulation: 24	Time taken: 153.676ms
Simulation: 25	Time taken: 161.389ms
Simulation: 26	Time taken: 164.114ms
Simulation:

Saved dataframe_[1] to the df_22_ref sub-directory
Saved dataframe_[2] to the df_22_ref sub-directory
Saved dataframe_[3] to the df_22_ref sub-directory
Saved dataframe_[4] to the df_22_ref sub-directory
Saved dataframe_[5] to the df_22_ref sub-directory
Saved dataframe_[0] to the df_23_ref sub-directory
Saved dataframe_[1] to the df_23_ref sub-directory
Saved dataframe_[2] to the df_23_ref sub-directory
Saved dataframe_[3] to the df_23_ref sub-directory
Saved dataframe_[4] to the df_23_ref sub-directory
Saved dataframe_[5] to the df_23_ref sub-directory
Saved dataframe_[0] to the df_24_ref sub-directory
Saved dataframe_[1] to the df_24_ref sub-directory
Saved dataframe_[2] to the df_24_ref sub-directory
Saved dataframe_[3] to the df_24_ref sub-directory
Saved dataframe_[4] to the df_24_ref sub-directory
Saved dataframe_[5] to the df_24_ref sub-directory
Saved dataframe_[0] to the df_25_ref sub-directory
Saved dataframe_[1] to the df_25_ref sub-directory
Saved dataframe_[2] to the df_2

Saved dataframe_[0] to the df_56_ref sub-directory
Saved dataframe_[1] to the df_56_ref sub-directory
Saved dataframe_[2] to the df_56_ref sub-directory
Saved dataframe_[3] to the df_56_ref sub-directory
Saved dataframe_[4] to the df_56_ref sub-directory
Saved dataframe_[5] to the df_56_ref sub-directory
Saved dataframe_[0] to the df_57_ref sub-directory
Saved dataframe_[1] to the df_57_ref sub-directory
Saved dataframe_[2] to the df_57_ref sub-directory
Saved dataframe_[3] to the df_57_ref sub-directory
Saved dataframe_[4] to the df_57_ref sub-directory
Saved dataframe_[5] to the df_57_ref sub-directory
Saved dataframe_[0] to the df_58_ref sub-directory
Saved dataframe_[1] to the df_58_ref sub-directory
Saved dataframe_[2] to the df_58_ref sub-directory
Saved dataframe_[3] to the df_58_ref sub-directory
Saved dataframe_[4] to the df_58_ref sub-directory
Saved dataframe_[5] to the df_58_ref sub-directory
Saved dataframe_[0] to the df_59_ref sub-directory
Saved dataframe_[1] to the df_5

Saved dataframe_[1] to the df_87_ref sub-directory
Saved dataframe_[2] to the df_87_ref sub-directory
Saved dataframe_[3] to the df_87_ref sub-directory
Saved dataframe_[4] to the df_87_ref sub-directory
Saved dataframe_[5] to the df_87_ref sub-directory
Saved dataframe_[0] to the df_88_ref sub-directory
Saved dataframe_[1] to the df_88_ref sub-directory
Saved dataframe_[2] to the df_88_ref sub-directory
Saved dataframe_[3] to the df_88_ref sub-directory
Saved dataframe_[4] to the df_88_ref sub-directory
Saved dataframe_[5] to the df_88_ref sub-directory
Saved dataframe_[0] to the df_89_ref sub-directory
Saved dataframe_[1] to the df_89_ref sub-directory
Saved dataframe_[2] to the df_89_ref sub-directory
Saved dataframe_[3] to the df_89_ref sub-directory
Saved dataframe_[4] to the df_89_ref sub-directory
Saved dataframe_[5] to the df_89_ref sub-directory
Saved dataframe_[0] to the df_90_ref sub-directory
Saved dataframe_[1] to the df_90_ref sub-directory
Saved dataframe_[2] to the df_9

In [31]:
# Local run of the stochastic-shuffle script on the simulation archive (sim_npz),
# with BP_LOC_DIR passed as -PROJ_DIR.  The captured output below shows it saving
# one dataframe per variant index into df_<n> sub-directories.
%run stochastic-shuffle -npz_file {sim_npz} -npz_base_name {sim_name} -PROJ_DIR {BP_LOC_DIR}


Simulation: 0	Time taken: 238.277ms
Simulation: 1	Time taken: 228.576ms
Simulation: 2	Time taken: 229.582ms
Simulation: 3	Time taken: 232.523ms
Simulation: 4	Time taken: 235.248ms
Simulation: 5	Time taken: 236.998ms
Simulation: 6	Time taken: 237.149ms
Simulation: 7	Time taken: 239.234ms
Simulation: 8	Time taken: 242.174ms
Simulation: 9	Time taken: 243.788ms
Simulation: 10	Time taken: 244.645ms
Simulation: 11	Time taken: 246.423ms
Simulation: 12	Time taken: 250.645ms
Simulation: 13	Time taken: 253.553ms
Simulation: 14	Time taken: 252.557ms
Simulation: 15	Time taken: 255.792ms
Simulation: 16	Time taken: 259.693ms
Simulation: 17	Time taken: 261.264ms
Simulation: 18	Time taken: 262.495ms
Simulation: 19	Time taken: 264.617ms
Simulation: 20	Time taken: 265.426ms
Simulation: 21	Time taken: 274.008ms
Simulation: 22	Time taken: 268.800ms
Simulation: 23	Time taken: 274.131ms
Simulation: 24	Time taken: 274.968ms
Simulation: 25	Time taken: 278.865ms
Simulation: 26	Time taken: 278.090ms
Simulation:

Saved dataframe_[1] to the df_22 sub-directory
Saved dataframe_[2] to the df_22 sub-directory
Saved dataframe_[0] to the df_22 sub-directory
Saved dataframe_[3] to the df_22 sub-directory
Saved dataframe_[4] to the df_22 sub-directory
Saved dataframe_[5] to the df_22 sub-directory
Saved dataframe_[2] to the df_23 sub-directory
Saved dataframe_[0] to the df_23 sub-directory
Saved dataframe_[1] to the df_23 sub-directory
Saved dataframe_[3] to the df_23 sub-directory
Saved dataframe_[4] to the df_23 sub-directory
Saved dataframe_[5] to the df_23 sub-directory
Saved dataframe_[2] to the df_24 sub-directory
Saved dataframe_[4] to the df_24 sub-directory
Saved dataframe_[5] to the df_24 sub-directory
Saved dataframe_[0] to the df_24 sub-directory
Saved dataframe_[1] to the df_24 sub-directory
Saved dataframe_[3] to the df_24 sub-directory
Saved dataframe_[2] to the df_25 sub-directory
Saved dataframe_[3] to the df_25 sub-directory
Saved dataframe_[4] to the df_25 sub-directory
Saved datafra

Saved dataframe_[1] to the df_57 sub-directory
Saved dataframe_[2] to the df_57 sub-directory
Saved dataframe_[5] to the df_57 sub-directory
Saved dataframe_[0] to the df_57 sub-directory
Saved dataframe_[4] to the df_57 sub-directory
Saved dataframe_[3] to the df_57 sub-directory
Saved dataframe_[0] to the df_58 sub-directory
Saved dataframe_[1] to the df_58 sub-directory
Saved dataframe_[2] to the df_58 sub-directory
Saved dataframe_[3] to the df_58 sub-directory
Saved dataframe_[4] to the df_58 sub-directory
Saved dataframe_[5] to the df_58 sub-directory
Saved dataframe_[3] to the df_59 sub-directory
Saved dataframe_[4] to the df_59 sub-directory
Saved dataframe_[0] to the df_59 sub-directory
Saved dataframe_[1] to the df_59 sub-directory
Saved dataframe_[2] to the df_59 sub-directory
Saved dataframe_[5] to the df_59 sub-directory
Saved dataframe_[5] to the df_60 sub-directory
Saved dataframe_[4] to the df_60 sub-directory
Saved dataframe_[0] to the df_60 sub-directory
Saved datafra

Saved dataframe_[2] to the df_87 sub-directory
Saved dataframe_[3] to the df_87 sub-directory
Saved dataframe_[1] to the df_87 sub-directory
Saved dataframe_[4] to the df_87 sub-directory
Saved dataframe_[5] to the df_87 sub-directory
Saved dataframe_[0] to the df_87 sub-directory
Saved dataframe_[4] to the df_88 sub-directory
Saved dataframe_[0] to the df_88 sub-directory
Saved dataframe_[3] to the df_88 sub-directory
Saved dataframe_[5] to the df_88 sub-directory
Saved dataframe_[1] to the df_88 sub-directory
Saved dataframe_[2] to the df_88 sub-directory
Saved dataframe_[1] to the df_89 sub-directory
Saved dataframe_[0] to the df_89 sub-directory
Saved dataframe_[2] to the df_89 sub-directory
Saved dataframe_[3] to the df_89 sub-directory
Saved dataframe_[4] to the df_89 sub-directory
Saved dataframe_[5] to the df_89 sub-directory
Saved dataframe_[3] to the df_90 sub-directory
Saved dataframe_[1] to the df_90 sub-directory
Saved dataframe_[2] to the df_90 sub-directory
Saved datafra

In [None]:
# NOTE(review): this only rebuilds the branching-process %run string (same text as
# the command cell in the branching section) and does not execute anything;
# it looks like a leftover in the shuffle section -- confirm it can be removed.
command = f"%run {sim_script} -o {sim_out} --simulations 100 --pop_limit 10000 --sample 100 --mu 0 -T 100 -i {init_file}"

<a id="10"></a>
## back projection application

<a id="11"></a>
### parallel with 10 shuffled parent directories

In [None]:
# Print the mkdir commands for the df_<j> write folders under
# back-projected-func-write-sim10000 on the cluster.
# NOTE(review): the parallel job cell that follows writes to
# 'back-projected-sim1000-write-<i>/df_*'; confirm whether those per-batch
# folders should be created here instead of (or in addition to) these.

back_dir_w_str       = 'back-projected-func-write-sim10000'

# One entry per simulation index.  Each directory is listed exactly once: a
# repeated 'mkdir X &&' fails on the already-existing directory and stops the chain.
dirs_list = [back_dir_w_str + f'/df_{j}' for j in range(10000)]

# '&&' between commands, none after the last one
print(' &&\n'.join(f'mkdir {ppath.join(SSH_DATA, name)}' for name in dirs_list))

In [None]:
# parallel
# Writes ten SLURM array jobs (one per shuffled parent directory, 1000 tasks
# each) that run back_projection_ems.py on every df_* folder, and prints the
# sbatch command for each job file.

script_file      = 'back_projection_ems.py'
script_path      = ppath.join(SSH_DATA, script_file)
bp_read_str       = 'back-projected-sim1000'
bp_write_str    =  'back-projected-sim1000-write'

# per-batch parent directory names: <base>-<batch index>
dirs_read  = [bp_read_str + f'-{i}' for i in range(10)]
dirs_write = [bp_write_str + f'-{i}' for i in range(10)]

for i in range(10):
    job_file         = f'job-bp-sim-{i}.sh'
    # The script text is kept flush-left on purpose: sbatch only honours
    # '#SBATCH' directives that start at the beginning of a line, so indenting
    # the template with the loop body would silently drop every option.
    job_str          = f"""#!/bin/bash -l
#!/bin/bash
#SBATCH --job-name=back-projection-sim-{i}
#SBATCH --output=bp-sim-{i}_%a.out
#SBATCH --error=bp-sim-{i}_%a.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time 03:00:00
#SBATCH --array=0-999

# scripts takes the file path of the directory as input parameter

DIRS=({SSH_DATA}/{dirs_read[i]}/df_*/) # Create a Bash array with all data directories
WRITE_DIRS=({SSH_DATA}/{dirs_write[i]}/df_*/)

module unload miniconda2
module load miniconda3
conda activate sars-env

# Feed the script with the data directory corresponding to the task ID in the array

python {script_file} -read_dir "${{DIRS[$SLURM_ARRAY_TASK_ID]}}" -incubation_file 'incubation_period_sars2.csv' -smoothing_file 'weights_binom.csv' -write_dir "${{WRITE_DIRS[$SLURM_ARRAY_TASK_ID]}}"
"""

    job_path = os.path.join(SIM_DIR, job_file)

    # context manager closes the job file even if the write fails
    with open(job_path, mode='w') as f:
        f.write('%s\n' % job_str)

    #print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

    #print('')

    # commands to execute on the cluster
    print('sbatch %s' % job_file)
    print('')

<a id="12"></a>
### large object, from 10000 shuffled simulations

In [None]:
# not recommended
# note: previous cluster had cutoff of 2500 array jobs
# Writes one SLURM array job that runs back_projection_ems.py on a slice of
# the df_* folders, then prints the scp / sbatch commands to paste in a shell.

script_file      = 'back_projection_ems.py'
script_path      = ppath.join(SSH_DATA, script_file)
bp_read_str       = 'back-projected-func-sim10000'
bp_write_str      = 'back-projected-func-write-sim10000'

# Task-ID slice submitted by this job file; each task ID indexes into the
# DIRS / WRITE_DIRS bash arrays.  Change it and re-run the cell to generate
# the job for another slice.
array_range      = '7500-9999'

job_file         = 'job-bp-sim-10000-d.sh'
job_str          = f"""#!/bin/bash -l
#!/bin/bash
#SBATCH --job-name=back-projection-sim
#SBATCH --output=bp-sim_%a.out
#SBATCH --error=bp-sim_%a.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=1G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time 03:00:00
#SBATCH --array={array_range}

# scripts takes the file path of the directory as input parameter

DIRS=({SSH_DATA}/{bp_read_str}/df_*/) # Create a Bash array with all data directories
WRITE_DIRS=({SSH_DATA}/{bp_write_str}/df_*/)

module unload miniconda2
module load miniconda3
conda activate sars-env

# Feed the script with the data directory corresponding to the task ID in the array

python {script_file} -read_dir "${{DIRS[$SLURM_ARRAY_TASK_ID]}}" -incubation_file 'incubation_period_sars2.csv' -smoothing_file 'weights_binom.csv' -write_dir "${{WRITE_DIRS[$SLURM_ARRAY_TASK_ID]}}"
"""

job_path = os.path.join(SIM_DIR, job_file)

# context manager closes the job file even if the write fails
with open(job_path, mode='w') as f:
    f.write('%s\n' % job_str)

print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

print('')

# commands to execute on the cluster
print('sbatch %s' % job_file)
print('')


<a id="13"></a>
### local running

In [36]:
# Local read / write roots for the back-projection test runs.
# These rebind BP_LOC_DIR / BP_LOC_DIR_WRITE from the globals cell to the
# 'back-projected-redo' folders directly under SIM_DIR.
BP_LOC_DIR        = os.path.join(SIM_DIR, 'back-projected-redo')
BP_LOC_DIR_WRITE  = os.path.join(SIM_DIR, 'back-projected-redo-write')

# df_0 pair: the names the command-building cell below interpolates
READ_0 = os.path.join(BP_LOC_DIR, 'df_0')
WRITE_0 = os.path.join(BP_LOC_DIR_WRITE, 'df_0')

# df_1 pair, kept for any existing use
READ_1 = os.path.join(BP_LOC_DIR, 'df_1')
WRITE_1 = os.path.join(BP_LOC_DIR_WRITE, 'df_1')

In [33]:
# Build (without executing) the %run line for one local back-projection call so the
# fully expanded command can be inspected in the next cell.
command = f"%run {bp_script} -read_dir {READ_0} -write_dir {WRITE_0} -incubation_file {incubation_file} -smoothing_file {smoothing_file}"

In [34]:
command

'%run back_projection_ems.py -read_dir /Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/back-projected-redo/df_0 -write_dir /Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/back-projected-redo-write/df_0 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv'

#### example

In [42]:
%%time

%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_6 -write_dir {BP_LOC_WRITE_DIR}/df_6 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")
%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_7 -write_dir {BP_LOC_WRITE_DIR}/df_7 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")
%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_8 -write_dir {BP_LOC_WRITE_DIR}/df_8 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")
%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_9 -write_dir {BP_LOC_WRITE_DIR}/df_9 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")
%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_10 -write_dir {BP_LOC_WRITE_DIR}/df_10 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")
%run back_projection_ems.py -read_dir {BP_LOC_DIR}/df_11 -write_dir {BP_LOC_WRITE_DIR}/df_11 -incubation_file incubation_period_sars2.csv -smoothing_file weights_binom.csv
print("done")

/Users/liz/Documents/back_projection_home/backprojection-SARS-CoV-2-main/simulations/back-projected-redo/df_6
iter	epsilon
1	4.43e+03
2	2.14e-02
3	7.83e-03
4	3.63e-03
5	1.84e-03
6	9.92e-04
7	5.46e-04
8	3.06e-04
9	1.74e-04
10	1.00e-04
11	5.75e-05
iter	epsilon
1	2.42e+03
2	3.66e-02
3	1.30e-02
4	6.80e-03
5	3.76e-03
6	2.09e-03
7	1.18e-03
8	6.63e-04
9	3.76e-04
10	2.15e-04
11	1.24e-04
12	7.24e-05
iter	epsilon
1	6.18e+02
2	5.07e-02
3	2.07e-02
4	1.09e-02
5	6.17e-03
6	3.54e-03
7	2.06e-03
8	1.21e-03
9	7.14e-04
10	4.24e-04
11	2.54e-04
12	1.52e-04
13	9.14e-05
iter	epsilon
1	3.97e+02
2	6.32e-02
3	2.66e-02
4	1.33e-02
5	7.14e-03
6	3.90e-03
7	2.16e-03
8	1.21e-03
9	6.80e-04
10	3.86e-04
11	2.21e-04
12	1.27e-04
13	7.37e-05
iter	epsilon
1	5.27e+02
2	7.30e-02
3	2.46e-02
4	9.71e-03
5	4.55e-03
6	2.52e-03
7	1.46e-03
8	8.74e-04
9	5.25e-04
10	3.16e-04
11	1.91e-04
12	1.15e-04
13	7.00e-05
iter	epsilon
1	1.70e+03
2	2.73e-02
3	1.05e-02
4	5.21e-03
5	2.74e-03
6	1.48e-03
7	8.13e-04
8	4.58e-04
9	2.68e-04
10	1.59e-04
11

<a id="14"></a>
## back projection reformatting

<a id="15"></a>
### parallel for 10 parent directories
to aid with timing issue in large objects

not finished with testing

In [None]:
# parallel (untested)
# Writes ten SLURM jobs that run back-projected-npz.py and prints the
# scp / sbatch commands for every generated job file.
# NOTE(review): all ten jobs receive the same -npz_file / -npz_base_name /
# -WRITE_DIR arguments; dirs_read / dirs_write are built but not used in the
# template.  Confirm the intended per-batch arguments before submitting.

script_file            = 'back-projected-npz.py'
#script_path            = ppath.join(SIM_DIR, script_file)
#script_path_clust      = ppath.join(SSH_DATA, script_file)

bp_read_str       = 'back-projected-sim1000'
bp_write_str    =  'back-projected-sim1000-write'

# per-batch parent directory names: <base>-<batch index>
dirs_read  = [bp_read_str + f'-{i}' for i in range(10)]
dirs_write = [bp_write_str + f'-{i}' for i in range(10)]

for i in range(10):

    job_file         = f'job-bp-npz-1000-{i}.sh'
    job_str          = f"""#!/bin/bash -l
#SBATCH --job-name=bp-npz-1000-{i}
#SBATCH --output=bp-npz-1000-{i}.out
#SBATCH --error=bp-npz-1000-{i}.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=32G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time=1-0

module unload miniconda2
module load miniconda3
conda activate sars-env

python {script_file} -npz_file {sim_npz_clust} -npz_base_name {sim_name} -WRITE_DIR {BP_PROJ_DIR_WRITE}
"""

    job_path = os.path.join(SIM_DIR, job_file)

    # context manager closes the job file even if the write fails
    with open(job_path, mode='w') as f:
        f.write('%s\n' % job_str)

    #print(f'scp {script_path} {SSH_HOME}{SSH_DATA} &&')

    # transfer and submit every job file, not only the last one generated
    print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

    print('')

    # commands to execute on the cluster
    print('sbatch %s' % job_file)
    print('')

<a id="16"></a>
### large object reformatting
formats the large object back projected output, runs relatively quickly

In [None]:
# Writes a single SLURM job that runs back-projected-npz.py on the large
# back-projected output, then prints the scp / sbatch commands to paste in a shell.

script_file            = 'back-projected-npz.py'
script_path            = ppath.join(SIM_DIR, script_file)
script_path_clust      = ppath.join(SSH_DATA, script_file)


job_file         = 'job-bp-npz-format.sh'
job_str          = f"""#!/bin/bash -l
#!/bin/bash
#SBATCH --job-name=bp-npz-10000
#SBATCH --output=bp-npz.out
#SBATCH --error=bp-npz.err
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=64G
#SBATCH --mail-user={USER_EMAIL}
#SBATCH --mail-type=ALL
#SBATCH --time=3-0

module unload miniconda2
module load miniconda3
conda activate sars-env

python {script_file} -npz_file {sim_npz_clust} -npz_base_name {sim_name} -WRITE_DIR {BP_PROJ_DIR_WRITE}
"""

job_path = os.path.join(SIM_DIR, job_file)

# context manager closes the job file even if the write fails
with open(job_path, mode='w') as f:
    f.write('%s\n' % job_str)

# transfer script, then job file
print(f'scp {script_path} {SSH_HOME}{SSH_DATA} &&')

print(f'scp {job_path} {SSH_HOME}{SSH_DATA}')

print('')

# commands to execute on the cluster
print('sbatch %s' % job_file)
print('')

<a id="17"></a>
### local running reformatting


In [None]:
# Local run of the back-projected-npz reformatting script on the simulation
# archive (sim_npz).
# NOTE(review): WRITE_TEST is not assigned in any visible cell of this notebook;
# define it (presumably the local back-projection write directory -- TODO
# confirm) before running this cell.
%run back-projected-npz -npz_file {sim_npz} -npz_base_name {sim_name} -WRITE_DIR {WRITE_TEST}

<a id="18"></a>
# run inferences
### local running on transferred cluster files

#### to do: smarter file renaming
below is a bandaid command line script

In [None]:
%%bash
# this bash command renames the badly named dataframe directories:
# df_<n>... becomes df_<n zero-padded to 4 digits>, so that shell globs list
# the folders in numeric order.
# The %%bash magic is required for this cell to run under the Python kernel.
# It acts on the current working directory; cd into the folder that holds the
# df_* directories first.

for d in df_*; do
    # nothing matched: the unexpanded pattern is not a real path
    [ -e "$d" ] || continue
    new_name=$(echo "$d" | awk -F_ '{printf "df_%04d", $2}')
    # already padded: moving a directory onto itself is an error
    if [ "$d" != "$new_name" ]; then
        mv "$d" "$new_name"
    fi
done

### inference running

In [None]:
%%time

sim_name = 'sim-samp100-T100-sim10000'
inf_name = 'inf-samp100-T100-sim10000'


sim_out       = os.path.join(SIM_DIR, sim_name + '-stochastic')
inf_out       = os.path.join(SIM_DIR, inf_name + '-stochastic')

# shuffling files

sim_shuff_out = os.path.join(SIM_DIR, sim_name + '-shuffled')
inf_shuff_out = os.path.join(SIM_DIR, inf_name + '-shuffled')


sim_bp_out = os.path.join(SIM_DIR, sim_name + '-backprojected')
inf_bp_out = os.path.join(SIM_DIR, sim_name + '-backprojected')

%run {inf_script} --data {sim_out + '.npz'} -o {inf_out} -R 2 --pop_size 10000
%run {inf_script} --data {sim_shuff_out + '.npz'} -o {inf_shuff_out} -R 2 --pop_size 10000
%run {inf_script} --data {sim_bp_out + '.npz'} -o {inf_bp_out} -R 2 --pop_size 10000