# Load Modules

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from multiprocessing import Pool
from tqdm import tqdm
import os
import subprocess
import shutil

# Make 'plotly_white' the default template for every plotly figure.
pio.templates.default = 'plotly_white'
# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Define Paths

In [2]:
# path to the distance-network executable
fn_distbin = "../src/MCD-MotifSearch/src/linkage/bin/distance_network"

# location of avarda (fn_cwd records the working directory at this point)
fn_cwd = os.getcwd()
fn_avarda = "../src/AVARDA/"

# peptide meta
fn_idxmap = "../data/meta/peptide_meta/idx_header_map.tab"
fn_aaseq = "../data/meta/peptide_meta/target_aa.fa"

# enrichment peptides
enr_set = "z10_c8"
fn_enrbool = "../data/enriched/TororoKanunguRound2/enrichments/{}_bool.csv".format(enr_set)
fn_enrpep = "../data/enriched/TororoKanunguRound2/enrichments/{}_peptides.txt".format(enr_set)


# output files
fn_outdir = "../results/AVARDA_{}".format(enr_set)
fn_enrboolav = os.path.join(fn_outdir, "{}_bool.csv".format(enr_set))
fn_enrpepfa = os.path.join(fn_outdir, "{}_peptides.fa".format(enr_set))
fn_enrdist = os.path.join(fn_outdir, "{}_linkage.tab".format(enr_set))
# the variables file name has no placeholder, so it does not depend on enr_set
fn_var = os.path.join(fn_avarda, "input/variables.txt")

## Create Output Directory

In [3]:
# Start every run from an empty output directory: anything left in fn_outdir
# by a previous run is deleted, then the directory is created again.
# os.makedirs also creates missing parents (e.g. ../results/), so the parent
# path no longer has to be hard-coded and kept in sync with fn_outdir.
if os.path.isdir(fn_outdir):
    shutil.rmtree(fn_outdir)
os.makedirs(fn_outdir)

# Create Input Files

## Input Fasta

In [4]:
# fasta format is required for distance calculation

def seqReader(fn):
    """
    Iterate over the records of a FASTA/FASTQ file, optionally gzip-compressed.

    The record length is chosen from the file name: 4 lines when the name
    (ignoring a trailing ``.gz``) ends in ``.fastq`` or ``.fq``, otherwise
    2 lines, i.e. every sequence is expected on a single line.

    Parameters
    ----------
    fn : str
        Path of the file to read. A name ending in ``.gz`` is read via gzip.

    Yields
    ------
    list of str
        The lines of one record with newlines removed. An incomplete record
        at the end of the file is dropped.
    """
    # imported locally: the notebook's import cell does not load gzip
    import gzip

    # Look at the suffix only, so that "gz"/"fq" appearing elsewhere in the
    # path (e.g. in a directory name) cannot change how the file is read.
    is_gz = fn.endswith('.gz')
    base = fn[:-len('.gz')] if is_gz else fn
    n = 4 if base.endswith(('.fastq', '.fq')) else 2

    handle = gzip.open(fn, 'rt') if is_gz else open(fn, 'r')
    with handle as f:
        while True:
            record = []
            for _ in range(n):
                line = next(f, None)
                if line is None:
                    # end of file; a partially read record is discarded
                    return
                record.append(line.strip('\n'))
            yield record

def load_idxmap(fn_idxmap):
    """
    Load the peptide index map as a ``{target: idx}`` dictionary.

    Every non-empty line must hold exactly two tab-separated fields, ``idx``
    followed by ``target``; both are kept as strings. Blank lines are skipped.

    Parameters
    ----------
    fn_idxmap : str
        Path of the tab-separated map file.

    Returns
    -------
    dict
        Maps each target to its idx. A repeated target keeps its last idx.

    Raises
    ------
    ValueError
        If a non-empty line does not split into exactly two fields.
    """
    d = {}
    # read-only mode is enough, and the context manager closes the handle
    with open(fn_idxmap, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            idx, target = line.split("\t")
            d[target] = idx

    return d

def load_aaseq(fn_aaseq):
    """
    Build a ``{header: sequence}`` dictionary from the records of `fn_aaseq`.

    Records come from ``seqReader``; any ``>`` characters are stripped from
    both ends of each header before it is used as the key.
    """
    return {header.strip(">"): seq for header, seq in seqReader(fn_aaseq)}

def load_peptides(fn_enrpep):
    """
    Read the peptide names listed one per line in `fn_enrpep`.

    Lines containing a ``#`` anywhere are skipped; surrounding whitespace is
    removed from the remaining lines.

    Parameters
    ----------
    fn_enrpep : str
        Path of the peptide list.

    Returns
    -------
    list of str
        The peptide names in file order.
    """
    # read-only mode is enough, and the context manager closes the handle
    with open(fn_enrpep, "r") as f:
        return [l.strip() for l in f if "#" not in l]

def write_fa(idxmap, aamap, peps, fn_out):
    """
    Write the given peptides to `fn_out` as two-line FASTA records.

    Parameters
    ----------
    idxmap : dict
        Maps a peptide name to its index.
    aamap : dict
        Maps an index to the amino-acid sequence.
    peps : iterable of str
        Peptide names to write, in output order.
    fn_out : str
        Path of the FASTA file; an existing file is overwritten.

    Raises
    ------
    KeyError
        If a peptide is missing from `idxmap` or its index from `aamap`.
    """
    # the context manager closes the file even when a lookup fails
    with open(fn_out, "w") as f:
        for p in peps:
            tnum = idxmap[p]
            f.write(
                ">{}\n{}\n".format(p, aamap[tnum])
            )
    
# Load the three inputs (independent of each other), then write the fasta
# file holding the sequence of every enriched peptide.
peps = load_peptides(fn_enrpep)
aamap = load_aaseq(fn_aaseq)
idxmap = load_idxmap(fn_idxmap)
write_fa(idxmap=idxmap, aamap=aamap, peps=peps, fn_out=fn_enrpepfa)

# Run Pipelines

## Build Linkage Network

In [5]:
# Run the distance-network binary on the enriched-peptide fasta. The command
# is passed as an argument list without a shell, so paths containing spaces
# or shell metacharacters are handed over unchanged.
p = subprocess.run(
    [fn_distbin, fn_enrpepfa, fn_enrdist, "0"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = p.stdout, p.stderr
if p.returncode != 0:
    # fail here instead of continuing with a missing or partial linkage file
    raise RuntimeError(
        "distance_network failed (exit code {}):\n{}".format(
            p.returncode, stderr.decode(errors="replace")
        )
    )

# reformat linkage network to fit avarda standards: drop the first line
# (presumably a header -- TODO confirm), keep the first two tab-separated
# columns and join them with a comma. The result is written next to the
# target and swapped in with os.replace, so no tmp.txt is left in (or
# clobbered in) the current working directory.
fn_enrdist_tmp = fn_enrdist + ".tmp"
with open(fn_enrdist, "r") as f_in, open(fn_enrdist_tmp, "w") as f_out:
    next(f_in, None)
    for line in f_in:
        fields = line.rstrip("\n").split("\t")
        f_out.write(",".join(fields[:2]) + "\n")
os.replace(fn_enrdist_tmp, fn_enrdist)

## Move Bool Matrix and Format

In [6]:
# Prepend the peptide names as the first column of the bool matrix by pasting
# the two files side by side with ',' as the separator. The arguments are
# passed as a list without a shell, and paste's output is written straight
# into the destination file.
with open(fn_enrboolav, "wb") as f_out:
    p = subprocess.run(
        ["paste", "-d", ",", fn_enrpep, fn_enrbool],
        stdout=f_out,
        stderr=subprocess.PIPE,
    )
# stdout went to the file, so nothing is captured for it
stdout, stderr = b"", p.stderr
if p.returncode != 0:
    # fail here instead of handing an incomplete matrix to AVARDA
    raise RuntimeError(
        "paste failed (exit code {}):\n{}".format(
            p.returncode, stderr.decode(errors="replace")
        )
    )

## Create AVARDA params file

In [7]:
def write_params(fn_var, fn_enrbool, fn_enrdist, dir_result=None):
    """
    Write the AVARDA variables file as one ``key=value`` line per setting.

    Parameters
    ----------
    fn_var : str
        Path of the variables file; an existing file is overwritten.
    fn_enrbool : str
        Enrichment matrix written (as an absolute path) to ``zscore_file``.
    fn_enrdist : str
        Linkage file written (as an absolute path) to ``linkage``.
    dir_result : str, optional
        Directory written (as an absolute path) to ``dir_result``. Defaults
        to the notebook-level ``fn_outdir``.
    """
    if dir_result is None:
        dir_result = fn_outdir

    params = {
        "dir_home" : "../src/AVARDA",
        "file_annotation" : "VirScan_v1_annot.txt",
        "file_aln" : "aln_sparse.pkl",
        "zscore_file" : os.path.abspath(fn_enrbool),
        "linkage" : os.path.abspath(fn_enrdist),
        "dir_result" : os.path.abspath(dir_result),
        "use_filter" : "yes",
        "Z_threshold" : 1,
        "p_threshold" : 0.01,
        "x_threshold" : 2,
        "bh_threshold" : 0.05

    }
    # the context manager closes the file even if a write fails
    with open(fn_var, "w") as f:
        for key, value in params.items():
            f.write("{}={}\n".format(key, value))


# zscore_file has to be the peptide-prefixed matrix (fn_enrboolav), not the
# raw fn_enrbool; it is now passed explicitly instead of being read from a
# global inside the function.
write_params(fn_var, fn_enrboolav, fn_enrdist, dir_result=fn_outdir)

## Run AVARDA

In [8]:
# avarda requires a specific path placement to run: main.py is started from
# AVARDA's bin/ directory. cwd= applies that directory to the child process
# only, so this notebook's own working directory never changes -- not even
# when the run raises.
p = subprocess.run(
    ["python3", "main.py"],
    cwd=os.path.join(fn_avarda, "bin"),
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
if p.returncode != 0:
    # surface the failure together with AVARDA's own error output
    raise RuntimeError(
        "AVARDA failed (exit code {}):\n{}".format(
            p.returncode, p.stderr.decode(errors="replace")
        )
    )