In [1]:
nato = "whisky foxtrot tango"

In [2]:
w, t, f = (_ for _ in nato.split())

In [3]:
t

'foxtrot'

In [13]:
import re
from sortedcontainers import SortedDict

def cluster(feature_f: str, clusters: int, min_length: int = 10000):  # $gc_out, $bins
    gc_hash: Dict[int, int] = dict()
    cut_off_points = list()
    num_of_seq = 0
    total_length = 0
    header_to_cod_GC = dict()

    with open(feature_f, "r") as GC:
        # read in probuild output, line by line.  Should be fasta input.
        for line in GC:
            # if the line is a fasta header in the form of '>(Reference sequence name)\t(number) (number)
            # if (text := re.search(pattern="^>(.*?)\t(\d+)\s+(\d+)", string=line)): # switch to this?  only support python>=3.8?
            text = re.search(pattern="^>(.*?)\t(\d+)\s+(\d+)", string=line)
            if text:
                header = text.group(1)  # Reference name
                length = int(text.group(2))  # length of sequence?
                GC = int(text.group(3))  # must be GC percentage
                header_re = re.search(
                    pattern="^(.*?)\t", string=line
                )  # Dont get this one - didn't we already extract just this capture group?
                if header_re:
                    header = header_re.group(1)
                header_to_cod_GC[header] = GC
                num_of_seq += 1
                total_length += length
                if GC in gc_hash:
                    gc_hash[GC] += length
                else:
                    gc_hash[GC] = length

    sorted_GC = SortedDict(gc_hash)  # sort the gc_hash dictionary by keys
    min_GC = sorted_GC.values()[0]
    max_GC = sorted_GC.values()[-1]
    print(f"min_GC={min_GC} max_GC={max_GC} total_seq_length={total_length}\n")

    previous = 0
    for key in sorted_GC:
        gc_hash[key] += previous

        if previous < total_length / 3 and gc_hash[key] >= total_length / 3:
            one_third = key
        if previous < total_length / 3 * 2 and gc_hash[key] >= total_length / 3 * 2:
            two_third = key
        if previous < total_length / 2 and gc_hash[key] >= total_length / 2:
            one_half = key
        previous = gc_hash[key]
        # TODO: uncomment when we have logging fixed
        # log.info(f"({one_third})->({gc_hash[one_third]})\n")
        # log.info(f"({one_half})->({gc_hash[one_half]})\n")
        # log.info(f"({two_third})->({gc_hash[two_third]})\n")

    if clusters == 0:
        # cluster number is not specified by user
        # automatically choose cluster number.
        if two_third - one_third > 3:
            clusters = 3
        else:
            clusters = 1
    if clusters == 3:
        if (
            (two_third - one_third) < 1
            or (max_GC - two_third) < 1
            or (one_third - min_GC) < 1
        ):
            # &Log( "Total number of sequences is not enough for training in 3 clusters!\n" )
            clusters = 1
        else:
            if gc_hash[one_third] > min_length:
                cut_off_points.extend((min_GC, one_third, two_third, max_GC))
            else:
                # &Log( "Total length of sequences is not enough for training in 3 clusters!\n" )
                clusters = 2

    if clusters == 2:
        if gc_hash[one_half] > min_length:
            cut_off_points.extend((min_GC, one_half, max_GC))
        else:
            # &Log( "Total length of sequences is not enough for training in 2 clusters!\n" )
            pass

    if clusters == 1:
        cut_off_points.extend((min_GC, max_GC))
    return clusters, cut_off_points, header_to_cod_GC


In [14]:
gc_out = "initial.meta.list.feature"
clusters = 3
min_length: int = 10000

In [16]:
bin_num, cutoffs, seq_GC = cluster(feature_f=gc_out, clusters=clusters)

min_GC=232 max_GC=2039 total_seq_length=9926700



In [17]:
bin_num

1

In [18]:
cutoffs

[232, 2039]

In [19]:
seq_GC

{'>98_1132_1457_1035_31.3:NM_001020461 hypothetical_protein': 31,
 '>256_1713_1713_1458_34.2:NM_001020458 membrane_transporter__predicted_': 34,
 '>130_684_742_555_35.1:NM_001020454 isomerase': 35,
 '>365_1153_1471_789_33.7:NM_001020453 GDT1_like_protein': 34,
 '>138_1220_1372_1083_37.8:NM_001020452 L_asparaginase__predicted_': 38,
 '>185_1183_1294_999_36.3:NM_001020451 hydroxyacid_dehydrogenase__predicted_': 36,
 '>231_1211_1498_981_35.7:NM_001020450 cell_surface_glycoprotein__predicted___DIPSY_family': 36,
 '>74_1825_1907_1752_38.0:NM_001020449 amidase__predicted_': 38,
 '>95_1378_2086_1284_34.6:NM_001020448 nitric_oxide_dioxygenase__predicted_': 35,
 '>1_1986_2109_1986_37.0:NM_001020447 urea_transporter__predicted_': 37,
 '>318_2840_3200_2523_35.2:NM_001020445 sulfate_transporter__predicted_': 35,
 '>16_627_732_612_36.2:NM_001020444 HHE_domain_cation_binding_protein__predicted_': 36,
 '>15_1325_1469_1311_34.9:NM_001020443 alpha_galactosidase__melibiase': 35,
 '>316_1008_1363_693_35.

In [20]:
type(bin_num)

int

In [21]:
type(cutoffs)

list

In [22]:
type(seq_GC)

dict

In [23]:
metaout = "meta.lst"
logfile = "gms.log"
seq = "sequence"
start_prefix = "startseq."
gibbs_prefix = "gibbs_out."
mod_prefix = "itr_"
mod_suffix = ".mod"
hmmout_prefix = "itr_"
hmmout_suffix = ".lst"
out_name = "GeneMark_hmm.mod"
out_name_heuristic = "GeneMark_hmm_heuristic.mod"
out_suffix = "_hmm.mod"
out_suffix_heu = "_hmm_heuristic.mod"
fnn_out = ""
faa_out = ""

meta_out = "initial.meta.lst"
gc_out = f"{meta_out}.feature"

In [25]:
import sys
import os
import click
import re
import logging
from typing import Optional, List, Tuple, Dict
import logging
from subprocess import run
import pyfaidx
from Bio.SeqUtils import GC as getGC
from sortedcontainers import SortedDict
from tempfile import TemporaryDirectory

In [34]:
os.path.abspath("../utilities/probuild")

'/Users/milessmith/workspace/pygmst/pygmst/utilities/probuild'

In [30]:
probuild = os.path.abspath("../utilities/probuild")
gmhmmp = os.path.abspath("../utilities/gmhmmp")
meta_model = os.path.abspath("../models/MetaGeneMark_v1.mod")

In [35]:
def train(
    input_seq: str = "test.fa",
    seq: str = "sequence",
    motif: bool = True,
    fixmotif: bool = True,
    order: int = 4,
    order_non: int = 2,
    start_prefix: str  = "startseq.",
    gibbs_prefix: str = "itr_",
    prestart: int = 6,
    width: int = 12,
    build_cmd: str = f"{probuild} --par par_1.default",
    hmm_cmd: str = f"{gmhmmp} -s -d",
    par: str = "par_1.default",
    maxitr: int = 10,
    identity: float = 0.99,
) -> Tuple[str, List[str]]:
    tmp_files: List[str] = list()
    # ------------------------------------------------
    # prepare sequence
    # build_cmd should have the form of! `probuild --par par_1.default`
    # so this makes the below
    # `probuild --par par_1.default --clean_join sequence --seq test.fa`
    # which seems kind of redundant, but what do I know
    run(f"{build_cmd} --clean_join {seq} --seq {input_seq}".split())
    # TODO: another thing to restore when we figure out logging.
    # run([build_cmd, "--clean_join", seq, "--seq", input_seq, "--log", logfile])

    # ------------------------------------------------
    # tmp solution: get sequence size, get minimum sequence size from --par <file>
    # compare, skip iterations if short

    with open(seq, "r") as _:
        sequence_size = len(_.read())

    min_size_find_cmd = run(args=[f"grep MIN_SEQ_SIZE {par}".split()], capture_output=True)
    minimum_sequence_size = re.findall(
        pattern="\s*--MIN_SEQ_SIZE\s+", string=str(min_size_find_cmd.stdout, "utf=8")
    )[0]

    do_iterations = True

    if sequence_size < minimum_sequence_size:
        # form of! `probuild --par par_1.default --clean_join sequence --seq test.fa --MIN_CONTIG_SIZE 0 --GAP_FILLER
        run(f"{build_cmd} --clean_join {seq} --seq {input_seq} --MIN_CONTIG_SIZE 0 --GAP_FILLER".split())
        # run([build_cmd, "--clean_join", seq, "--seq", input_seq, "--log", logfile, "--MIN_CONTIG_SIZE", 0, "--GAP_FILLER"])
        do_iterations = False

    # Log("do_iterations = $do_iterations\n")

    # ------------------------------------------------
    # run initial prediction
    itr = 0
    next_item = f"{hmmout_prefix}{itr}{hmmout_suffix}"
    print("run initial prediction")

    # form of! `gmhmmp -s -d sequence -m MetaGeneMark_v1.mod -o itr_{itr}.lst`
    run(f"{hmm_cmd} {seq} -m {os.path.abspath('../models/MetaGeneMark_v1.mod')} -o {next_item}".split())
    # TODO: tempfile
    tmp_files.extend((next_item))

    # ------------------------------------------------
    # enter iterations loop

    # TODO: logfile
    # &Log( "entering iteration loop\n" );

    while do_iterations:
        itr += 1
        mod = f"{mod_prefix}{itr}{mod_suffix}"

        if motif and not fixmotif:
            start_seq = f"{start_prefix}{itr}"
            gibbs_out = f"{gibbs_prefix}{itr}"

        command = f"{build_cmd} --mkmod {mod} --seq {seq} --geneset {next_item} --ORDM {order} --order_non {order_non} --revcomp_non 1"

        if motif and not fixmotif:
            command = f"{command} --pre_start {start_seq} --PRE_START_WIDTH {prestart}"
        elif motif and fixmotif:
            command = (
                f"{command} --fixmotif --PRE_START_WIDTH {prestart} --width {width}"
            )
            # command += f" --fixmotif --PRE_START_WIDTH {prestart} --width {width} --log {logfile}"

        print(f"build model: {mod} for iteration: {itr}")
        run(command.split())
        tmp_files.extend((mod))

        if (
            motif and not fixmotif
        ):  # given that we only have Gibbs3, don't *really* have an option
            # if $gibbs_version == 1:
            #     &RunSystem( "$gibbs $start_seq $width -n > $gibbs_out", "run gibbs sampler\n" )
            # elif $gibbs_version == 3:
            #     &RunSystem( "$gibbs3 $start_seq $width -o $gibbs_out -F -Z  -n -r -y -x -m -s 1 -w 0.01", "run gibbs3 sampler\n" )
            print("run gibbs3 sampler")
            # form of! 'Gibbs3 startseq.{itr} 12 -o gibbs_out.{itr} -F -Z -n -r -y -x -m -s 1 -w 0.01"
            run(f"{gibbs3} {start_seq} {width} -o {gibbs_out} -F -Z -n -r -y -x -m -s 1 -w 0.01".split())
            tmp_files.extend((start_seq))

            print("make prestart model")
            # TODO: logfile
            # run([build_cmd, "--gibbs", gibbs_out, "--mod", mod, "--seq", start_seq, "--log", logfile])
            # form of! `probuild --par par_1.default --gibs gibbs_out.{itr} --mod itr_{itr}.mod --seq startseq.{itr}
            run(f"{build_cmd} --gibbs {gibbs_out} --mod {mod} --seq {start_seq}".split())
            tmp_files.extend((gibbs_out))

        prev = next_item
        # next_item = itr_{itr}.lst
        next_item = f"{hmmout_prefix}{itr}{hmmout_suffix}"

        # form of! `gmhmmp -s d sequence -m itr_{itr}.mod -o itr_{itr}.lst
        command = f"{hmm_cmd} {seq} -m {mod} -o {next_item}"

        # form of! `gmhmmp -s d sequence -m itr_{itr}.mod -o itr_{itr}.lst -r
        if motif:
            command += " -r"

        print(f"prediction, iteration: {itr}")
        run(command.split())
        tmp_files.extend((next_item))

        # `probuild --par par_1.default --compare --source itr_{itr}.lst --target itr_{itr-1}.lst
        command = f"{build_cmd} --compare --source {next_item} --target {prev}"
        # &Log( "compare:\n" . $command . "\n" );

        diff = str(run(command.split(), capture_output=True).stdout).strip()
        # &Log( "compare $prev and $next_item: $diff\n" );

        if diff >= identity:
            # &Log( "Stopped iterations on identity: $diff\n" );
            do_iterations = False
        if itr == maxitr:
            # &Log( "Stopped iterations on maximum number: $maxitr\n" )
            do_iterations = False
    return mod, tmp_files

In [36]:
list_of_temp = list()
model, list_of_temp = train()

OSError: [Errno 8] Exec format error: '../utilities/probuild'