diff --git a/.gitignore b/.gitignore index fefab16..f852435 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ PhyloCSF.Linux.x86_64 PhyloCSF.Darwin.x86_64 _build *.native +ForkMaybe.ml diff --git a/.travis-ci.sh b/.travis-ci.sh index 066e31f..d7bf67f 100644 --- a/.travis-ci.sh +++ b/.travis-ci.sh @@ -1,4 +1,4 @@ -OPAM_DEPENDS="batteries gsl ocaml+twt ounit should" +OPAM_DEPENDS="batteries gsl ocaml+twt forkwork ounit should" sudo add-apt-repository -y ppa:avsm sudo apt-get update -qq diff --git a/.travis.yml b/.travis.yml index ab37d14..959dcaf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,4 +3,4 @@ script: bash -ex .travis-ci.sh env: matrix: - SKIP_SLOW=1 - - DUMMY=1 + - FORKWORK=1 diff --git a/src/ForkNo.ml b/src/ForkNo.ml new file mode 100644 index 0000000..4aa2d1e --- /dev/null +++ b/src/ForkNo.ml @@ -0,0 +1,3 @@ +let can_fork = false + +let map ?procs f lst = List.map f lst diff --git a/src/ForkYes.ml b/src/ForkYes.ml new file mode 100644 index 0000000..e6d645e --- /dev/null +++ b/src/ForkYes.ml @@ -0,0 +1,8 @@ +open Batteries + +let can_fork = true + +let map ?(procs=1) f lst = + if procs=1 || List.length lst = 1 then List.map f lst + else + ForkWork.map_list ~maxprocs:procs f lst diff --git a/src/Makefile b/src/Makefile index 3e161b8..95a2cdd 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,12 +1,25 @@ +OCAMLBUILDFLAGS= +ifdef FORKWORK +OCAMLBUILDFLAGS=-tag pkg_forkwork +endif + all: - ocamlbuild PhyloCSF.native + rm -f ForkMaybe.ml +ifdef FORKWORK + ln -s ForkYes.ml ForkMaybe.ml +else + ln -s ForkNo.ml ForkMaybe.ml +endif + ocamlbuild $(OCAMLBUILDFLAGS) PhyloCSF.native -test: all - ocamlbuild testSim.native test.native +test: testexe ./test.native -verbose +testexe: all + ocamlbuild $(OCAMLBUILDFLAGS) testSim.native test.native + clean: rm -f *~ ocamlbuild -clean -.PHONY: all test clean +.PHONY: all test testexe clean diff --git a/src/PhyloCSF.ml b/src/PhyloCSF.ml index ca30168..b8d4270 100644 --- a/src/PhyloCSF.ml +++ b/src/PhyloCSF.ml @@ -39,6 +39,7 @@ let reading_frame = opt ~group ~l:"frames" ~s:'f' ~h:"how many reading frames to let orf_mode = opt ~group ~l:"orf" ~h:"search for ORFs (default AsIs)" (Opt.value_option "AsIs|ATGStop|StopStop|StopStop3|ToFirstStop|FromLastStop|ToOrFromStop" (Some AsIs) (fun s -> match String.lowercase s with "asis" -> AsIs | "atgstop" -> ATGStop | "stopstop" -> StopStop | "stopstop3" -> StopStop3 | "tofirststop" -> ToFirstStop | "fromlaststop" -> FromLastStop | "toorfromstop" -> ToOrFromStop| x -> invalid_arg x) (fun _ s -> sprintf "invalid ORF search mode %s"s)) let min_codons = opt ~group ~l:"minCodons" ~h:"minimum ORF length for searching over ORFs (default 25 codons)" (StdOpt.int_option ~default:25 ()) let print_orfs = opt ~group ~l:"allScores" ~h:"report scores of all regions evaluated, not just the max" (StdOpt.store_true ()) +let procs = opt ~group ~s:'p' ~h:"search frames/ORFs using up to p parallel subprocesses" (StdOpt.int_option ~default: 1 ()) let group = OptParser.add_group opt_parser "output control" let print_bls = opt ~group ~l:"bls" ~h:"include alignment branch length score (BLS) for the reported region in output" (StdOpt.store_true ()) @@ -65,6 +66,16 @@ if Opt.get orf_mode <> AsIs && Opt.get reading_frame = One then eprintf "Warning: --orf with --frames=1; are you sure you don't want to search for ORFs in three or six frames?\n" flush stderr +if Opt.get procs > 1 && not ForkMaybe.can_fork then + eprintf "Warning: ignoring -p, recompile PhyloCSF with: make FORKWORK=1\n" + flush stderr + +(* +if Opt.get procs > 1 && Opt.get orf_mode = AsIs && Opt.get reading_frame = One then + eprintf "Warning: -p isn't useful without --orf and/or --frames\n" + flush stderr +*) + if Opt.get debug then Printexc.record_backtrace true (******************************************************************************) @@ -178,7 +189,7 @@ let find_orfs ?(ofs=0) dna = orfs := if firstorflo = ofs then [firstorf] else [] orfs := if len - lastorfhi <= 3 && firstorf <> lastorf then lastorf :: !orfs else !orfs - !orfs |> List.rev |> List.enum |> filter + !orfs |> List.rev |> List.filter fun (lo,hi) -> assert ((hi-lo+1) mod 3 = 0) assert ((lo-ofs) mod 3 = 0) @@ -191,9 +202,9 @@ let candidate_regions dna rcdna = (if Opt.get reading_frame <> One then [ (false,1,hi); (false,2,hi) ] else []); (if Opt.get reading_frame = Six then - [ (true,0,hi); (true,1,hi); (true,2,hi) ] else []) ] |> List.flatten |> List.enum + [ (true,0,hi); (true,1,hi); (true,2,hi) ] else []) ] |> List.flatten else - let aoeu a = map (fun (b,c) -> a,b,c) + let aoeu a = List.map (fun (b,c) -> a,b,c) let all_orfs = ref [aoeu false (find_orfs ~ofs:0 dna)] if Opt.get reading_frame <> One then all_orfs := aoeu false (find_orfs ~ofs:1 dna) :: !all_orfs @@ -203,7 +214,7 @@ let candidate_regions dna rcdna = all_orfs := aoeu true (find_orfs ~ofs:0 rcdna) :: !all_orfs all_orfs := aoeu true (find_orfs ~ofs:1 rcdna) :: !all_orfs all_orfs := aoeu true (find_orfs ~ofs:2 rcdna) :: !all_orfs - concat (List.enum (List.rev !all_orfs)) + List.rev !all_orfs |> List.flatten let pleaves ?(lo=0) ?hi t leaf_ord aln = let hi = match hi with Some x -> x | None -> String.length aln.(0) - 1 @@ -288,38 +299,56 @@ let process_alignment (nt,t,evaluator) fn = (* generate list of candidate regions within the alignment *) let rgns = candidate_regions aln.(0) rc_aln.(0) + + let rgns = + if Opt.get procs > 1 && ForkMaybe.can_fork then + (* If parallelizing, sort regions longest-to-shortest + in order to optimize utilization *) + List.sort (fun (_,lo1,hi1) (_,lo2,hi2) -> (hi2-lo2) - (hi1-lo1)) rgns + else rgns try - if Enum.is_empty rgns then + if rgns = [] then assert (Opt.get orf_mode <> AsIs) failwith "no sufficiently long ORFs found" - (* evaluate each candidate region *) + (* evaluate each candidate region, perhaps in parallel *) let rgns_scores = - rgns |> Enum.filter_map + rgns |> ForkMaybe.map ~procs:(Opt.get procs) fun (rc,lo,hi) -> try let aln_leaves = Array.of_enum (pleaves ~lo:lo ~hi:hi t leaf_ord (if rc then rc_aln else aln)) let rslt = evaluator aln_leaves - Some (rslt,rc,lo,hi) + `Res (rslt,rc,lo,hi) with | exn -> (* problem evaluating an individual region within the - alignment: complain, but proceed, as maybe some - other region will succeed. *) - printf "%s\texception\t%d\t%d%s\t%s\n" - fn2id fn - lo - hi - if Opt.get reading_frame = Six then (if rc then "\t-" else "\t+") else "" - Printexc.to_string exn + alignment: register complaint, but don't die, as + maybe some other region will succeed. *) + let msg = + sprintf "%s\texception\t%d\t%d%s\t%s" + fn2id fn + lo + hi + if Opt.get reading_frame = Six then (if rc then "\t-" else "\t+") else "" + Printexc.to_string exn if Opt.get debug then - flush stdout - eprintf "%s" (Printexc.get_backtrace ()) - flush stderr - None + `Exn [| msg; Printexc.get_backtrace () |] + else + `Exn [| msg |] - if Enum.is_empty rgns_scores then failwith "no regions successfully evaluated" + let rgns_scores = + rgns_scores |> List.filter_map + function + | `Res x -> Some x + | `Exn [| msg |] -> print_endline msg; flush stdout; None + | `Exn [| msg; bt |] -> + print_endline msg; flush stdout + prerr_endline bt; flush stderr + None + | _ -> assert false + + if rgns_scores = [] then failwith "no regions successfully evaluated" let report_score ty (rslt,rc,lo,hi) = printf "%s\t%s\t%.4f" (fn2id fn) ty rslt.PhyloCSFModel.score @@ -342,10 +371,9 @@ let process_alignment (nt,t,evaluator) fn = foreach (List.enum rslt.PhyloCSFModel.diagnostics) (fun (k,v) -> printf " %s=%s" k v) printf "\n" - Enum.force rgns_scores if Opt.get print_orfs then - Enum.clone rgns_scores |> iter (report_score "orf_score(decibans)") - reduce max rgns_scores |> report_score (if Opt.get orf_mode <> AsIs || Opt.get reading_frame <> One then "max_score(decibans)" else "score(decibans)") + rgns_scores |> List.iter (report_score "orf_score(decibans)") + List.reduce max rgns_scores |> report_score (if Opt.get orf_mode <> AsIs || Opt.get reading_frame <> One then "max_score(decibans)" else "score(decibans)") with | ((Assert_failure _) as exn) -> raise exn (* move on to the next alignment: convergence problems, no ORFs found, etc. *) diff --git a/src/test.ml b/src/test.ml index df6b328..a5d511d 100644 --- a/src/test.ml +++ b/src/test.ml @@ -12,6 +12,9 @@ let slow () = skip_if (try ignore (Sys.getenv "SKIP_SLOW"); true with Not_found (* test results on the three bundled example alignments *) let run_PhyloCSF species params = + let params = + if ForkMaybe.can_fork then params ^ " -p 8" + else params let cmd = sprintf "%s %s %s" fn_PhyloCSF (Filename.concat dn_here ("../PhyloCSF_Parameters/" ^ species)) params let phylocsf_in = Unix.open_process_in ~cleanup:true cmd diff --git a/src/testSim.ml b/src/testSim.ml index 1dd8c68..55ed0e0 100644 --- a/src/testSim.ml +++ b/src/testSim.ml @@ -129,7 +129,7 @@ let mfa headers seqs = Buffer.contents buf let run_phylocsf aln = - let cmd = sprintf "%s %s" fn_exe fp_params + let cmd = sprintf "%s %s%s" fn_exe fp_params (if ForkMaybe.can_fork then " -p 8" else "") let phylocsf_in, phylocsf_out = Unix.open_process ~cleanup:true cmd output_string phylocsf_out aln