diff --git a/DNAnexus/Makefile b/DNAnexus/Makefile new file mode 100644 index 0000000..6f9e34c --- /dev/null +++ b/DNAnexus/Makefile @@ -0,0 +1,15 @@ +all: default + +default: + rm -rf resources/PhyloCSF* + mkdir -p resources + cp -r ../PhyloCSF ../PhyloCSF.Linux.x86_64 ../PhyloCSF_Parameters resources + $(MAKE) -C src all + +clean: + $(MAKE) -C src clean + +test: all + test/test.sh + +.PHONY: all default clean test diff --git a/DNAnexus/Readme.developer.md b/DNAnexus/Readme.developer.md new file mode 100644 index 0000000..7a0f176 --- /dev/null +++ b/DNAnexus/Readme.developer.md @@ -0,0 +1,35 @@ +# PhyloCSF Developer Readme + + + +## Running this app with additional computational resources + +This app has the following entry points: + +* main +* process +* postprocess + +When running this app, you can override the instance type to be used for each +entry point by providing the ``systemRequirements`` field to +```/applet-XXXX/run``` or ```/app-XXXX/run```, as follows: + + { + systemRequirements: { + "main": {"instanceType": "dx_m1.large"}, + "process": {"instanceType": "dx_m1.large"}, + "postprocess": {"instanceType": "dx_m1.large"} + }, + [...] + } + +See Run +Specification in the API documentation for more information about the +available instance types. diff --git a/DNAnexus/Readme.md b/DNAnexus/Readme.md new file mode 100644 index 0000000..8a4a00e --- /dev/null +++ b/DNAnexus/Readme.md @@ -0,0 +1,51 @@ + +# PhyloCSF (DNAnexus Platform App) + +Phylogenetic analysis of multi-species genome sequence alignments to identify conserved protein-coding regions + +This is the source code for an app that runs on the DNAnexus Platform. +For more information about how to run or modify it, see +http://wiki.dnanexus.com/. + + + + + + + + +## Inputs + +* **alignments** the ``CrossSpeciesAlignments`` to evaluate, usually generated by the MAF Stitcher app +* **output_name** name of the output table +* **alignments_per_job** controls the degree of parallelization +* **species_set** usually auto-detected for alignments produced by the MAF Stitcher app +* **frames** search over three or six reading frames +* **orf** search over ORFs within each alignment +* **min_codons** minimum length considered in ORF search mode (with ``orf != AsIs``) + +## Advanced Inputs + +* **strategy** model and parameter optimization strategy +* **all_scores** report all scores computed (with ``frames != 1`` or ``orf != AsIs``) +* **bls** include a Branch Length Score along with PhyloCSF score in output table +* **anc_comp** include an ancestral composition score along with PhyloCSF score in output table +* **dna** include evaluated DNA sequence in output table +* **aa** include evaluated amino acid sequence in output table +* **instance_type** instance type for parallel jobs + +## Outputs + +* **scores** +* **errors** number of alignments without scores (see job log for details) diff --git a/DNAnexus/dxapp.json b/DNAnexus/dxapp.json new file mode 100644 index 0000000..0f8fdd5 --- /dev/null +++ b/DNAnexus/dxapp.json @@ -0,0 +1,137 @@ +{ + "name": "PhyloCSF", + "title": "PhyloCSF", + "summary": "Phylogenetic analysis of multi-species genome sequence alignments to identify conserved protein-coding regions", + "dxapi": "1.0.0", + "version": "0.1.0", + "inputSpec": [ + { + "name": "alignments", + "class": "gtable", + "type": "CrossSpeciesAlignments", + "optional": false, + "help": "Alignments to evaluate, usually generated by the MAF Stitcher app" + }, + { + "name": "output_name", + "class": "string", + "optional": true, + "help": "Name of output table" + }, + { + "name": "alignments_per_job", + "class": "int", + "optional": true, + "default": 500, + "help": "Degree of parallelization" + }, + { + "name": "species_set", + "class": "string", + "choices": ["12flies", "29mammals"], + "optional": true, + "help": "Usually auto-detected for alignments produced by the MAF Stitcher app" + }, + { + "name": "frames", + "class": "int", + "choices": [1,3,6], + "optional": true, + "default": 1, + "help": "Search over three or six reading frames" + }, + { + "name": "orf", + "class": "string", + "choices": ["AsIs", "ATGStop", "StopStop", "StopStop3", "ToFirstStop", "FromLastStop", "ToOrFromStop"], + "optional": true, + "default": "AsIs", + "help": "Search over ORFs within each alignment (usually with frames=3 or frames=6)" + }, + { + "name": "min_codons", + "class": "int", + "optional": true, + "default": 25, + "help": "Minimum length considered in ORF search mode (with orf != AsIs)" + }, + { + "name": "strategy", + "class": "string", + "choices": ["mle", "fixed", "omega"], + "optional": true, + "default": "mle", + "group": "Advanced", + "help": "Model and parameter optimization strategy" + }, + { + "name": "all_scores", + "class": "boolean", + "optional": true, + "default": false, + "group": "Advanced", + "help": "Report all scores computed (with frames != 1 or orf != AsIs)" + }, + { + "name": "bls", + "class": "boolean", + "optional": true, + "default": false, + "group": "Advanced", + "help": "Include a Branch Length Score along with PhyloCSF score in output table" + }, + { + "name": "anc_comp", + "class": "boolean", + "optional": true, + "default": false, + "group": "Advanced", + "help": "Include an ancestral composition score along with PhyloCSF score in output table" + }, + { + "name": "dna", + "class": "boolean", + "optional": true, + "default": false, + "group": "Advanced", + "help": "Include evaluted DNA sequence in output table" + }, + { + "name": "aa", + "class": "boolean", + "optional": true, + "default": false, + "group": "Advanced", + "help": "Include evaluated amino acid sequence in output table" + }, + { + "name": "instance_type", + "class": "string", + "choices": ["dx_m1.large", "dx_c1.xlarge"], + "optional": true, + "group": "Advanced", + "help": "Instance type for parallel jobs" + } + ], + "outputSpec": [ + { + "name": "scores", + "class": "gtable" + }, + { + "name": "errors", + "class": "int" + } + ], + "runSpec": { + "interpreter": "bash", + "file": "src/run.sh", + "execDepends": [ {"name": "gsl-bin"} ] + }, + "details": { + "upstreamUrl": "https://github.com/mlin/PhyloCSF", + "citations": [ + "doi:10.1093/bioinformatics/btr209" + ] + } +} diff --git a/DNAnexus/src/Makefile b/DNAnexus/src/Makefile new file mode 100644 index 0000000..1e31d73 --- /dev/null +++ b/DNAnexus/src/Makefile @@ -0,0 +1,12 @@ +all: + mkdir -p ../resources + $(MAKE) ../resources/dxPhyloCSF + +../resources/dxPhyloCSF: _tags dxPhyloCSF.ml + ocamlbuild -use-ocamlfind dxPhyloCSF.native + mkdir -p ../resources + cp -f dxPhyloCSF.native ../resources/dxPhyloCSF + +clean: + ocamlbuild -clean + rm -f ../resources/dxPhyloCSF diff --git a/DNAnexus/src/_tags b/DNAnexus/src/_tags new file mode 100644 index 0000000..15164b1 --- /dev/null +++ b/DNAnexus/src/_tags @@ -0,0 +1,2 @@ +<*.ml{,i}>: pp(ocaml+twt) +true: package(DNAnexus), package(forkwork), thread, debug diff --git a/DNAnexus/src/dxPhyloCSF.ml b/DNAnexus/src/dxPhyloCSF.ml new file mode 100644 index 0000000..1655721 --- /dev/null +++ b/DNAnexus/src/dxPhyloCSF.ml @@ -0,0 +1,316 @@ +open Batteries +open Printf +open JSON.Operators +open DNAnexus + +Printexc.record_backtrace true + +(* abbreviated JSON casts *) +module J = struct + include JSON + let of_str_list lst = of_list (List.map (fun x -> `String x) lst) + let of_str_assoc lst = of_assoc (List.map (fun (k,v) -> k, `String v) lst) + +(* return a list of (starting,limit) tuples *) +let partition_rows length rows_per_job = + if length = 0 then [] + else + (* make distribution more uniform *) + let jobs = 1 + (length - 1) / rows_per_job + let k = int_of_float (ceil (float length /. float jobs)) + (0 --^ jobs) /@ (fun i -> (i*k), (min k (length-i*k))) |> List.of_enum + +let make_scores_gtable name app_id input = + let optional_colspecs = + List.filter_map + fun (k,ty) -> + if J.bool (input$k) then Some (k,ty) + else None + ["bls",`Float; "anc_comp",`Float; "dna",`String; "aa",`String] + let frames = J.int (input$"frames") + let optional_colspecs = + if frames <> 6 then optional_colspecs + else ("strand",`String) :: optional_colspecs + let optional_colspecs = + if frames = 1 && J.string (input$"orf") = "AsIs" then optional_colspecs + else ("lo",`Int64) :: ("hi",`Int64) :: optional_colspecs + GTable.make_new (J.of_assoc [ + "name", `String name; + "columns", GTable.json_of_columns (Array.of_list (("name",`String) :: ("score",`Float) :: optional_colspecs)); + "indices", GTable.json_of_indices ["name", `Lexicographic ["name",`Asc,None]]; + "details", J.of_assoc [ + "PhyloCSF", make_link app_id; + "PhyloCSF_input", input + ] + ]) + +let get_species_names alignments_table = + try + let alignments_cols = GTable.columns alignments_table + let n = Array.length alignments_cols + assert (n >= 4) + Array.sub alignments_cols 2 (n-2) |> Array.map + function + | (sp,`String) when sp <> "" -> sp + | _ -> assert false + with + | exn -> + eprintf "%s\n%s\n" (Printexc.to_string exn) (Printexc.get_backtrace ()) + raise (DNAnexus.AppInternalError "Could not understand input alignments table. Please make sure it was generated with the maf_stitcher app.") + +type configuration = { + strategy : string; + frames : int; + orf : string; + min_codons : int; + all_scores : bool; + bls : bool; + anc_comp : bool; + dna : bool; + aa : bool +} + +let run_PhyloCSF species_set species_names alignment_row cfg = + let n = Array.length species_names + assert (Array.length alignment_row = n+3) + let alignment_name = match alignment_row.(1) with + | `String nm -> nm + | _ -> assert false + let seqs = + Array.sub alignment_row 3 n |> Array.map + function + | `String sp -> sp + | _ -> assert false + + (* Formulate PhyloCSF command line and start process *) + (* TODO: keep PhyloCSF online to avoid reinstantiating phylo-models for every alignment *) + (* TODO: support --species *) + let cmd = + sprintf "/PhyloCSF %s --removeRefGaps --strategy=%s --frames=%d --orf=%s --minCodons=%d%s%s%s%s%s" + species_set + cfg.strategy + cfg.frames + cfg.orf + cfg.min_codons + if cfg.all_scores then " --allScores" else "" + if cfg.bls then " --bls" else "" + if cfg.anc_comp then " --ancComp" else "" + if cfg.dna then " --dna" else "" + if cfg.aa then " --aa" else "" + + let phylocsf_out, phylocsf_in = Unix.open_process cmd + + (* write Multi-FASTA alginment into PhyloCSF *) + for i = 0 to n-1 do + fprintf phylocsf_in ">%s\n" species_names.(i) + fprintf phylocsf_in "%s\n" seqs.(i) + flush phylocsf_in + close_out phylocsf_in + + (* collect output *) + let result = IO.read_all phylocsf_out + + match Unix.close_process (phylocsf_out, phylocsf_in) with + | Unix.WEXITED 0 -> () + | _ -> + printf "%s\n" result + raise (DNAnexus.AppInternalError ("PhyloCSF exited abnormally; see the job log for more details.")) + + (* for each output line *) + List.of_enum + IO.lines_of (IO.input_string result) |> Enum.filter_map + fun line -> + (* ignore comments *) + if String.length line = 0 || line.[0] = '#' then + print_endline line + None + else + (* parse line, extract score *) + let flds = Array.of_list (String.nsplit line "\t") + assert (Array.length flds >= 2) + assert (flds.(1) = "score(decibans)" || flds.(1) = "orf_score(decibans)" || flds.(1) = "max_score(decibans)") + if cfg.all_scores && flds.(1) = "max_score(decibans)" then (* these are redundant *) + None + else + let sc = float_of_string flds.(2) + (* extract optional outputs *) + let i = ref 2 + let extras = ref [] + (* lo & hi *) + if cfg.frames <> 1 || cfg.orf <> "AsIs" then + incr i + extras := `Int (int_of_string flds.(!i)) :: !extras + incr i + extras := `Int (int_of_string flds.(!i)) :: !extras + (* strand *) + if cfg.frames = 6 then + incr i + extras := `String flds.(!i) :: !extras + (* other goodies *) + ([cfg.bls, `Float; cfg.anc_comp, `Float; cfg.dna, `String; cfg.aa, `String]) |> List.iter + fun (b,ty) -> + if b then + incr i + match ty with + | `Float -> extras := `Float (float_of_string flds.(!i)) :: !extras + | `String -> extras := `String flds.(!i) :: !extras + (* make output GTable row *) + Some (Array.of_list (`String alignment_name :: `Float sc :: (List.rev !extras))) + +(* main job: instantiate the job tree *) +let main input = + let alignments_table = GTable.bind_link (input$"alignments") + let alignments_desc = GTable.describe alignments_table + + (* resolve which parameter set to use *) + let species_set = + if input$?"species_set" then J.string (input$"species_set") + else + let alignments_deets = GTable.get_details alignments_table + let alignments_species = + try + J.string (alignments_deets$"maf_stitcher_input"$"species_set") + with _ -> + raise (AppError "Please provide the species_set input (could not detect which PhyloCSF parameters to use for these alignments)") + match alignments_species with + | "hg19_29mammals" -> "29mammals" + | "dm3_12flies" -> "12flies" + | _ -> raise (AppError (sprintf "No PhyloCSF parameters available for maf_stitcher species set: %s. Regenerate the alignments with a supported species set, or override this check by providing the species_set input" alignments_species)) + + (* verify the species names can be extracted from the alignments table. + TODO: also verify that the obtained species names match those supported by species_set *) + ignore (get_species_names alignments_table) + + (* get own app[let] ID, for details breadcrumbs *) + let job_desc = api_call [job_id (); "describe"] J.empty + let app_id = + if job_desc$?"app" then J.string (job_desc$"app") + else J.string (job_desc$"applet") + + (* initialize output GTable *) + let output_name = + if input$?"output_name" then J.string (input$"output_name") + else J.string (alignments_desc$"name") ^ " PhyloCSF" + let output_gtable = make_scores_gtable output_name app_id input + + (* partition alignments and launch subjobs *) + let options = + if not (input$?"instance_type") then JSON.empty + else + J.of_assoc [ + "systemRequirements", J.of_assoc [ + "process", J.of_assoc [ + "instanceType", (input$"instance_type") + ] + ] + ] + let subjobs = + partition_rows (JSON.int (alignments_desc$"length")) (JSON.int (input$"alignments_per_job")) |> List.map + fun (starting, limit) -> + let subjob = J.of_assoc [ + "species_set", `String species_set; + "starting", `Int starting; + "limit", `Int limit; + "scores", `String (GTable.id output_gtable) + ] + new_job ~options "process" (input $+ ("process",subjob)) + + (* set up postprocessing job and output *) + let postprocess_input = + (input + $+ ("postprocess", J.of_assoc [ + "scores", `String (GTable.id output_gtable); + "errors", J.of_list (List.map (fun subjob -> make_jobref subjob "errors") subjobs)])) + let postprocess = new_job "postprocess" postprocess_input + + J.of_assoc [ + "scores", make_link (GTable.id output_gtable); + "errors", make_jobref postprocess "errors" + ] + +(* subjob: process some of the alignments *) +let process input = + let alignments_table = GTable.bind_link (input$"alignments") + let species_names = get_species_names alignments_table + let starting = J.int (input$"process"$"starting") + let limit = J.int (input$"process"$"limit") + let scores_table = GTable.bind (None,J.string (input$"process"$"scores")) + + (* get configuration options *) + let cfg = { + strategy = J.string (input$"strategy"); + frames = J.int (input$"frames"); + orf = J.string (input$"orf"); + min_codons = J.int (input$"min_codons"); + all_scores = J.bool (input$"all_scores"); + bls = J.bool (input$"bls"); + anc_comp = J.bool (input$"anc_comp"); + dna = J.bool (input$"dna"); + aa = J.bool (input$"aa") + } + + let worker_process alignment = + try + run_PhyloCSF (J.string (input$"process"$"species_set")) species_names alignment cfg + with + | DNAnexus.AppError msg -> + raise + ForkWork.ChildExn [ + "AppError"; J.string ((GTable.json_of_row alignment)$@1); msg + ] + | exn -> + raise + ForkWork.ChildExn [ + "AppInternalError"; + J.string ((GTable.json_of_row alignment)$@1); + Printexc.to_string exn; + Printexc.get_backtrace () + ] + + let fw = ForkWork.manager () + let rows = GTable.iterate_rows ~starting ~limit alignments_table + let promises = rows /@ (ForkWork.fork fw worker_process) + Enum.force promises + let errors = + promises |> Enum.fold + fun n ans -> + match ForkWork.await_result fw ans with + | `OK [] -> n+1 + | `OK scores -> + scores |> List.iter (fun score_row -> GTable.add_row scores_table score_row) + n + | `Exn ["AppError"; name; msg] -> + eprintf "AppError on %s: %s\n" name msg + raise (DNAnexus.AppError (sprintf "%s: %s" name msg)) + | `Exn ["AppInternalError"; name; msg; bt] -> + eprintf "AppInternalError on %s: %s\n" name msg + if bt <> "" then eprintf "%s\n" bt + raise (DNAnexus.AppInternalError (sprintf "%s: %s" name msg)) + | _ -> assert false + 0 + + GTable.flush_rows scores_table + J.of_assoc ["errors", `Int errors] + +(* finish-up job: collect stats & initiate GTable closure *) +let postprocess input = + let gtable = GTable.bind (None, JSON.string (input$"postprocess"$"scores")) + + let stats = + Enum.fold + fun failures j -> failures + JSON.int j + 0 + Vect.enum (JSON.array (input$"postprocess"$"errors")) + let stats_json = J.of_assoc ["errors", `Int stats] + + GTable.set_details gtable (GTable.get_details gtable $+ ("PhyloCSF_stats", stats_json)) + + GTable.close gtable + J.of_assoc ["scores", make_link (GTable.id gtable); "errors", `Int stats] + +(* entry point *) +job_main + fun input -> + if input $? "process" then process input + else if input $? "postprocess" then postprocess input + else main input diff --git a/DNAnexus/src/run.sh b/DNAnexus/src/run.sh new file mode 100755 index 0000000..536016e --- /dev/null +++ b/DNAnexus/src/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script simply launches your C++ executable and contains 3 entry +# points. + +main() { + /dxPhyloCSF +} + +process() { + /dxPhyloCSF process +} + +postprocess() { + /dxPhyloCSF postprocess +} diff --git a/DNAnexus/test/test.sh b/DNAnexus/test/test.sh new file mode 100755 index 0000000..b16d11b --- /dev/null +++ b/DNAnexus/test/test.sh @@ -0,0 +1,64 @@ +#!/bin/bash -e + +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ] ; do SOURCE="$(readlink "$SOURCE")"; done +HERE="$( cd -P "$( dirname "$SOURCE" )" && pwd )" + +function test_in_project { + dx-build-applet $HERE/.. -d $TEST_PROJECT:/ + dx="dx --project-context-id=$TEST_PROJECT" + job1=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + --delay-workspace-destruction -y --brief) + job2=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + -i bls=true -i dna=true \ + --delay-workspace-destruction -y --brief) + job3=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/tal-AA dm3_12flies alignments" \ + -i frames=6 -i all_scores=true -i bls=true -i anc_comp=true -i dna=true -i aa=true \ + --delay-workspace-destruction -y --brief) + job4=$($dx run :/PhyloCSF -i alignments="PhyloCSF development:/test data/ALDH2_exons hg19_29mammals alignments" \ + -i strategy=fixed -i orf=ATGStop -i frames=6 -i bls=true -i anc_comp=true -i aa=true \ + --delay-workspace-destruction -y --brief) + job5=$($dx run :/PhyloCSF \ + -i alignments="PhyloCSF development:/test data/CCDS longest isoforms hg19_29mammals chr21 alignments" \ + -i strategy=fixed -i alignments_per_job=10 \ + --delay-workspace-destruction -y --brief) + + $dx wait $job1 + $dx export tsv -o - $job1:scores + $dx wait $job2 + $dx export tsv -o - $job2:scores + $dx wait $job3 + $dx export tsv -o - $job3:scores + $dx wait $job4 + $dx export tsv -o - $job4:scores + $dx wait $job5 + printf "%d/197\n" $($dx export tsv -o - --no-header $job5:scores | cut -f2 | grep -v - | wc -l) +} + +if [ "$1" = "inner" ]; then + test_in_project + exit +fi + +TEST_PROJECT=$(dx new project --brief "PhyloCSF test ($(date))") +export TEST_PROJECT + +function cleanup { + dx rmproject -y $TEST_PROJECT > /dev/null +} +trap "cleanup; exit" int + +$SOURCE inner & +inner=$! + +exit_code=0 +wait $inner || exit_code=$? + +if [ "$exit_code" -ne 0 ]; then + echo "Leaving behind project for failed test, named \"$(dx describe $TEST_PROJECT --name)\"" + echo "To delete: dx rmproject -y $TEST_PROJECT" +else + cleanup +fi + +exit $exit_code