diff --git a/lib/CamlPaml/CamlPaml.mlpack b/lib/CamlPaml/CamlPaml.mlpack index 5900b58..5ff9a0a 100644 --- a/lib/CamlPaml/CamlPaml.mlpack +++ b/lib/CamlPaml/CamlPaml.mlpack @@ -4,10 +4,10 @@ NewickLexer T P Q -Infer Expr Fit PhyloModel +PhyloLik PhyloEM Code Parsimony diff --git a/lib/CamlPaml/CamlPaml.odocl b/lib/CamlPaml/CamlPaml.odocl index 05b1248..d5fbe19 100644 --- a/lib/CamlPaml/CamlPaml.odocl +++ b/lib/CamlPaml/CamlPaml.odocl @@ -2,10 +2,10 @@ Code Newick T Parsimony -Expr +PhyloModel P Q -PhyloModel +Expr +PhyloLik PhyloEM -Infer Fit diff --git a/lib/CamlPaml/PhyloEM.ml b/lib/CamlPaml/PhyloEM.ml index 12abdba..2bdfeb0 100644 --- a/lib/CamlPaml/PhyloEM.ml +++ b/lib/CamlPaml/PhyloEM.ml @@ -11,12 +11,12 @@ let new_sufficient_statistics m = let t0 = PM.tree m Array.init (T.size t0 - 1) (fun _ -> Array.make_matrix n n 0.) -let collect_sufficient_statistics m leaves ss = - let inter = PM.infer m leaves - let lik = Infer.likelihood inter +let collect_sufficient_statistics ?workspace m leaves ss = + let inter = PM.prepare_lik ?workspace m leaves + let lik = PhyloLik.likelihood inter let t = PM.tree m for i = 0 to T.root t - 1 do - Infer.add_branch_posteriors inter i ss.(i) + PhyloLik.add_branch_posteriors inter i ss.(i) lik let clean_sufficient_statistics ?(tol=1e-6) ss = diff --git a/lib/CamlPaml/PhyloEM.mli b/lib/CamlPaml/PhyloEM.mli index adc3927..29280a1 100644 --- a/lib/CamlPaml/PhyloEM.mli +++ b/lib/CamlPaml/PhyloEM.mli @@ -8,8 +8,8 @@ type sufficient_statistics = float array array array val new_sufficient_statistics : PhyloModel.t -> sufficient_statistics -(** compute the E-step statistics for one site, given the leaves (as would be given to {{:PhyloModel}PhyloModel.infer}). Update the given [sufficient_statistics] and return the probability of the leaves under the model. This should be called for each site to collect the statistics for the whole alignment. *) -val collect_sufficient_statistics : PhyloModel.t -> Infer.leaf array -> sufficient_statistics -> float +(** compute the E-step statistics for one site, given the leaves (as would be given to {{:PhyloModel}PhyloModel.prepare_lik}). Update the given [sufficient_statistics] and return the probability of the leaves under the model. This should be called for each site to collect the statistics for the whole alignment. *) +val collect_sufficient_statistics : ?workspace:PhyloLik.workspace -> PhyloModel.t -> PhyloLik.leaf array -> sufficient_statistics -> float (** any entry in the sufficient statistics less than [tol] is set to zero. useful if e.g. planning to compress the sufficient statistics for transport between compute nodes *) val clean_sufficient_statistics : ?tol:float -> sufficient_statistics -> unit diff --git a/lib/CamlPaml/Infer.ml b/lib/CamlPaml/PhyloLik.ml similarity index 79% rename from lib/CamlPaml/Infer.ml rename to lib/CamlPaml/PhyloLik.ml index 40f9053..76f6d15 100644 --- a/lib/CamlPaml/Infer.ml +++ b/lib/CamlPaml/PhyloLik.ml @@ -18,21 +18,29 @@ let get_leaf k leaf = match leaf with | `Distribution ar -> Gsl_vector.of_array ar | `Marginalize -> hashcons_raw_marg k +type workspace = { + mutable generation : int; + data : Gsl_matrix.matrix +} + type intermediate = { tree : T.t; pms : Gsl_matrix.matrix array; leaves : leaf array; - alpha : Gsl_matrix.matrix; + + workspace : workspace; + my_generation : int; + + alpha : Gsl_matrix.matrix; (** actually a sub-matrix of workspace.data *) mutable z : float; mutable have_alpha : bool; - beta : Gsl_matrix.matrix; - mutable have_beta : bool + beta : Gsl_matrix.matrix; (** actually a sub-matrix of workspace.data *) + mutable have_beta : bool; } -type workspace = Gsl_matrix.matrix let new_workspace tree dim = let rows = 2 * (T.size tree) - (T.leaves tree) - Gsl_matrix.create rows dim + { generation = min_int; data = Gsl_matrix.create rows dim } let empty = [||] let prepare ?workspace tree pms prior leaves = @@ -44,14 +52,17 @@ let prepare ?workspace tree pms prior leaves = if Array.length pms < n-1 then invalid_arg "CamlPaml.Infer.prepare: not enough P matrices" let workspace = match workspace with Some x -> x | None -> new_workspace tree k - let rows,cols = Gsl_matrix.dims workspace + workspace.generation <- (if workspace.generation = max_int then min_int else workspace.generation+1) + + let rows,cols = Gsl_matrix.dims workspace.data if rows < (2*n-nl) || cols <> k then invalid_arg "CamlPaml.Infer.prepare: inappropriate workspace dimensions" - let alpha = Bigarray.Array2.sub_left workspace 0 (n-nl) - let beta = Bigarray.Array2.sub_left workspace (n-nl) n + let alpha = Bigarray.Array2.sub_left workspace.data 0 (n-nl) + let beta = Bigarray.Array2.sub_left workspace.data (n-nl) n for a = 0 to k-1 do beta.{n-1,a} <- prior.(a) - { tree = tree; pms = pms; leaves = leaves; alpha = alpha; z = nan; have_alpha = false; beta = beta; have_beta = false } + { tree = tree; pms = pms; leaves = leaves; workspace = workspace; my_generation = workspace.generation; + alpha = alpha; z = nan; have_alpha = false; beta = beta; have_beta = false } (* Inside algo (aka Felsenstein algo.) *) let alpha_get x br = @@ -106,10 +117,12 @@ let ensure_beta x = x.have_beta <- true let likelihood x = + if x.workspace.generation <> x.my_generation then failwith "CamlPaml.PhyloLik.likelihood: invalidated workspace" ensure_alpha x x.z let node_posterior inferred i = + if inferred.workspace.generation <> inferred.my_generation then failwith "CamlPaml.PhyloLik.node_posterior: invalidated workspace" ensure_beta inferred let k = snd (Gsl_matrix.dims inferred.alpha) if inferred.z = 0. then @@ -123,6 +136,7 @@ let node_posterior inferred i = Array.init k (fun x -> (alpha_get inferred i).{x} *. inferred.beta.{i,x} /. inferred.z) let add_branch_posteriors ?(weight=1.0) inferred branch ecounts = + if inferred.workspace.generation <> inferred.my_generation then failwith "CamlPaml.PhyloLik.add_branch_posterior: invalidated workspace" ensure_beta inferred let k = snd (Gsl_matrix.dims inferred.alpha) diff --git a/lib/CamlPaml/Infer.mli b/lib/CamlPaml/PhyloLik.mli similarity index 74% rename from lib/CamlPaml/Infer.mli rename to lib/CamlPaml/PhyloLik.mli index 617e229..88e62a5 100644 --- a/lib/CamlPaml/Infer.mli +++ b/lib/CamlPaml/PhyloLik.mli @@ -1,29 +1,31 @@ -(** Probabilistic inference on phylogenetic trees given the substitution (P) matrices on each branch. These are low-level routines that should usually be used through the higher-level {{:PhyloModel}[PhyloModel]} abstractions. *) +(** core phylogenetic likelihood calculations -type intermediate +These should usually be accessed through the higher-level {{:PhyloModel}[PhyloModel]} abstractions. *) (** Specifying a leaf (extant) character. Usually the extant character is known with certainty; in this case, use [`Certain] with the index of the character, e.g. [`Certain (Code.Codon61.index ('A','T','G'))]. Alternatively, you can specify an arbitrary probability distribution over extant characters. Lastly, you can specify to marginalize a leaf out of the likelihood calculations entirely. *) type leaf = [`Certain of int | `Distribution of float array | `Marginalize] type workspace -(** The calculations use a workspace of [((2 * T.size tree - T.leaves tree) * k)] [float]s where [k] is the alphabet size. As a performance optimization, you can create a workspace with [new_workspace tree k] and use it across multiple calls to [prepare]; otherwise it will allocate by itself. *) +(** The calculations use a workspace of [((2 * T.size tree - T.leaves tree) * k)] [float]s where [k] is the alphabet size. As a performance optimization, you can create a workspace with [new_workspace tree k] and use it across multiple calls to [prepare]; otherwise, it will allocate automatically. *) val new_workspace : T.t -> int -> workspace -(** [prepare tree p_matrices root_prior leaves] infers ancestral states, given: +type intermediate + +(** [prepare tree p_matrices root_prior leaves] initializes the likelihood calculations, given: - [tree] the phylogenetic tree -- [p_matrices.(i)] is the substitution matrix for the branch leading TO node [i] FROM its parent. +- [p_matrices.(i)] is the substitution matrix for the branch leading {e to} node [i] {e from} its parent. - [root_prior] the prior probability distribution over characters at the root. - [leaves] is an array of [leaf]s (see above), the appropriate number for the tree -- [workspace] is an appropriately sized workspace; it will be allocated if not given. -@return an abstract value from which various information about ancestral states can be extracted (see below). If using a shared workspace, be sure to get all the results you need before the next call to [prepare]. +- [workspace] is an appropriately sized workspace; one will be allocated if not given. +@return an abstract value from which various information about ancestral states can be extracted (see below). The time-consuming calculations are not actually performed until needed. If using a shared workspace, be sure to get all the results you need before the next call to [prepare]. *) val prepare : ?workspace:workspace -> T.t -> P.matrix array -> float array -> leaf array -> intermediate (** calculate the probability of the leaves under the substitution model *) val likelihood : intermediate -> float -(** compute the posterior probability distribution over characters at the specified node *) +(** compute the posterior probability distribution over characters at the specified node. *) val node_posterior : intermediate -> int -> float array (** [branch_posteriors intermediate k] computes the posterior probability of each possible substitution on the specified branch [k]. That is, entry [(i,j)] of the returned matrix is [P(parent(k) = i && k = j | Leaves)] *) diff --git a/lib/CamlPaml/PhyloModel.ml b/lib/CamlPaml/PhyloModel.ml index f33352c..71cb73b 100644 --- a/lib/CamlPaml/PhyloModel.ml +++ b/lib/CamlPaml/PhyloModel.ml @@ -29,7 +29,7 @@ let q { qms = qms } br = qms.(br) let p { pms = pms } br = pms.(br) (* Array.map Array.copy pms.(br) *) let prior { prior = rp } = Array.copy rp -let infer ?workspace m leaves = Infer.prepare ?workspace m.tree m.pms m.prior leaves +let prepare_lik ?workspace m leaves = PhyloLik.prepare ?workspace m.tree m.pms m.prior leaves let checksum = 1., 1e-6 diff --git a/lib/CamlPaml/PhyloModel.mli b/lib/CamlPaml/PhyloModel.mli index 35f0f2f..f9f547b 100644 --- a/lib/CamlPaml/PhyloModel.mli +++ b/lib/CamlPaml/PhyloModel.mli @@ -25,13 +25,13 @@ val prior : t -> float array *) val simulate : t -> (?a:(int array) -> unit -> int array) -(** Probabilistically infer ancestral characters based on the given configuration of extant characters (leaves). +(** Prepare likelihood calculations for the given configuration of extant characters (leaves). -@param workspace (optional) a preallocated workspace for [Infer.prepare] +@param workspace (optional) a preallocated workspace for {{:PhyloLik}PhyloLik.prepare} -@return {{:Infer}intermediate values}, from which additional information can be extracted. +@return {{:PhyloLik}PhyloLik.intermediate} values, from which additional information can be extracted. *) -val infer : ?workspace:Infer.workspace -> t -> Infer.leaf array -> Infer.intermediate +val prepare_lik : ?workspace:PhyloLik.workspace -> t -> PhyloLik.leaf array -> PhyloLik.intermediate (** {1 Symbolic parameterizations} diff --git a/src/PhyloCSFModel.ml b/src/PhyloCSFModel.ml index 968c5db..599d455 100644 --- a/src/PhyloCSFModel.ml +++ b/src/PhyloCSFModel.ml @@ -52,11 +52,11 @@ let lpr_leaves inst leaves t = let ts = PM.P14n.tree_settings inst ts.(0) <- t let inst = PM.P14n.update ~tree_settings:ts inst - let workspace = Infer.new_workspace (PM.tree (PM.P14n.model inst)) Codon.dim + let workspace = PhyloLik.new_workspace (PM.tree (PM.P14n.model inst)) Codon.dim let lik = ref 0. leaves |> Array.iter fun lvs -> - let lvslik = Infer.likelihood (PM.infer ~workspace:workspace (PM.P14n.model inst) lvs) + let lvslik = PhyloLik.likelihood (PM.prepare_lik ~workspace:workspace (PM.P14n.model inst) lvs) lik := !lik +. log lvslik !lik, inst