Permalink
...
Checking mergeability…
Don’t worry, you can still create the pull request.
Comparing changes
Open a pull request
- 9 commits
- 7 files changed
- 0 commit comments
- 1 contributor
Unified
Split
Showing
with
688 additions
and 0 deletions.
- +128 −0 src/FitECM/ApproxQ.ml
- +173 −0 src/FitECM/ArvestadBruno1997.ml
- +10 −0 src/FitECM/Makefile
- +372 −0 src/FitECM/TestApproxQ.ml
- +3 −0 src/FitECM/_tags
- +1 −0 src/FitECM/all.itarget
- +1 −0 src/FitECM/test.itarget
View
128
src/FitECM/ApproxQ.ml
| @@ -0,0 +1,128 @@ | ||
| +open CamlPaml | ||
| +open Batteries | ||
| +open Printf | ||
| +module JSON = Yojson.Basic | ||
| + | ||
| +let tol = 1e-6 | ||
| +let smooth = 1e-6 | ||
| + | ||
| +(* JSON helpers *) | ||
| +let vector_of_json = function | ||
| + | `List v -> | ||
| + Array.of_list | ||
| + v |> List.map | ||
| + function | ||
| + | `Float x -> x | ||
| + | `Int x -> float x | ||
| + | _ -> invalid_arg "expected numeric vector" | ||
| + | _ -> invalid_arg "expected vector" | ||
| + | ||
| +let matrix_of_json = function | ||
| + | `List rows -> | ||
| + try | ||
| + let m = rows |> List.map vector_of_json |> Array.of_list | ||
| + if Array.length m > 0 then | ||
| + let nc = Array.length m.(0) | ||
| + if not (Array.for_all (fun ar -> Array.length ar = nc) m) then invalid_arg "" | ||
| + m | ||
| + with | ||
| + | Invalid_argument _ -> failwith "expected rectangular numeric matrix" | ||
| + | _ -> invalid_arg "expected matrix" | ||
| + | ||
| +let (@) json key = match json with | ||
| + | `Assoc lst -> List.assoc key lst | ||
| + | _ -> invalid_arg "expected JSON hash" | ||
| + | ||
| + | ||
| + | ||
| +(* parse input JSON *) | ||
| +if Array.length Sys.argv < 2 then | ||
| + eprintf "Usage: ApproxECM <file.json>\n" | ||
| + exit (-1) | ||
| + | ||
| +let fn_input = Sys.argv.(1) | ||
| +let input = JSON.from_file fn_input | ||
| + | ||
| + | ||
| + | ||
| +(* load and validate input *) | ||
| +let pi = | ||
| + try | ||
| + vector_of_json (input@"CodonFrequencies") | ||
| + with | ||
| + | Not_found | ||
| + | Invalid_argument _ -> failwith ("expected CodonFrequencies to be a numeric vector") | ||
| + | ||
| +let n = Array.length pi | ||
| +if n < 2 then failwith "CodonFrequencies vector is too short" | ||
| + | ||
| +let pisum = | ||
| + pi |> Array.fold_left | ||
| + fun sum x -> | ||
| + if x < 0. then failwith "CodonFrequencies vector has a negative entry" | ||
| + sum +. x | ||
| + 0. | ||
| +if abs_float (pisum -. 1.0) > tol then failwith "CodonFrequencies vector does not sum to 1" | ||
| +let pi = Gsl.Vector.of_array pi | ||
| + | ||
| +let sms = | ||
| + try | ||
| + let lst = match input@"SubstitutionMatrices" with | ||
| + | `List lst -> lst | ||
| + | _ -> failwith "expected SubstitutionMatrices to be a list" | ||
| + Array.of_list | ||
| + List.map | ||
| + fun entry -> | ||
| + try | ||
| + let m = Gsl.Matrix.of_arrays (matrix_of_json (entry@"Matrix")) | ||
| + if Gsl.Matrix.dims m <> (n,n) then | ||
| + failwith (sprintf "Expected each substitution matrix to be %d-by-%d" n n) | ||
| + | ||
| + if smooth > 0. then | ||
| + for i = 0 to n-1 do | ||
| + for j = 0 to n-1 do | ||
| + m.{i,j} <- (m.{i,j} +. smooth) /. (1.0 +. (float n) *. smooth) | ||
| + | ||
| + CamlPaml.P.validate m | ||
| + let sps = `List [entry@"FirstSpecies"; entry@"SecondSpecies"] | ||
| + eprintf "%s\n" (JSON.to_string sps) | ||
| + m | ||
| + with | ||
| + | Not_found -> failwith "expected each member of SubstitutionMatrices to have a Matrix" | ||
| + lst | ||
| + with | ||
| + | Not_found -> failwith "expected SubstitutionMatrices" | ||
| + | ||
| +(* run algorithm *) | ||
| + | ||
| +eprintf "Estimating %d-by-%d rate matrix based on %d pairwise substitution matrices...\n" n n (Array.length sms) | ||
| +flush stderr | ||
| + | ||
| +let q = ArvestadBruno1997.est_Q pi sms | ||
| + | ||
| + | ||
| + | ||
| +(* output results *) | ||
| + | ||
| +for i = 1 to n-1 do | ||
| + for j = 0 to i-1 do | ||
| + if j > 0 then printf " " | ||
| + printf "%f" (q.{i,j} /. pi.{j}) | ||
| + printf "\n" | ||
| + | ||
| +pi |> Gsl.Vector.to_array |> Array.to_list |> List.map (sprintf "%f") |> String.concat " " |> printf "\n%s\n\n\n" | ||
| + | ||
| +if n = 64 then | ||
| + (* FIXME: residue vector should be included in input JSON *) | ||
| + printf "AAA AAC AAG AAT ACA ACC ACG ACT AGA AGC AGG AGT ATA ATC ATG ATT CAA CAC CAG CAT \ | ||
| +CCA CCC CCG CCT CGA CGC CGG CGT CTA CTC CTG CTT GAA GAC GAG GAT GCA GCC GCG GCT \ | ||
| +GGA GGC GGG GGT GTA GTC GTG GTT TAA TAC TAG TAT TCA TCC TCG TCT TGA TGC TGG TGT \ | ||
| +TTA TTC TTG TTT\n" | ||
| + | ||
| +(* | ||
| +let pos = ref 0 | ||
| +while !pos < n do | ||
| + let row = min 20 (n - !pos) | ||
| + Array.sub codons !pos row |> Array.to_list |> List.map (sprintf "%f") |> String.concat " " |> printf "%s\n" | ||
| + pos := !pos + row | ||
| +*) |
View
173
src/FitECM/ArvestadBruno1997.ml
| @@ -0,0 +1,173 @@ | ||
| +(* | ||
| + | ||
| +Subroutines for the Q matrix estimation strategy described in: | ||
| + | ||
| +Arvestad L, Bruno WJ (1997). Estimation of Reversible Substitution Matrices from Multiple Pairs of | ||
| +Sequences. J Mol Evol 45:696-703 | ||
| + | ||
| +*) | ||
| + | ||
| +open Batteries | ||
| +open Printf | ||
| + | ||
| +exception Numerical_error of string | ||
| + | ||
| +let diag v = | ||
| + let n = Gsl.Vector.length v | ||
| + let m = Gsl.Matrix.create ~init:0. n n | ||
| + for i = 0 to n-1 do m.{i,i} <- v.{i} | ||
| + m | ||
| + | ||
| +let mm a b = | ||
| + let (x,y) = Gsl.Matrix.dims a | ||
| + let (y',z) = Gsl.Matrix.dims b | ||
| + if (y <> y') then invalid_arg "mm: matrix dimensions mismatched" | ||
| + let c = Gsl.Matrix.create ~init:0.0 x z | ||
| + Gsl.Blas.(gemm ~ta:NoTrans ~tb:NoTrans ~alpha:1.0 ~a ~b ~beta:0.0 ~c) | ||
| + c | ||
| + | ||
| +(* symmetrize the ~P matrix according to eq. 12 *) | ||
| +let symmetrizeP pi p = | ||
| + let (m,n) = Gsl.Matrix.dims p | ||
| + assert (m=n) | ||
| + assert (n=Gsl.Vector.length pi) | ||
| + | ||
| + let pi_sqrt = Gsl.Vector.copy pi | ||
| + for i = 0 to n-1 do pi_sqrt.{i} <- sqrt pi.{i} | ||
| + (* pi_sqrt is pi^(1/2) as in eq. 10 *) | ||
| + | ||
| + let pi_invsqrt = Gsl.Vector.copy pi_sqrt | ||
| + for i = 0 to n-1 do pi_invsqrt.{i} <- 1.0 /. pi_sqrt.{i} | ||
| + (* pi_invsqrt is pi^(-1/2) as in eq. 10 *) | ||
| + | ||
| + (* now compute eq. 12. TODO this more efficiently than actually constructing & multiplying the diagonal matrices *) | ||
| + let x = mm (diag pi_sqrt) (mm p (diag pi_invsqrt)) | ||
| + | ||
| + let xT = Gsl.Matrix.create n n | ||
| + Gsl.Matrix.transpose xT x | ||
| + | ||
| + Gsl.Matrix.add x xT | ||
| + Gsl.Matrix.scale x 0.5 | ||
| + x | ||
| + | ||
| +(* compute the eigenvectors of the symmetrized P matrix, | ||
| + and use them to estimate the eigenvectors of P and Q *) | ||
| +let est_eigenvectors pi symP = | ||
| + let (m,n) = Gsl.Matrix.dims symP | ||
| + assert (m=n) | ||
| + assert (n=Gsl.Vector.length pi) | ||
| + | ||
| + let (_,u) = Gsl.Eigen.symmv ~protect:true (`M symP) | ||
| + (* columns of u are the eigenvectors of symP *) | ||
| + | ||
| + let pi_sqrt = Gsl.Vector.copy pi | ||
| + for i = 0 to n-1 do pi_sqrt.{i} <- sqrt pi.{i} | ||
| + | ||
| + let pi_invsqrt = Gsl.Vector.copy pi_sqrt | ||
| + for i = 0 to n-1 do pi_invsqrt.{i} <- 1.0 /. pi_sqrt.{i} | ||
| + | ||
| + let rv = mm (diag pi_invsqrt) u | ||
| + Gsl.Matrix.transpose_in_place rv | ||
| + (* rows of rv are the estimated right eigenvectors of P and Q (eq. 14) *) | ||
| + | ||
| + Gsl.Matrix.transpose_in_place u | ||
| + let lv = mm u (diag pi_sqrt) | ||
| + (* rows of lv are estimated left eigenvectors of P and Q (eq. 13) *) | ||
| + | ||
| + lv, rv | ||
| + | ||
| +(* estimate the eigenvalues of Q based on the observations and estimated eigenvectors *) | ||
| +let est_eigenvalues sms lv rv = | ||
| + let (m,n) = Gsl.Matrix.dims lv | ||
| + assert (m=n) | ||
| + assert ((m,n) = Gsl.Matrix.dims rv) | ||
| + let k = Array.length sms | ||
| + assert (k>0) | ||
| + sms |> Array.iter (fun sm -> assert ((m,n) = Gsl.Matrix.dims sm)) | ||
| + | ||
| + (* compute distance estimate for each pair of sequences (sm) and nucleotide (eq. 17) *) | ||
| + let distances = | ||
| + sms |> Array.mapi | ||
| + fun which_pair sm -> | ||
| + let y = Gsl.Vector.create ~init:0. n | ||
| + Array.init n | ||
| + fun i -> | ||
| + Gsl.Blas.(gemv NoTrans ~alpha:1.0 ~a:sm ~x:(Gsl.Matrix.row rv i) ~beta:0.0 ~y) | ||
| + let exp_d_i = (Gsl.Blas.(dot (Gsl.Matrix.row lv i) y)) | ||
| + if (exp_d_i <= 0.) then raise (Numerical_error (sprintf "Bad distance estimate for species pair %d, residue %d: log(%e)" which_pair i exp_d_i)) | ||
| + (* TODO: test cases to provoke negative exp_d_i | ||
| + http://www2.gsu.edu/~mkteer/npdmatri.html | ||
| + *) | ||
| + log exp_d_i | ||
| + | ||
| + (* least-squares estimates of the eigenvalues (eq. 18), arbitrarily assigning the last eigenvalue to 1. | ||
| + TODO make sure that cannot be the zero eigenvalue. *) | ||
| + let denom = distances |> Array.map (fun d -> d.(n-1) *. d.(n-1)) |> Array.fold_left (+.) 0.0 | ||
| + let denomfpc = classify_float denom | ||
| + if (denomfpc = FP_nan || denomfpc = FP_infinite || denom < 1e-6) then | ||
| + raise (Numerical_error (sprintf "Bad denominator for eigenvalue estimates: %e" denom)) | ||
| + let ans = | ||
| + Array.init n | ||
| + fun i -> | ||
| + if i < n-1 then | ||
| + distances |> Array.map (fun d -> d.(i) *. d.(n-1)) |> Array.fold_left (+.) 0.0 |> ( *. ) (1.0 /. denom) | ||
| + else 1.0 | ||
| + | ||
| + (* check all eigenvalues are nonnegative reals *) | ||
| + let ans_tot = Array.fold_left (+.) 0. ans | ||
| + | ||
| + match classify_float ans_tot with | ||
| + | FP_infinite | FP_nan -> raise (Numerical_error "Infinite/undefined eigenvalue estimate") | ||
| + | _ -> () | ||
| + | ||
| + (* round slightly negative eigenvalue estimates up to zero *) | ||
| + ans |> Array.iteri | ||
| + fun i ans_i -> | ||
| + if ans_i < 0. then | ||
| + if (abs_float ans_i >= 1e-3 *. (float n) *. ans_tot) then | ||
| + raise (Numerical_error (sprintf "Eigenvalue estimate %d is too negative: %e" i ans_i)) | ||
| + ans.(i) <- 0. | ||
| + | ||
| + Gsl.Vector.of_array ans | ||
| + | ||
| +(* Normalize rate matrix to unity mean rate of replacement at equilibrium *) | ||
| +let normalize_Q pi q = | ||
| + let (m,n) = Gsl.Matrix.dims q | ||
| + assert (m=n) | ||
| + let qdiag = Gsl.Vector.of_array (Array.init m (fun i -> q.{i,i})) | ||
| + | ||
| + let z = 1.0 /. (Gsl.Blas.dot pi qdiag) | ||
| + (* 'sign' of eigenvectors is arbitrary *) | ||
| + Gsl.Matrix.scale q (if z>0. then 0. -. z else z) | ||
| + | ||
| + (* round slightly negative off-diagonal entries to zero *) | ||
| + for i = 0 to n-1 do | ||
| + let row_tot = ref 0. | ||
| + for j = 0 to n-1 do if i <> j then row_tot := !row_tot +. q.{i,j} | ||
| + | ||
| + for j = 0 to n-1 do | ||
| + if i <> j && q.{i,j} < 0. then | ||
| + if abs_float q.{i,j} >= 1e-3 *. (float n) *. !row_tot then | ||
| + raise (Numerical_error (sprintf "Rate estimate Q[%d,%d] is too negative: %e" i j q.{i,j})) | ||
| + q.{i,j} <- 0. | ||
| + | ||
| +(* putting it all together *) | ||
| +let est_Q pi sms = | ||
| + let n = Gsl.Vector.length pi | ||
| + if (n < 2 || Array.length sms = 0) then invalid_arg "ArvestadBruno1997.est_Q" | ||
| + sms |> Array.iter (fun sm -> if (Gsl.Matrix.dims sm <> (n,n)) then invalid_arg "ArvestadBruno1997.est_Q") | ||
| + | ||
| + let p = Gsl.Matrix.create ~init:0. n n | ||
| + sms |> Array.iter (fun sm -> Gsl.Matrix.add p sm) | ||
| + | ||
| + let symP = symmetrizeP pi p | ||
| + | ||
| + let lv, rv = est_eigenvectors pi symP | ||
| + let ev = est_eigenvalues sms lv rv | ||
| + | ||
| + (* reconstruct Q according to the spectral thm Q = rv'*diag(ev)*lv | ||
| + http://www.maths.lancs.ac.uk/~gilbert/m306c/node4.html *) | ||
| + Gsl.Matrix.transpose_in_place rv | ||
| + let q = mm rv (mm (diag ev) lv) | ||
| + normalize_Q pi q | ||
| + q |
View
10
src/FitECM/Makefile
| @@ -0,0 +1,10 @@ | ||
| +all: | ||
| + ocamlbuild -use-ocamlfind all.otarget | ||
| + | ||
| +test: | ||
| + ocamlbuild -use-ocamlfind test.otarget | ||
| + _build/TestApproxQ.native -verbose | ||
| + | ||
| +clean: | ||
| + rm -f *~ | ||
| + ocamlbuild -clean |
Oops, something went wrong.