
WIP: Add support for shifted arrays (#22)
* shifted support attempt

* fixed missing inconvenience

* test data

* update news
Pietro Vertechi committed Feb 15, 2018
1 parent 45de004 commit d56fd84
Showing 7 changed files with 45 additions and 5 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,5 +1,10 @@
# GroupedErrors Release Notes

## v.0.1.2

- Compatible with new `IndexedTables` grouping mechanism
- Support for alignment analysis of time varying signals

## v.0.1.1

### Bugfixes
23 changes: 23 additions & 0 deletions README.md
@@ -164,6 +164,29 @@ end

![density](https://user-images.githubusercontent.com/6333339/29373096-06317b50-82a5-11e7-900f-d6c183977ab8.png)

### Analysis of time varying signals

GroupedErrors allows (experimentally! use at your own risk!) aligning time varying signals using [ShiftedArrays](https://github.com/piever/ShiftedArrays.jl) (at the moment an unreleased version of ShiftedArrays is required; a release will follow as soon as possible). You need to build a column of `ShiftedArray`s as follows. Say `v` is your vector of signals and the indices `inds = [13, 456, 607]` are those where the meaningful events happen (assuming your dataset only has 3 rows; in practice `inds` will of course be much longer). You can create a column of `ShiftedArray`s with:

```julia
[ShiftedArray(v, -i) for i in inds]
```
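Conceptually, each `ShiftedArray(v, -i)` re-centers the signal so that relative index 0 falls on event `i`; reducing across the shifted copies then averages the signal aligned to the events, with out-of-bounds samples treated as missing. A language-agnostic sketch of that idea in plain Python (`aligned_mean` and its names are illustrative, not part of GroupedErrors or ShiftedArrays):

```python
def aligned_mean(v, inds, window):
    """Average signal v around each event index, per relative offset.

    v: list of samples; inds: 1-based event indices; window: iterable of
    relative offsets (e.g. range(-100, 101)). Out-of-bounds samples are
    skipped, mirroring how missing values are dropped from the estimate.
    """
    out = []
    for k in window:
        # collect the sample at offset k from every event that has one
        vals = [v[i - 1 + k] for i in inds if 0 <= i - 1 + k < len(v)]
        out.append(sum(vals) / len(vals) if vals else None)
    return out

signal = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
events = [3, 6]  # 1-based indices of the events
print(aligned_mean(signal, events, range(-1, 2)))  # [2.5, 3.5, 4.5]
```

The `@x -100:100 :discrete` axis in the example below plays the role of `window` here: one estimate per offset relative to the events.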

and add it to your data. GroupedErrors can then leverage the reducing functions from ShiftedArrays to run the analysis. In this example dataset, the column `:signal` holds the vector of `ShiftedArray`s, `:subject` is the grouping variable, and `:treatment` is the variable used to split the data:

```julia
df = JuliaDB.load(joinpath(Pkg.dir("GroupedErrors", "test", "tables"), "test_signal"))
@> df begin
@splitby _.treatment
@across _.subject
@x -100:100 :discrete
@y _.signal
@plot plot() :ribbon
end
```

![signal](https://user-images.githubusercontent.com/6333339/36281283-0cb52924-1295-11e8-87aa-b01160e3aa5e.png)

### Non-parametric bootstrap error computation

Rather than computing the variability across groups, it is also possible to compute the overall variability with a non-parametric [bootstrap](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)#Case_resampling) via the `@bootstrap` macro. The analysis is run the specified number of times (default 1000) on fake datasets sampled with replacement from the actual data. The estimate and error are computed as the mean and standard deviation of the different outcomes. Example:
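The case-resampling scheme just described can be sketched in plain Python (the function and its names are hypothetical; the actual implementation works on tables and runs the full pipeline per resample):

```python
import random
from statistics import mean, stdev

def bootstrap(data, statistic, n_boot=1000, seed=0):
    """Estimate a statistic and its error by case resampling.

    Draws n_boot fake datasets of the same size, sampled with
    replacement, applies `statistic` to each, and returns the mean and
    standard deviation of the outcomes as estimate and error.
    """
    rng = random.Random(seed)
    outcomes = [statistic(rng.choices(data, k=len(data)))
                for _ in range(n_boot)]
    return mean(outcomes), stdev(outcomes)

estimate, error = bootstrap([1.0, 2.0, 3.0, 4.0, 5.0], mean, n_boot=200)
print(estimate, error)  # estimate near 3.0, error roughly 0.6
```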
1 change: 1 addition & 0 deletions REQUIRE
@@ -9,3 +9,4 @@ TableTraitsUtils 0.1.0
TableTraits
MacroTools
Lazy
ShiftedArrays 0.2.0
1 change: 1 addition & 0 deletions src/GroupedErrors.jl
@@ -9,6 +9,7 @@ using TableTraits
using MacroTools
using StatsBase
using NamedTuples
using ShiftedArrays, Missings

export @splitby, @bootstrap, @across, @x, @y, @xy, @compare, @summarize, @set_attr, @>, @plot
export @xlims, @ylims
12 changes: 11 additions & 1 deletion src/analysisfunctions.jl
@@ -33,7 +33,16 @@ end
In the discrete case, the function computes the estimate of `y` for
a given value of `x` using the function `estimator` (default is mean)
"""
_locreg(::Val{:discrete}, xaxis, t; estimator = mean) = groupby((:y => estimator, ), t, :x, select = :y)
function _locreg(::Val{:discrete}, xaxis, t; estimator = mean)
if column(t, :y)[1] isa ShiftedArray
v = reduce_vec(estimator, column(t, :y), xaxis)
inds = find(!ismissing, v)
table(xaxis[inds], convert(Vector{Float64}, view(v, inds)), names = [:x, :y],
copy = false, presorted = true)
else
groupby((:y => estimator, ), t, :x, select = :y)
end
end

"""
`_density(df,xaxis::Range, x; kwargs...)`
@@ -85,6 +94,7 @@ end

#### Method to compute and plot grouped error plots using the above functions

get_axis(column::AbstractArray{<:Range}) = column[1]
get_axis(column) = sort!(union(column))
get_axis(column, npoints::Int64) = linspace(extrema(column)..., npoints)

8 changes: 4 additions & 4 deletions src/pipeline.jl
@@ -46,7 +46,7 @@ function process_axis_type!(cols, kw)
(kw[:axis_type] == :auto) && (kw[:axis_type] = :continuous)
kw[:axis_type] in [:discrete, :continuous] ||
error("Axis type $(kw[:axis_type]) is not supported")
if all(isnan.(y))
if all(t -> (t isa Real) && isnan(t), y)
if kw[:axis_type] == :discrete
y .= bin_width
else
@@ -66,13 +66,13 @@ end

function get_grouped_error(trend, variation, f, xaxis, split_table, compute_error)
if !isa(compute_error, Integer)
subject_table = groupby(tt -> f(xaxis, table(tt)), split_table, flatten = true)
subject_table = groupby(tt -> f(xaxis, table(tt)), split_table, flatten = true, select = IndexedTables.valuenames(split_table))
else
ns = compute_error
large_table = table(repeat(collect(1:ns), inner = length(xaxis)),
repeat(xaxis, outer = ns), zeros(ns*length(xaxis)), names = [:across, :x, :y],
pkey = :across, presorted = true)
subject_table = groupby(large_table, flatten = true) do tt
subject_table = groupby(large_table, flatten = true, select = IndexedTables.valuenames(large_table)) do tt
nd = length(split_table)
perm = rand(1:nd, nd)
permuted_data = table(columns(split_table, :x)[perm], columns(split_table, :x)[perm],
@@ -117,4 +117,4 @@ function _group_apply(t::Table2Process)
end
end
return filter(_isfinite, g)
end
end
Binary file added test/tables/test_signal
Binary file not shown.