-
Notifications
You must be signed in to change notification settings - Fork 4
/
analysisfunctions.jl
128 lines (105 loc) · 4.09 KB
/
analysisfunctions.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#### Auxiliary function
"""
`tablify(xaxis, data)`

Build a two-column table with columns `:x` (the collected points of `xaxis`) and `:y`
(`data`), using `:x` as primary key. The table is created with `presorted = true`, so
callers are expected to pass `xaxis` already in sorted order.
"""
function tablify(xaxis, data)
    xs = collect(xaxis)
    return table(xs, data; names = [:x, :y], pkey = :x, presorted = true)
end
"""
`tablify(xaxis, data::IndexedTables.AbstractIndexedTable, val = 0.0)`

Pad the table `data` so that every point of `xaxis` is present: the points of `xaxis`
that do not occur in the `:x` column of `data` are put in a new table with `val` as
their `:y` value, and that table is merged with `data`.
"""
function tablify(xaxis, data::IndexedTables.AbstractIndexedTable, val = 0.0)
    missing_points = setdiff(xaxis, columns(data, :x))
    padding = tablify(missing_points, fill(val, length(missing_points)))
    return merge(padding, data)
end
##### List of functions to analyze data
"""
`_locreg(::Val{:continuous}, xaxis, t; kwargs...)`

Apply loess regression, training the regressor on the `:x` and `:y` columns of the
table `t` and predicting on the points of `xaxis` that lie between the smallest and the
largest training `x` (bounds included). Axis points outside that interval are dropped.
Keyword arguments are forwarded to `Loess.loess`.

Returns a table with columns `:x` (the retained axis points) and `:y` (the predictions);
the table is empty when no axis point falls inside the training interval.
"""
function _locreg(::Val{:continuous}, xaxis, t; kwargs...)
    x, y = columns(t, :x), columns(t, :y)
    within = if isempty(xaxis)
        # Nothing to predict: do not touch `x`, so an empty axis keeps working even
        # when there is no training data to take a range from.
        filter(_ -> true, xaxis)
    else
        # Compute the training range once instead of rescanning `x` for every point
        # of the axis.
        lo, hi = extrema(x)
        filter(p -> lo <= p <= hi, xaxis)
    end
    prediction = if isempty(within)
        Float64[]
    else
        model = Loess.loess(
            convert(Vector{Float64}, x), convert(Vector{Float64}, y); kwargs...
        )
        Loess.predict(model, within)
    end
    return tablify(within, prediction)
end
"""
`_locreg(::Val{:discrete}, xaxis, t; estimator = mean)`

In the discrete case, estimate `y` for each value of `x` with the function `estimator`
(default is `mean`).

- When the first entry of the `:y` column of `t` is a `ShiftedArray`, the column is
  reduced with `reduce_vec(estimator, ...)` over `xaxis` (called with `filter = !isnan`),
  and only the axis points whose result is not `missing` are kept.
- Otherwise the rows of `t` are grouped by `:x` and `estimator` is applied to the `:y`
  values of each group.
"""
function _locreg(::Val{:discrete}, xaxis, t; estimator = mean)
    ys = column(t, :y)
    if !(ys[1] isa ShiftedArray)
        return groupby((:y => estimator,), t, :x, select = :y)
    end
    reduced = reduce_vec(estimator, ys, xaxis, filter = !isnan)
    keep = findall(!ismissing, reduced)
    estimates = convert(Vector{Float64}, view(reduced, keep))
    return table(
        xaxis[keep], estimates; names = [:x, :y], copy = false, presorted = true
    )
end
"""
`_density(::Val{:continuous}, xaxis, t; kwargs...)`

Kernel density of the `:x` column of the table `t`, evaluated at the points of `xaxis`.
Keyword arguments are forwarded to `KernelDensity.kde`. Returns a table with columns
`:x` and `:y`.
"""
function _density(::Val{:continuous}, xaxis, t; kwargs...)
    estimate = KernelDensity.kde(columns(t, :x); kwargs...)
    density_on_axis = KernelDensity.pdf(estimate, xaxis)
    return tablify(xaxis, density_on_axis)
end
"""
`_density(::Val{:discrete}, xaxis, t; normalize = true)`

Histogram of `x` (which is discrete: every value is its own bin). For each value of
`:x`, the number of rows in its group is divided by the sum of the `:y` column of `t`
when `normalize` is `true`, and by `1.0` otherwise. Points of `xaxis` that have no rows
in `t` are given the value `0.0`.
"""
function _density(::Val{:discrete}, xaxis, t; normalize = true)
    denominator = if normalize
        reduce(+, t, select = :y)
    else
        1.0
    end
    group_share = v -> length(v) / denominator
    per_value = groupby((:y => group_share,), t, :x, select = :y)
    return tablify(xaxis, per_value, 0.0)
end
"""
`_density_axis(column, axis_type::Symbol; kwargs...)`

Choose the x axis on which the density of `column` is computed. For
`axis_type == :discrete` this is `get_axis(column)`. For any other `axis_type` it is a
range of 100 points spanning the extrema of the grid (`.x`) of the kernel density
estimate of `column`; keyword arguments are forwarded to `KernelDensity.kde`.
"""
function _density_axis(column, axis_type::Symbol; kwargs...)
    axis_type == :discrete && return get_axis(column)
    lo, hi = extrema(KernelDensity.kde(column; kwargs...).x)
    return range(lo, stop = hi, length = 100)
end
"""
`_cumulative(T, xaxis, t)`

Empirical cumulative distribution function of the `:x` column of the table `t`,
evaluated at the points of `xaxis`. Returns a table with columns `:x` and `:y`.
The first argument `T` is accepted but not used by this function.
"""
function _cumulative(T, xaxis, t)
    empirical = ecdf(columns(t, :x))
    return tablify(xaxis, empirical(xaxis))
end
"""
`_hazard(T, xaxis, t; kwargs...)`

Hazard rate of `x`, computed along `xaxis`. Keyword arguments are passed to the
function computing the density.

With `pdf` the `:y` column of `_density(T, xaxis, t; kwargs...)`, `cdf` the `:y` column
of `_cumulative(T, xaxis, t)` and `bin_size` the `y` field of the first row of `t`, the
value at each axis point is `pdf / (1 + bin_size * pdf - cdf)`.
"""
function _hazard(T, xaxis, t; kwargs...)
    pdf_values = columns(_density(T, xaxis, t; kwargs...), :y)
    cdf_values = columns(_cumulative(T, xaxis, t), :y)
    width = t[1].y
    rate = pdf_values ./ (1 .+ width .* pdf_values .- cdf_values)
    return tablify(xaxis, rate)
end
#### Method to compute and plot grouped error plots using the above functions
"""
`get_axis(column::AbstractArray{<:AbstractRange})`

When the entries of `column` are themselves ranges, the axis is the entry stored at
index 1.
"""
function get_axis(column::AbstractArray{<:AbstractRange})
    return column[1]
end
"""
`get_axis(column)`

Axis made of the distinct values of `column`, sorted in increasing order. The result is
a new collection; `column` itself is not modified.
"""
function get_axis(column)
    distinct = union(column)
    return sort!(distinct)
end
"""
`get_axis(column, npoints::Integer)`

Axis of `npoints` evenly spaced points going from the minimum to the maximum of
`column`.

`npoints` may be any `Integer`, not only `Int64`, so that integer literals on 32-bit
platforms and other integer widths are accepted instead of raising a `MethodError`.
"""
function get_axis(column, npoints::Integer)
    lo, hi = extrema(column)
    # Convert to the platform `Int` so the resulting range does not depend on the
    # integer width the caller happened to use.
    return range(lo, stop = hi, length = Int(npoints))
end
"""
`get_axis(column, axis_type::Symbol, compute_axis::Symbol; kwargs...)`

Default choice of axis, used when `compute_axis` is a `Symbol` rather than a callable:
`:discrete` gives `get_axis(column)` and `:continuous` gives `get_axis(column, 100)`.
Any other `axis_type` raises an error. The value of `compute_axis` and the keyword
arguments are not used by this method.
"""
function get_axis(column, axis_type::Symbol, compute_axis::Symbol; kwargs...)
    axis_type == :discrete && return get_axis(column)
    axis_type == :continuous && return get_axis(column, 100)
    return error("Unexpected axis_type: only :discrete and :continuous allowed!")
end
"""
`get_axis(column, axis_type::Symbol, compute_axis; kwargs...)`

Delegate the choice of axis to `compute_axis`, which is called as
`compute_axis(column, axis_type; kwargs...)`; its result is returned unchanged.
"""
function get_axis(column, axis_type::Symbol, compute_axis; kwargs...)
    return compute_axis(column, axis_type; kwargs...)
end
# Registries of the built-in analyses, keyed by name.
# `builtin_funcs` maps the name of an analysis to the function implementing it. Each of
# these functions takes the axis type, the points chosen on the x axis and the table of
# data, and returns a table holding the y value for the given x values.
# `builtin_axis` maps the name of an analysis to the function that chooses its x axis,
# for the analyses that define one.
builtin_funcs = Dict{Symbol, Function}(
    :locreg => _locreg,
    :density => _density,
    :cumulative => _cumulative,
    :hazard => _hazard,
)
builtin_axis = Dict(:density => _density_axis)