# GOR.jl - Development WIP

In [96]:
using GOR
using DataFrames

left = GorFile(GOR.pkgpath("test", "left.gor"))



<GorFile: /Users/florian/projects/GOR.jl/src/../test/left.gor>

The idea is to develop functions that work on iterators of genome-sorted NamedTuples. The output should be Tables.jl compatible.

The simplest attempt is to work with generators

In [97]:
"""
    select(stream, cols)

Lazily project each element of `stream` through `NamedTuple{cols}`.

Nothing is evaluated until the returned iterator is consumed; every element it yields
is the result of calling `NamedTuple{cols}(row)` on the corresponding input row.
"""
function select(stream, cols)
    return Iterators.map(stream) do row
        NamedTuple{cols}(row)
    end
end
"""
    where(stream, pred)

Lazily keep only the elements of `stream` for which `pred(element)` is `true`.

Elements are passed through unchanged; `pred` is called once per element as the
returned iterator is consumed.
"""
where(stream, pred) = (row for row in stream if pred(row))
"""
    mutate(stream, func)

Lazily extend each element of `stream` with the fields computed by `func`.

For every `row`, the yielded value is `merge(row, func(row))`, so fields returned by
`func` are appended to the row and replace fields of the same name.
"""
function mutate(stream, func)
    return Iterators.map(stream) do row
        merge(row, func(row))
    end
end

mutate (generic function with 1 method)

In order to be Tables.jl compatible, we need to work with structs with associated iterators, though.

Let's try out some things that could make it easier to program, now that Julia is on version 1.8

In [98]:
n = (a = 1, b = (c = 3,))

(a = 1, b = (c = 3,))

In [99]:
n.b

(c = 3,)

In [100]:
Base.return_types(x -> (a = 2,c = 3))

1-element Vector{Any}:
 NamedTuple{(:a, :c), Tuple{Int64, Int64}}

In [101]:
Base.merge((a = 1, b = 2), (a = 3, c = 4))

(a = 3, b = 2, c = 4)

Base.return_types works differently for Tuples and NamedTuples when a Union type is used

In [102]:
rt = Base.return_types(x -> (a=2x,b=x+1), (Union{Missing, Int64},))

1-element Vector{Any}:
 NamedTuple{(:a, :b), _A} where _A<:Tuple{Union{Missing, Int64}, Union{Missing, Int64}}

In [103]:
rt[1] <: NamedTuple

true

In [104]:
typeof(rt[1])

UnionAll

To get the names, use

In [105]:
rt[1].body.parameters[1]

(:a, :b)

and to get the type, use

In [106]:
rt[1].var.ub

Tuple{Union{Missing, Int64}, Union{Missing, Int64}}

In [107]:
rt = Base.return_types(x -> (a=2x,b=x+1), (Int64,))

1-element Vector{Any}:
 NamedTuple{(:a, :b), Tuple{Int64, Int64}}

In [108]:
rt[1] <: NamedTuple

true

In [109]:
typeof(rt[1])

DataType

In [110]:
rt[1].parameters[1]

(:a, :b)

In [111]:
rt[1].parameters[2]

Tuple{Int64, Int64}

This lets us define a function

In [112]:
# Fully parameterised result: names and field types are the two type parameters.
_namedtupletype(rt::DataType) = NamedTuple{rt.parameters[1], rt.parameters[2]}

# Result of the form `NamedTuple{names, T} where T<:Tuple{...}`: take the names from
# the body and use the upper bound of the type variable as the field types.
# NOTE(review): only this single-type-variable shape is handled; other `UnionAll`
# shapes are not covered here.
_namedtupletype(rt::UnionAll) = NamedTuple{rt.body.parameters[1], rt.var.ub}

# Anything else that is `<: NamedTuple` (e.g. a `Union`) cannot be rebuilt here.
_namedtupletype(rt) = error("Unknown type")

"""
    returntype(func, intype)

Return the `NamedTuple` type that `func` is inferred to produce for the argument types
`intype` (a tuple of types, as accepted by `Base.return_types`).

Only the first entry reported by `Base.return_types` is considered. When that entry is a
`UnionAll` of the form `NamedTuple{names, T} where T<:Tuple{...}`, the upper bound of `T`
is substituted so that the result is a plain `NamedTuple{names, Tuple{...}}` type.

# Throws
- `ArgumentError` if no method of `func` matches `intype`, or if the inferred return
  type is not a subtype of `NamedTuple`.
- `ErrorException` if the inferred type is a `NamedTuple` subtype of an unsupported kind.
"""
function returntype(func, intype)
    inferred = Base.return_types(func, intype)
    isempty(inferred) &&
        throw(ArgumentError("no method of func matches argument types $intype"))
    rt = first(inferred)

    # Explicit check instead of `@assert`: this validates caller input, and
    # assertions may be disabled.
    rt <: NamedTuple ||
        throw(ArgumentError("func needs to return a NamedTuple, inferred $rt"))

    return _namedtupletype(rt)
end

returntype (generic function with 1 method)

In [113]:
rt = returntype(r -> (a = r*2, b = r+1), (Int64,))

NamedTuple{(:a, :b), Tuple{Int64, Int64}}

In [114]:
returntype(r -> (a = r*2, b = r+1), (Union{Missing,Int64},))


NamedTuple{(:a, :b), Tuple{Union{Missing, Int64}, Union{Missing, Int64}}}

In [115]:
returntype(r -> (a = r.x*2, b = r.x+1), (NamedTuple{(:x,), Tuple{Int64}},))

NamedTuple{(:a, :b), Tuple{Int64, Int64}}

In [116]:
returntype(r -> (a = r.x*2, b = r.x+1), (NamedTuple{(:x,), Tuple{Union{Missing,Int64}}},))

NamedTuple{(:a, :b), Tuple{Union{Missing, Int64}, Union{Missing, Int64}}}

In [117]:
returntype( (x,y) -> Base.merge(x,y), (typeof((a = 2, b = 3)), NamedTuple{(:x,), Tuple{Union{Missing,Int64}}}))

NamedTuple{(:a, :b, :x), Tuple{Int64, Int64, Union{Missing, Int64}}}

In [118]:
returntype( x -> Base.merge(x, (c = x.a + 3,)), (typeof((a = 2, b = 3)),))

NamedTuple{(:a, :b, :c), Tuple{Int64, Int64, Int64}}

In [119]:
Base.return_types(x -> (a = x.Chrom, b = x.Pos + 123), (eltype(left),))

1-element Vector{Any}:
 NamedTuple{(:a, :b), Tuple{InlineStrings.String7, Int64}}

In [120]:
"""
    myselect2(iter)

Lazily map each row of `iter` to a two-field `NamedTuple`: `a` holds the row's `Chrom`
property and `b` holds its `Pos` property.
"""
function myselect2(iter)
    return Iterators.map(iter) do row
        (a = row.Chrom, b = row.Pos)
    end
end

myselect2 (generic function with 1 method)

In [121]:
Base.return_types(myselect2, (typeof(left),))

1-element Vector{Any}:
 Base.Generator{GOR.GorFileIter{Tables.NamedTupleIterator{Tables.Schema{(:Chrom, :Pos, :Val), Tuple{InlineStrings.String7, Int64, InlineStrings.String7}}, CSV.Rows{Vector{UInt8}, Tuple{}, Any, WeakRefStrings.PosLenString}}}, var"#121#122"}

In [122]:
eltype(left |> GOR.select(:Chrom))

NamedTuple{(:Chrom,), Tuple{InlineStrings.String7}}

In [123]:
DataFrame(a = 1:3, b = 4:6) |> Tables.namedtupleiterator |> GOR.mutate(r -> (Value = r.a*2,)) #|> top(1) |> DataFrame

GOR.Map{Tables.NamedTupleIterator{Tables.Schema{(:a, :b), Tuple{Int64, Int64}}, Tables.RowIterator{NamedTuple{(:a, :b), Tuple{Vector{Int64}, Vector{Int64}}}}}, GOR.var"#57#58"{var"#123#124"}, NamedTuple{(:a, :b, :Value), Tuple{Int64, Int64, Int64}}}(Tables.NamedTupleIterator{Tables.Schema{(:a, :b), Tuple{Int64, Int64}}, Tables.RowIterator{NamedTuple{(:a, :b), Tuple{Vector{Int64}, Vector{Int64}}}}}(Tables.RowIterator{NamedTuple{(:a, :b), Tuple{Vector{Int64}, Vector{Int64}}}}((a = [1, 2, 3], b = [4, 5, 6]), 3)), GOR.var"#57#58"{var"#123#124"}(var"#123#124"()))