In [7]:
include("./src/EPIREAD.jl")

Main.EPIREAD

In [8]:
# Small test file; this is the path later passed to `test2` and `EPIREAD.Reader`.
path = "./testing/test_epiread_small.bed"
# Absolute paths to two further test inputs (a `.bed.bgz` file and a "part" `.bed` file).
# Neither is referenced again in the cells shown here.
big_bgz_path = "/Volumes/projects/laird/nathan/projects/julia/epiread/testing/test_epiread.bed.bgz"
part_path = "/Volumes/projects/laird/nathan/projects/julia/epiread/testing/part_epiread.bed"


"/Volumes/projects/laird/nathan/projects/julia/epiread/testing/part_epiread.bed"

In [9]:
"""
    clean_record_cache!(cache, max_end, new_chr) -> Vector{EPIREAD.Record}

Remove from `cache` every record that can no longer be paired, and return the removed
records in the order they had in `cache`.

A record is removed when its `EPIREAD.chromend` is less than `max_end` (it lies too far
to the left) or when its `EPIREAD.chrom` differs from `new_chr`. Records that satisfy
neither condition stay in `cache`, in their original relative order.

# Arguments
- `cache`: the cached records; mutated in place.
- `max_end`: records ending before this coordinate are removed.
- `new_chr`: records on any other chromosome are removed.
"""
function clean_record_cache!(cache::Vector{EPIREAD.Record}, max_end::Int64, new_chr::String)::Vector{EPIREAD.Record}
    # Decide first, mutate afterwards. Removing elements while iterating over `cache`
    # shifts the remaining ones left, so the record right after each removal would be
    # skipped and could linger in the cache.
    stale = findall(cache) do record
        (EPIREAD.chromend(record) < max_end) || (EPIREAD.chrom(record) ≠ new_chr)
    end
    reads_to_analyze = cache[stale]
    deleteat!(cache, stale)  # `findall` yields sorted, unique indices, as `deleteat!` requires
    return reads_to_analyze
end

"""
    get_leftmost_chromend(cache) -> Int64

Return the smallest `EPIREAD.chromend` among the records in `cache`.
An empty `cache` yields `0`.
"""
function get_leftmost_chromend(cache::Vector{EPIREAD.Record})::Int64
    # Guard clause: `minimum` has no answer for an empty collection.
    isempty(cache) && return Int64(0)
    return minimum(EPIREAD.chromend, cache)
end

get_leftmost_chromend (generic function with 1 method)

In [19]:
"""
    parse_rle(rle)

Split a run-length-encoded string into `(base, count)` pairs.

Each run is a single letter optionally followed by a decimal repeat count; a letter with
no count stands for a run of length 1. For example `"F3xM2"` yields `('F', 3)`,
`('x', 1)`, `('M', 2)`.

Returns a `zip` iterator over the letters (`Char`) and their counts (`Int`). An empty
`rle` yields an empty iterator.
"""
function parse_rle(rle::AbstractString)
    bases = Char[]
    reps = Int[]
    # One regex pass: a letter followed by zero or more digits.
    for m in eachmatch(r"([[:alpha:]])([0-9]*)", rle)
        push!(bases, only(m[1]))
        # Counts are parsed as `Int`: a single run can exceed 127, which would overflow
        # an `Int8`.
        push!(reps, isempty(m[2]) ? 1 : parse(Int, m[2]))
    end
    return zip(bases, reps)
end

"""
    get_feature_pos_in_read(rle) -> (J, V)

Walk the run-length-encoded string `rle` (as split by `parse_rle`) and return the
positions `J` of every feature base together with the bases themselves, `V`.

Positions are counted from 1 at the first base of the read. Original author's note:
`EPIREAD.chromstart(record)` is 1-based, end-inclusive (*on-base*), and this function
tries to match that convention.

How each base code is treated:
- `F`, `P`, `x`: advance the position, record nothing.
- `M`, `U`, `O`, `S`, `A`, `C`, `G`, `T`, `N`: feature bases; one `(position, base)`
  entry is recorded per repetition.
- `a`, `c`, `g`, `t`, `n`: insertion bases; currently ignored and do not advance the
  position, which keeps everything aligned with the reference.
- `D`: deletion; advances the position, records nothing.

# Throws
- `ArgumentError` if `rle` contains any other base code.
"""
function get_feature_pos_in_read(rle::String)
    J = Vector{Int64}()
    V = Vector{Char}()
    counter = 0

    for (base, rep) in parse_rle(rle)
        if base in "FPx" # nothing base
            counter += rep
        elseif base in "MUOSACGTN" # feature base
            append!(J, (counter + 1):(counter + rep))
            append!(V, fill(base, rep))
            counter += rep
        elseif base in "acgtn" # insertion base
            # handle this better in the future, but for right now keep everything
            # aligned with reference
        elseif base == 'D' # deletion base
            # handle this better in the future, but for right now keep everything
            # aligned with reference
            counter += rep
        else
            # Explicit check rather than `@assert`, which may be compiled out and would
            # then let unknown codes be silently counted as deletions.
            throw(ArgumentError("unrecognised base '$(base)' in RLE string \"$(rle)\""))
        end
    end

    return (J, V)
end

"""
    get_feature_pos_absolute(rle, read_start) -> (J, V)

Same as `get_feature_pos_in_read(rle)`, but with every position shifted so that in-read
position 1 coincides with `read_start`.
"""
function get_feature_pos_absolute(rle::String, read_start::Int64)
    rel_pos, feature_bases = get_feature_pos_in_read(rle)
    # In-read positions start at 1 (1-start, fully closed), hence the "- 1".
    abs_pos = rel_pos .+ read_start .- 1
    return (abs_pos, feature_bases)
end

"""
    update_vecs(master_I, master_J, master_V, J, V) -> nothing

Append one read's features to the three accumulator vectors, in place.

The read is given the row number `master_I[end] + 1`, or `1` when `master_I` is empty.
That number is appended to `master_I` once per element of `J`, while `J` and `V` are
appended to `master_J` and `master_V` unchanged.
"""
function update_vecs(master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char}, J::Vector{Int64}, V::Vector{Char})
    next_row = isempty(master_I) ? Int64(1) : last(master_I) + 1
    for _ in eachindex(J)
        push!(master_I, next_row)
    end
    append!(master_J, J)
    append!(master_V, V)
    return nothing
end

update_vecs (generic function with 1 method)

In [20]:
"""
    analyze_read(read, master_I, master_J, master_V) -> nothing

Print a one-line description of `read`, compute the absolute positions of its feature
bases from `EPIREAD.cg_rle(read)` and `EPIREAD.chromstart(read)`, and append them to the
three accumulator vectors via `update_vecs`.
"""
function analyze_read(read::EPIREAD.Record, master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char})
    println("analyzing read $(EPIREAD.name(read))/$(EPIREAD.readnum(read)) ($(EPIREAD.chrom(read))): $(EPIREAD.cg_rle(read))")
    positions, bases = get_feature_pos_absolute(
        EPIREAD.cg_rle(read),
        EPIREAD.chromstart(read),
    )
    update_vecs(master_I, master_J, master_V, positions, bases)
    return nothing
end

# True when every element of `x` equals its first element; an empty `x` counts as all-same.
allsame(x) = isempty(x) || all(==(first(x)), x)

"""
    analyze_record_pair(read1, read2[, master_I, master_J, master_V]) -> nothing

Handle two records that share a read name.

- If the two records lie on different chromosomes they are not treated as a pair: each
  one is passed to `analyze_read` on its own, and its features are appended to
  `master_I`, `master_J` and `master_V`.
- Otherwise only a message naming the pair is printed; no features are extracted for
  same-chromosome pairs yet.

# Arguments
- `read1`, `read2`: the two records.
- `master_I`, `master_J`, `master_V`: optional accumulator vectors, mutated in place.
  When omitted, fresh scratch vectors are used and the extracted features are discarded,
  so callers that want to keep the results must pass their own vectors.
"""
function analyze_record_pair(
    read1::EPIREAD.Record,
    read2::EPIREAD.Record,
    master_I::Vector{Int64} = Int64[],
    master_J::Vector{Int64} = Int64[],
    master_V::Vector{Char} = Char[],
)
    if EPIREAD.chrom(read1) != EPIREAD.chrom(read2)
        for each in (read1, read2)
            # `analyze_read` needs the accumulators; calling it with the record alone
            # raises a MethodError.
            analyze_read(each, master_I, master_J, master_V)
        end
    else
        println("analyzing read pair $(EPIREAD.name(read1))/$(EPIREAD.readnum(read1)), $(EPIREAD.name(read2))/$(EPIREAD.readnum(read2))")
    end
    return nothing
end


analyze_record_pair (generic function with 1 method)

In [21]:
# Reads must be coordinate-sorted, so read pairs might not be adjacent in the file. 
# Need to set a parameter (insert size) to stop looking for mate pair of a read

#= 
Step through reads, one at a time. When we first see a read, we don't know if we'll find a mate for it. 
So we put it into the 'read cache' for a moment. We then keep reading. For each new read we find, we 
perform two checks:
1. if the chromend of the first (leftmost) read in the cache is more than (max_isize) bp away from the chromstart
of the read we're looking at, we are 'out of pairing range'. We take this opportunity to go through the cache
and purge any reads that are impossible to pair anymore, putting them in the 'analyze' queue. We also push our current read to the cache.
2. if the name of the read we're looking at matches the name of a read in the cache, we pull that read out and analyze it with the current read.
(also should check that chrom stays the same...!)
=# 





In [22]:
"""
    test2(path; max_isize = 1000)

Stream the records of the file at `path` through a mate-pairing cache and return
`sparse(I, J, V)` built from the features of the reads that were analysed individually.

For every record read:
1. If the leftmost end coordinate in the cache is more than `max_isize` before the start
   of the current record, or the chromosome has changed since the previous record, the
   cache is purged with `clean_record_cache!` and each purged read is analysed on its own.
2. If a cached record has the same name as the current one, it is removed from the cache
   and handed to `analyze_record_pair` together with the current record. Otherwise a
   copy of the current record is cached.

Once the input is exhausted, whatever is still cached is analysed read by read.

# Keywords
- `max_isize`: maximum distance, in bp, over which a mate is still searched for.

NOTE(review): `analyze_record_pair` is called without the `I`/`J`/`V` accumulators, so
records matched as pairs contribute nothing to the returned matrix as the code stands.
NOTE(review): `sparse` is not defined in the cells shown; presumably it is
`SparseArrays.sparse`, brought into scope elsewhere in the session.
"""
function test2(path; max_isize = 1000)

    I, J, V = Vector{Int64}(), Vector{Int64}(), Vector{Char}()

    record_cache = Vector{EPIREAD.Record}()
    reader = EPIREAD.Reader(path)

    # Pre-allocate record; it is reused for every line, hence the `copy` when caching.
    record = EPIREAD.Record()
    i = 0
    last_chr = String("")
    try
        while !eof(reader)
            empty!(record)
            read!(reader, record)
            name = EPIREAD.name(record)
            chromstart = EPIREAD.chromstart(record)
            current_chr = EPIREAD.chrom(record)

            # Find the furthest-left *end* coordinate of all reads in the cache (if it is
            # too far away from the beginning of this read, we'll never be able to pair
            # to it). A value of 0 means the cache is empty.
            leftmost_chromend = get_leftmost_chromend(record_cache)
            # Test as above, and also check if we're on a new chr.
            if ( (leftmost_chromend < (chromstart - max_isize)) && (leftmost_chromend > 0) ) || ( (last_chr ≠ current_chr) && (last_chr ≠ "") )
                # If either of the above is true, get all the un-pair-able reads out of
                # the cache and analyze them:
                println("cleaning cache because min_end $leftmost_chromend is more than $max_isize away from $chromstart or because last-seen $(last_chr) is different from current $(EPIREAD.chrom(record)) (read $(name))")
                for each in clean_record_cache!(record_cache, chromstart - max_isize, current_chr)
                    analyze_read(each, I, J, V)
                    println(I)
                    println(J)
                    println(V)
                end
            end

            # Any read still in the cache is fair game to pair with the current read.
            # A single scan both answers "is there a mate?" and locates it.
            mate_idx = findfirst(cached -> isequal(EPIREAD.name(cached), name), record_cache)
            if mate_idx !== nothing
                pair_record = popat!(record_cache, mate_idx)
                analyze_record_pair(record, pair_record)
            else
                # No match. Save this read for later (maybe its mate just hasn't been
                # read yet!)
                push!(record_cache, copy(record))
            end
            i += 1
            last_chr = current_chr
        end
        println("Finished processing $(i) reads")
    finally
        # Close the reader even when parsing or analysis throws part-way through.
        close(reader)
    end

    # Reads left in the cache never found a mate; analyse them individually.
    for each in record_cache
        analyze_read(each, I, J, V)
    end
    return sparse(I, J, V)
end


test2 (generic function with 1 method)

In [23]:
# Run the pairing pass on the small test file that `path` points to.
test2(path)

analyzing read pair A00426:227:H7KMJDMXY:2:2233:29695:6605/2, A00426:227:H7KMJDMXY:2:2233:29695:6605/1
analyzing read pair A00426:227:H7KMJDMXY:2:2205:19551:25081/1, A00426:227:H7KMJDMXY:2:2205:19551:25081/2
analyzing read pair A00426:227:H7KMJDMXY:2:1331:6994:10034/2, A00426:227:H7KMJDMXY:2:1331:6994:10034/1
analyzing read pair A00426:227:H7KMJDMXY:2:2205:14642:2456/2, A00426:227:H7KMJDMXY:2:2205:14642:2456/1
analyzing read pair A00426:227:H7KMJDMXY:2:1230:12653:24126/1, A00426:227:H7KMJDMXY:2:1230:12653:24126/2
analyzing read pair A00426:227:H7KMJDMXY:2:1437:4255:21261/1, A00426:227:H7KMJDMXY:2:1437:4255:21261/2
analyzing read pair A00426:227:H7KMJDMXY:2:2351:20627:7091/1, A00426:227:H7KMJDMXY:2:2351:20627:7091/2
analyzing read pair A00426:227:H7KMJDMXY:2:2305:21938:14121/1, A00426:227:H7KMJDMXY:2:2305:21938:14121/2
analyzing read pair A00426:227:H7KMJDMXY:2:1309:16649:34272/1, A00426:227:H7KMJDMXY:2:1309:16649:34272/2
analyzing read pair A00426:227:H7KMJDMXY:2:2435:11749:27320/2, A0

BoundsError: BoundsError: attempt to access 0-element Vector{Int64} at index [0]

In [11]:
# Open the test file and read one record into a freshly allocated `Record` for inspection.
# NOTE(review): this reader is never closed in the cells shown.
reader = EPIREAD.Reader(path)
record = EPIREAD.Record()
read!(reader, record)

Main.EPIREAD.Record:
    chromosome: chr1
         start: 10055
           end: 10203
     name/read: A00426:227:H7KMJDMXY:2:2233:29695:6605/1
        strand: +
CpG RLE string: F3x49ax24Fx12Fx5Fx3cxF2x16Fx10FxFx3FxFx5Fx2F3


parse_rle (generic function with 1 method)

get_feature_pos_in_read (generic function with 2 methods)

In [101]:


# Example: the lowercase insertion bases (a, c, t) do not advance the position counter,
# so the three M's directly follow the 53 non-feature bases and land on 54, 55 and 56.
get_feature_pos_in_read("F3x50actM3x20Ux10F3")


([54, 55, 56, 77], ['M', 'M', 'M', 'U'])

In [89]:
# Example: with a read start of 5, the M at in-read position 4 maps to absolute position 8.
J, V = get_feature_pos_absolute("F3MF3", 5)


([8], ['M'])

In [99]:
# Sanity check: build a sparse matrix holding a single Char value ('M' at row 1, column 16).
sparse([1], [16], ['M'])

1×16 SparseMatrixCSC{Char, Int64} with 1 stored entry:
⠀⠀⠀⠀⠀⠀⠀⠈