In [1]:
include("./src/EPIREAD.jl")

Main.EPIREAD

In [2]:
path = "./testing/test_epiread_small.bed"
big_bgz_path = "/Volumes/projects/laird/nathan/projects/julia/epiread/testing/test_epiread.bed.bgz"
part_path = "/Volumes/projects/laird/nathan/projects/julia/epiread/testing/part_epiread.bed"


"/Volumes/projects/laird/nathan/projects/julia/epiread/testing/part_epiread.bed"

In [3]:
# Find any reads in the cache that are too far to the left, or on a different chr than provided
function clean_record_cache!(cache::Vector{EPIREAD.Record}, max_end::Int64, new_chr::String)::Vector{EPIREAD.Record}
    reads_to_analyze = Vector{EPIREAD.Record}()
    for (i, record) in enumerate(cache)
        if (EPIREAD.chromend(record) < max_end) || (EPIREAD.chrom(record) ≠ new_chr)
            push!(reads_to_analyze, popat!(cache, i))
        end
    end
    return reads_to_analyze
end

function get_leftmost_chromend(cache::Vector{EPIREAD.Record})::Int64
    if length(cache) > 0
        return minimum(EPIREAD.chromend.(cache))
    else
        return Int64(0)
    end
end

get_leftmost_chromend (generic function with 1 method)

In [4]:
function parse_rle(rle::String)
	reps = parse.(Int16, map(x-> x == "" ? "1" : x, split(rle, r"[[:alpha:]]")[2:end]))
	bases = only.(split(replace(rle, r"[0-9]" => ""), ""))
	return zip(bases, reps)
end 

#Note that if we call EPIREAD.chromstart(record) to get the start position of the read, it is 1-based, end-inclusive (*on-base*). Try to match this here. 
function get_feature_pos_in_read(rle::String)
	J = Vector{Int64}()
	V = Vector{Char}()
	counter = 0

	for (base, rep) in parse_rle(rle)
		if occursin(base, "FPx") # nothing base
			counter += rep
		elseif occursin(base, "MUOSACGTN") # feature base
			for i in 1:rep
				counter += 1
				push!(J, copy(counter))
				push!(V, base)
			end
		elseif occursin(base, "acgtn") # insertion base
			# handle this better in the future, but for right now keep everything aligned with reference
		else #must be a deletion base
			@assert base == 'D'
			# handle this better in the future, but for right now keep everything aligned with reference
			counter += rep
		end
	end

	return (J, V)
end

function get_feature_pos_absolute(rle::String, read_start::Int64)
	J, V = get_feature_pos_in_read(rle)
	return (J.+read_start.-1, V) # we have to subtract one, as this is 1-start, fully closed
end

function update_vecs(master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char}, J::Vector{Int64}, V::Vector{Char})
	read_num = length(master_I) == 0 ? Int64(1) : master_I[end]+1
	append!(master_I, fill(read_num, length(J)))
	append!(master_J, J)
	append!(master_V, V)
	return nothing
end

update_vecs (generic function with 1 method)

In [28]:
function analyze_read(read::EPIREAD.Record, master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char})
	# println("analyzing read $(EPIREAD.name(read))/$(EPIREAD.readnum(read)) ($(EPIREAD.chrom(read))): $(EPIREAD.cg_rle(read))")
	J, V = get_feature_pos_absolute(EPIREAD.cg_rle(read), EPIREAD.chromstart(read))
	update_vecs(master_I, master_J, master_V, J, V)
	return nothing
end

function analyze_read(rle::String, chromstart::Int64, master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char})
	# println("analyzing read $(EPIREAD.name(read))/$(EPIREAD.readnum(read)) ($(EPIREAD.chrom(read))): $(EPIREAD.cg_rle(read))")
	J, V = get_feature_pos_absolute(rle, chromstart)
	update_vecs(master_I, master_J, master_V, J, V)
	return nothing
end

allsame(x) = all(y->y==first(x),x)

function are_overlapped(read1::EPIREAD.Record, read2::EPIREAD.Record)
	rightmost_start = maximum([EPIREAD.chromstart(read1), EPIREAD.chromstart(read2)])
	leftmost_end = minimum([EPIREAD.chromend(read1), EPIREAD.chromend(read2)])
	return leftmost_end ≥ rightmost_start # since were' in 1-based, fully-closed coords (aka on-base coords), reads are overlapped if the start coord of one equals the end coord of the other
end

# TODO! need to fix (doesn't handle case where reads overlap)
function analyze_record_pair(read1::EPIREAD.Record, read2::EPIREAD.Record, master_I::Vector{Int64}, master_J::Vector{Int64}, master_V::Vector{Char})
	if !allsame(EPIREAD.chrom.((read1, read2)))
		for each in (read1, read2)
			analyze_read(each, master_I, master_J, master_V)
		end
	else 
		if !are_overlapped(read1, read2)
			for each in (read1, read2)
				analyze_read(each, master_I, master_J, master_V)
			end
		else
			# handle overlapping reads
			rle, chromstart = merge_rle(read1, read2)
			analyze_read(rle, chromstart, master_I, master_J, master_V)
		end 
	end
end


analyze_record_pair (generic function with 1 method)

In [6]:
# Reads must be coordinate-sorted, so read pairs might not be adjacent in the file. 
# Need to set a parameter (insert size) to stop looking for mate pair of a read

#= 
Step through reads, one at a time. When we first see a read, we don't know if we'll find a mate for it. 
So we put it into the 'read cache' for a moment. We then keep reading. For each new read we find, we 
perform two checks:
1. if the chromend of the first (leftmost) read in the cache is more than (max_isize) bp away from the chromstart
of the read we're looking at, we are 'out of pairing range'. We take this opportunity to go through the cache
and purge any reads that are impossible to pair anymore, putting them in the 'analyze' queue. We also push our current read to the cache.
2. if the name of the read we're looking at matches the name of a read in the cache, we pull that read out and analyze it with the current read.
(also should check that chrom stays the same...!)
=# 





In [7]:
using SparseArrays


# TODO! Need to watch out for chromosome switching!!!
function test2(path; max_isize = 1000)

    I,J,V = Vector{Int64}(), Vector{Int64}(), Vector{Char}()

    record_cache = Vector{EPIREAD.Record}()
    reader = EPIREAD.Reader(path)

    # Pre-allocate record.
    record = EPIREAD.Record()
    i = 0
    last_chr = String("")
    while !eof(reader)
    # while i < 1000
        empty!(record)
        read!(reader, record)
        name = EPIREAD.name(record)
        chromstart = EPIREAD.chromstart(record)
        current_chr = EPIREAD.chrom(record)
        
        # Find the furthest-left *end* coordinate of all reads in the cache (if it is too far away from the beginning of this read, we'll never be able to pair to it)
        leftmost_chromend = get_leftmost_chromend(record_cache)
        # Test as above, and also check if we're on a new chr.
        if ( (leftmost_chromend < (chromstart - max_isize)) && (leftmost_chromend > 0) ) || ( (last_chr ≠ current_chr) && (last_chr ≠ "") )
            # If either of the above is true, get all the un-pair-able reads out of the cache and analyze them:
            # println("cleaning cache because min_end $leftmost_chromend is more than $max_isize away from $chromstart or because last-seen $(last_chr) is different from current $(EPIREAD.chrom(record)) (read $(name))")
            for each in clean_record_cache!(record_cache, chromstart - max_isize, EPIREAD.chrom(record))
                analyze_read(each, I, J, V)
            end
        end
        
        # Ok, any read still in the cache is fair game to pair with our current read. Let's see if we can find a match:
        cached_names = EPIREAD.name.(record_cache)
        if name in cached_names 
            pair_record = popat!(record_cache, findfirst(isequal(name), cached_names))
            analyze_record_pair(record, pair_record, I, J, V)

        # Ok, we couldn't find a match. Save this read for later (maybe its match just hasn't been read yet!)
        else
            push!(record_cache, copy(record))
        end
        i += 1
        last_chr = current_chr
    end
    println("Finished processing $(i) reads")
    # Finally, close the reader.
    close(reader)
    for each in record_cache
        analyze_read(each, I, J, V)
    end
    return sparse(I, J, V)
end


test2 (generic function with 1 method)

In [8]:
test2(path)

Found overlapping reads!! A00426:227:H7KMJDMXY:2:2205:19551:25081
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1331:6994:10034
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2205:14642:2456
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1437:4255:21261
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2351:20627:7091
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2305:21938:14121
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1309:16649:34272
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2435:11749:27320
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1103:3667:6026
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1161:19334:7326
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1146:5954:29121
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2265:27299:33270
Found overlapping reads!! A00426:227:H7KMJDMXY:2:2472:13096:1658
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1102:17318:18176
Found overlapping reads!! A00426:227:H7KMJDMXY:2:1184:28483:31313
Found overlapping r

16×634657 SparseMatrixCSC{Char, Int64} with 61 stored entries:
⠀⠀⠀⠀⠀⠀⠀⠀⠈⠀⠀⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠁⠀⠙

In [9]:
records = []

reader = EPIREAD.Reader(path)
for each in reader
	if EPIREAD.name(each) == "A00426:227:H7KMJDMXY:2:1452:8721:25426"
		push!(records, copy(each))
	end
end

records

2-element Vector{Any}:
 Main.EPIREAD.Record:
    chromosome: chr2
         start: 667868
           end: 668018
     name/read: A00426:227:H7KMJDMXY:2:1452:8721:25426/1
        strand: -
CpG RLE string: F3x145F3

 Main.EPIREAD.Record:
    chromosome: chr2
         start: 667916
           end: 668067
     name/read: A00426:227:H7KMJDMXY:2:1452:8721:25426/2
        strand: -
CpG RLE string: F104x44DF3


In [10]:
are_overlapped(records[1], records[2])

rle1 = EPIREAD.cg_rle(records[1])
rle2 = EPIREAD.cg_rle(records[2])

println(rle1, ", ", rle2)


F3x145F3, F104x44DF3


In [11]:
function expand_rle(rle::String)
	temp = Vector{Char}()
	for (base, rep) in parse_rle(rle)
		for i in 1:rep
			push!(temp, base)
		end
	end
	return(String(temp))
end

expand_rle (generic function with 1 method)

In [12]:
const lookup_matrix = [
	'F' 'F' 'x' 'M' 'U' 'O' 'S' 'A' 'C' 'G' 'T' 'N' 'X' 'X' 'X' 'X' 'X' 'X' 'F';
	'F' 'P' 'x' 'M' 'U' 'O' 'S' 'A' 'C' 'G' 'T' 'N' 'X' 'X' 'X' 'X' 'X' 'X' 'P';
	'x' 'x' 'x' 'M' 'U' 'O' 'S' 'A' 'C' 'G' 'T' 'N' 'X' 'X' 'X' 'X' 'X' 'X' 'x';
	'M' 'M' 'M' 'M' 'Q' '?' '?' 'Q' 'Q' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'M';
	'U' 'U' 'U' 'Q' 'U' '?' '?' 'Q' 'Q' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'U';
	'O' 'O' 'O' '?' '?' 'O' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'O';
	'S' 'S' 'S' '?' '?' 'Q' 'S' 'Q' 'Q' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'S';
	'A' 'A' 'A' 'Q' 'Q' 'Q' 'Q' 'A' 'Q' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'A';
	'C' 'C' 'C' 'Q' 'Q' 'Q' 'Q' 'Q' 'C' 'Q' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'C';
	'G' 'G' 'G' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'G' 'Q' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'G';
	'T' 'T' 'T' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'T' 'Q' 'X' 'X' 'X' 'X' 'X' 'X' 'T';
	'N' 'N' 'N' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'Q' 'N' 'X' 'X' 'X' 'X' 'X' 'X' 'N';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'a' 'Q' 'Q' 'Q' 'Q' 'X' 'a';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'Q' 'c' 'Q' 'Q' 'Q' 'X' 'c';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'Q' 'Q' 'g' 'Q' 'Q' 'X' 'g';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'Q' 'Q' 'Q' 't' 'Q' 'X' 't';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'Q' 'Q' 'Q' 'Q' 'n' 'X' 'n';
	'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'D' 'D';
	'F' 'P' 'x' 'M' 'U' 'O' 'S' 'A' 'C' 'G' 'T' 'N' 'a' 'c' 'g' 't' 'n' 'D' '_'
]

const lookup_str = "FPxMUOSACGTNacgtnD_"

"FPxMUOSACGTNacgtnD_"

In [13]:
function compare_bases(base1::Char, base2::Char)
	if base1==base2
		return base1
	else
		return lookup_matrix[findfirst(base1, lookup_str), findfirst(base2, lookup_str)]
	end
end

compare_bases (generic function with 1 method)

In [27]:
function merge_rle(rle1::String, rle2::String)
	merged = Vector{Char}()
	for (base1, base2) in zip(rle1, rle2)
		push!(merged, compare_bases(base1, base2))
	end
	return(String(merged))
end

function merge_rle(read1::EPIREAD.Record, read2::EPIREAD.Record)
	# We are only guaranteed that the reads overlap by ≥ 1 base. 
	left_read, right_read = (EPIREAD.chromstart(read1) ≤ EPIREAD.chromstart(read2)) ? (read1, read2) : (read2, read1)
	left_rle, right_rle = expand_rle.(EPIREAD.cg_rle.([left_read, right_read]))

	# We don't need to pad the left side of the left_read, but we might need to pad all 3 other sides:
	# We use the character '_' (underscore) as a pad.
	# First, line up the left sides:
	left_pad = EPIREAD.chromstart(right_read) - EPIREAD.chromstart(left_read)
	if left_pad > 0
		right_rle = "_"^left_pad*right_rle
	end
	# Now line up the right sides:
	long_rle, short_rle = length(right_rle) ≥ length(left_rle) ? (right_rle, left_rle) : (left_rle, right_rle)
	diff = length(long_rle) - length(short_rle)
	if diff > 0
		# pad right end of the shorter one
		short_rle = short_rle*"_"^abs(diff)
	end
	return (merge_rle(long_rle, short_rle), EPIREAD.chromstart(left_read))
end

merge_rle (generic function with 2 methods)