## Analysis code from OCTA-tool

Notebook for experimenting with how to do the analysis

### Loading data

In [1]:
using LibExpat
# C/P
function get_unknown_barcodes(stats_dir, lane)
    summary_path = joinpath(stats_dir, "DemuxSummaryF1L$(lane).txt")
    unknown_barcode_count = Dict()
    open(summary_path) do f
        found_unknown_bc_section = false
        for line = eachline(f)
            if found_unknown_bc_section
                if ! startswith(line, "###")
                    (barcode, count_str) = split(line, "\t")
                    unknown_barcode_count[barcode] = parse(Int64, count_str)
                end
            elseif strip(line) == "### Most Popular Unknown Index Sequences"
                found_unknown_bc_section = true
            end
        end
    end
    return unknown_barcode_count
end

function get_demultiplexing_stats(stats_dir)
    xml_path = joinpath(stats_dir, "DemultiplexingStats.xml")
    xmltree = xp_parse(readall(xml_path))
    mismatch_xpath = "Flowcell/Project[@name != \"all\"]/Sample/Barcode[@name != \"all\"]/Lane/BarcodeCount"
    mismatch_results = xmltree[mismatch_xpath]
    perfect_xpath = "Flowcell/Project[@name != \"all\"]/Sample/Barcode[@name != \"all\"]/Lane/PerfectBarcodeCount"
    perfect_results = xmltree[perfect_xpath]

    read_counts = Dict()
    perfect_read_counts = Dict()

    for res = mismatch_results
        lane = parse(Int,res.parent.attr["number"])
        barcode = res.parent.parent.attr["name"]
        
        read_counts[(lane, barcode)] = parse(Int64, res.elements[1])
    end
    return read_counts, perfect_read_counts
end


get_demultiplexing_stats (generic function with 1 method)

In [35]:
# Adapted
stats_dir = "/home/fa2k/nsc/statfiles/170113_NB501273_0042_AHTKM2BGXY/Data/Intensities/BaseCalls/Stats"

lane = 1

(read_counts, perfect_read_counts) = get_demultiplexing_stats(stats_dir)
filtered_read_counts = filter((k, v) -> (k[1] == lane && k[2] != "unknown"), read_counts)
sample_indexes =Dict(
                [split(replace(barcode, "+", "-"), "-") => read_count
                    for ((_,barcode), read_count) in filtered_read_counts]
                )

unknown_barcodes_raw = get_unknown_barcodes(stats_dir, lane)
unknown_indexes = Dict(map(x -> split(replace(x[1], "+", "-"), "-") => x[2],
                        unknown_barcodes_raw))

unknown_total_reads = read_counts[(lane, "unknown")]

4652858

## Analysis

### Determine possible index combinations (sample)

In [3]:
first(keys(sample_indexes))

2-element Array{SubString{ASCIIString},1}:
 "ATTACTCG"
 "TCAGAGCC"

In [4]:
first(zip(keys(sample_indexes)...))

("ATTACTCG","ATTACTCG","GAGATTCC","ATTACTCG","GAGATTCC","CGCTCATT","TCCGGAGA","ATTACTCG","ATTACTCG","GAATTCGT","CGCTCATT","ATTACTCG","TCCGGAGA","GAATTCGT","GAGATTCC","GAATTCGT","CGCTCATT","GAGATTCC","GAATTCGT","GAATTCGT","GAGATTCC","TCCGGAGA","ATTCAGAA","GAATTCGT","ATTCAGAA","CGCTCATT","GAGATTCC","ATTACTCG","ATTCAGAA","CGCTCATT","GAATTCGT","ATTCAGAA","TCCGGAGA","TCCGGAGA","TCCGGAGA","GAGATTCC","ATTCAGAA","TCCGGAGA","TCCGGAGA","ATTCAGAA","CGCTCATT","ATTCAGAA","CGCTCATT","GAATTCGT","ATTCAGAA","GAGATTCC","CGCTCATT","ATTACTCG")

In [5]:
sample_single_indexes = map(Set, zip(keys(sample_indexes)...)) # Indexes 1 and 2
sample_single_indexes[1]

Set(SubString{ASCIIString}["GAATTCGT","ATTCAGAA","GAGATTCC","ATTACTCG","CGCTCATT","TCCGGAGA"])

In [6]:
combinations = [(i1, i2) for i1 in sample_single_indexes[1], i2 in sample_single_indexes[2]]

6x8 Array{Tuple{Any,Any},2}:
 ("GAATTCGT","GTCAGTAC")  ("GAATTCGT","TCAGAGCC")  …  ("GAATTCGT","AGGCTATA")
 ("ATTCAGAA","GTCAGTAC")  ("ATTCAGAA","TCAGAGCC")     ("ATTCAGAA","AGGCTATA")
 ("GAGATTCC","GTCAGTAC")  ("GAGATTCC","TCAGAGCC")     ("GAGATTCC","AGGCTATA")
 ("ATTACTCG","GTCAGTAC")  ("ATTACTCG","TCAGAGCC")     ("ATTACTCG","AGGCTATA")
 ("CGCTCATT","GTCAGTAC")  ("CGCTCATT","TCAGAGCC")     ("CGCTCATT","AGGCTATA")
 ("TCCGGAGA","GTCAGTAC")  ("TCCGGAGA","TCAGAGCC")  …  ("TCCGGAGA","AGGCTATA")

### Is analysis feasible (any unused combinations)?

In [7]:
length(sample_indexes)

48

In [8]:
length(combinations)

48

In [9]:
feasible = length(combinations) > length(sample_indexes)

false

Note: if we can look at indexes with one known index and one unknown index, it is still possible to analyse data where all possible index combinations have been used. This will depend on the nature of the mis-assignment effect, and the results are not necessarily comparable:

### Find number of reads per single (#1 or #2) index

In [32]:
first(keys(sample_indexes)) == ["ATTACTCG", "TCAGAGCC"]

true

In [37]:
first(unknown_indexes)

Pair{Any,Any}(SubString{ASCIIString}["CGCTCATT","GTCATACG"],993)

In [23]:
index_seq = nothing
for t in sample_single_indexes[1]
    index_seq = t # Unable to index into the set directly
    break
end
index_seq

"GAATTCGT"

In [36]:
in_known_samples = 0
in_misassigned= 0
in_unknown = 0
for other_index_seq = sample_single_indexes[2]
    in_known_samples += get(sample_indexes, [index_seq, other_index_seq], 0)
    in_misassigned += get(unknown_indexes, [index_seq, other_index_seq], 0)
end
(in_known_samples, in_misassigned)

(16778026,0)

In [11]:
single_index = [[],[]]

for i_fixed, i_count = [(1,2), (2,1)]
    
    push!(single_index[i_fixed], '')
end



LoadError: LoadError: syntax: invalid iteration specification
while loading In[11], in expression starting on line 3

 in depwarn at ./deprecated.jl:73
while loading In[11], in expression starting on line 1
