In [None]:
%matplotlib inline  

from collections import Counter
import numpy as np
from matplotlib import pyplot as plt

from read_seq_data import parse_reads_file_format7B
from models import EventType

from constants import BARCODE_V7, NUM_BARCODE_V7_TARGETS

"""
A descriptive analysis of data from the first fish.
"""

In [None]:
# Offset added to the per-target window bounds in the deletion summary cell
# (START_BASE + BARCODE_SPACER_LEN * target_i).
# NOTE(review): presumably the base position where the first target begins -- confirm.
START_BASE = 120
# Stride between consecutive per-target windows in that same computation.
# NOTE(review): presumably the spacing between target starts in bp -- confirm.
BARCODE_SPACER_LEN = 27
# Half of this value is the cutoff for "short" deletions in the double-deletion cell.
# NOTE(review): presumably the target length in bp -- confirm.
TARGET_LEN = 23

In [None]:
# Load the reads file into `cell_reads`, which the cells below analyze.
reads_path = "../../data/fish_7B_UMI_collapsed_reads.txt"
cell_reads = parse_reads_file_format7B(reads_path)

In [None]:
"""
Really basic data characteristics
"""
# `num_cells` is kept as a notebook-level name: the organ breakdown divides by it.
num_cells = len(cell_reads.all_barcodes)
num_uniq_alleles = len(cell_reads.uniq_barcodes)
print("# of cells:", num_cells)
print("# of unique barcodes (alleles):", num_uniq_alleles)

In [None]:
"""
Organ data distribution
"""
# Bucket every barcode by the organ it is tagged with.
organ_barcode_dict = {}
for bcode in cell_reads.all_barcodes:
    organ_barcode_dict.setdefault(bcode.organ, []).append(bcode)

# Per organ: cell count, share of all cells, and number of distinct event signatures
# (a signature is the "."-joined string form of the barcode's unique events).
for organ in organ_barcode_dict:
    organ_bcodes = organ_barcode_dict[organ]
    num_organ_cells = len(organ_bcodes)
    organ_pct = (100 * num_organ_cells) / num_cells
    uniq_bcodes = {
        ".".join(str(evt) for evt in bcode.uniq_events)
        for bcode in organ_bcodes
    }
    print("# of %s cells: %d (%f%%)" % (organ, num_organ_cells, organ_pct))
    print("  # of unique barcodes:", len(uniq_bcodes))
        

In [None]:
# Keep one event object per distinct insertion id (the first one encountered).
uniq_inserts = set()
uniq_insert_strs = set()
for bcode in cell_reads.uniq_barcodes:
    for evt in bcode.uniq_events:
        if evt.event_type == EventType.INSERT and evt.get_str_id() not in uniq_insert_strs:
            uniq_insert_strs.add(evt.get_str_id())
            uniq_inserts.add(evt)

# Pair every insertion (repeats across barcodes included) with the index of the
# target it is listed under; de-duplication happens later, per target.
all_insert_target_pos = np.array([
    (target_i, evt)
    for bcode in cell_reads.uniq_barcodes
    for target_i, target_evts in enumerate(bcode.events)
    for evt in target_evts
    if evt.event_type == EventType.INSERT
])

In [None]:
"""
Interpretation: There aren't that many insertion locations possible
"""
# Count the event ids whose string form marks them as insertions.
is_insert_id = [evt_id.startswith("EventType.INSERT") for evt_id in cell_reads.event_str_ids]
num_insert_evt = np.sum(is_insert_id)
print("# unique insertion events:", num_insert_evt)

# Start positions of the unique insertions; `insert_pos` is reused by the histogram cell.
insert_pos = [ins_evt.start_pos for ins_evt in uniq_inserts]
num_uniq_insert_start = len(np.unique(insert_pos))
print("# unique insertion positions:", num_uniq_insert_start)

In [None]:
"""
A summary table of the length of unique insertion events

Interpretation: There is actually a wide range of insertion lengths (up to 48 bp!).
Most insertions are short. However some insertion lengths seem to be more favorable.
Even though we only look at unique insertions, there are a large number of
unique insertions that are all length 12, 15, 20, and 23.
"""
# Tally how many unique insertion events have each length; the counter is the cell output.
insert_len = [ins_evt.event_len for ins_evt in uniq_inserts]
insert_len_counts = Counter(insert_len)
insert_len_counts

In [None]:
"""
Where are things inserted for each target, what is the usual position, what is the usual length?
The numbers below are median (min, max)

Interpretation: Looks like insertion location is not always at the cut site. It could be a little left
of the cut site due to simultaneous deletions maybe?
"""
# One accumulator per target: count, start positions and lengths of the unique
# insertion events listed under that target. 'uniq_strs' holds the ids already
# recorded for the target so each event is counted once per target.
insert_target_summary = [
    {'count': 0, 'locations': [], 'lengths': [], 'uniq_strs': set()}
    for _ in range(NUM_BARCODE_V7_TARGETS)
]
for target_i, evt in all_insert_target_pos:
    target_summary = insert_target_summary[target_i]
    evt_id = evt.get_str_id()
    if evt_id in target_summary['uniq_strs']:
        # Already recorded for this target.
        continue
    target_summary['uniq_strs'].add(evt_id)
    target_summary["count"] += 1
    target_summary["locations"].append(evt.start_pos)
    target_summary["lengths"].append(evt.event_len)

# Report median (min, max) per target. np.min/np.max raise ValueError on an
# empty sequence, so a target without any insertion is reported as "n/a"
# instead of aborting the whole cell.
for target_i, target_dict in enumerate(insert_target_summary):
    print("Target", target_i)
    print("  Count:", target_dict['count'])
    for label, values in (("Location", target_dict['locations']), ("Lengths", target_dict['lengths'])):
        if values:
            print("  %s:" % label, np.median(values), "(", np.min(values), ",", np.max(values), ")")
        else:
            print("  %s:" % label, "n/a")

In [None]:
"""
Plot: histogram of insertion positions

Interpretation: Most insertion positions are centered around the cutting locations.
The cut locations are almost equally spaced apart, with some jitter. (This is just a more visual
plot of the list above.)
"""
# Histogram of the start positions of the unique insertion events, with
# log-scaled counts. Axis labels are set so the figure stands alone, and
# plt.show() keeps the raw (counts, bins, patches) tuple out of the cell output.
fig, ax = plt.subplots()
ax.hist(insert_pos, bins=50, log=True)
ax.set_xlabel("Insertion start position")
ax.set_ylabel("# unique insertion events (log scale)")
ax.set_title("Start positions of unique insertion events")
plt.show()

In [None]:
"""
Understanding simultaneous deletions and insertions (those that occur in the same position)

Interpretation: There are 10/59 insertion events that overlap with a deletion event on the same barcode.
My guess is that the two happened at the same time.
However I'm a bit surprised that we don't have simultaneous deletion and insertion more often.
"""
# An insertion and a deletion in the same target are treated as simultaneous
# when the deletion starts one base after the insertion (start_pos + 1).
# Each matching pair is recorded as "<insert id>--<delete id>".
#
# The pair recorded is the pair that actually overlaps. Previously the first
# insertion and the first deletion of the target were recorded even when other
# events were the overlapping ones, and targets with more than one overlapping
# position were skipped entirely.
simult_del_ins = set()
for b in cell_reads.uniq_barcodes:
    for target_evts in b.events:
        insert_evts = [evt for evt in target_evts if evt.event_type == EventType.INSERT]
        delete_evts = [evt for evt in target_evts if evt.event_type == EventType.DELETE]
        for ins_evt in insert_evts:
            for del_evt in delete_evts:
                if ins_evt.start_pos + 1 == del_evt.start_pos:
                    simult_del_ins.add(ins_evt.get_str_id() + "--" + del_evt.get_str_id())
print("# simultaneous delete and insert:", len(simult_del_ins))
for del_in in simult_del_ins:
    print(del_in)

In [None]:
"""
Understanding complex events involving insertions.
Complex events mean there are more than two events in a single target.

Interpretation: There are actually plenty of insertion events without a deletion event.
There are also insertion events with non-overlapping deletion events. I'm not sure why these occur --
is it possible that when we repair the barcode, we actually have a double-strand break in some place and
we patch in a couple of the same bases AND insert some new bases?
There are also double insertions in the same target.
If there are multiple events in the same target, usually the events occur close to each other.
It seems like some of these events actually occurred simultaneously, even though we're processing as separate
events right now.
"""
# Three sets of per-target event signatures ("--"-joined event ids), all
# restricted to targets that contain at least one insertion:
#   inserts_no_dels_same_target  -- the target has no deletion at all
#   many_inserts_same_target     -- the target has two or more insertions
#   del_ins_no_simul_same_target -- no deletion starts one base after an insertion
# NOTE(review): the third set also contains targets with no deletions at all
# (it is a superset of the first); confirm that is the intended count.
many_inserts_same_target = set()
inserts_no_dels_same_target = set()
del_ins_no_simul_same_target = set()
for b in cell_reads.uniq_barcodes:
    for target_evts in b.events:
        insert_posns = [evt.start_pos for evt in target_evts if evt.event_type == EventType.INSERT]
        if not insert_posns:
            # Targets without insertions cannot contribute to any of the sets.
            continue
        delete_posns = {evt.start_pos for evt in target_evts if evt.event_type == EventType.DELETE}
        # Built once and shared by all three sets.
        target_evt_str = "--".join(evt.get_str_id() for evt in target_evts)
        if not delete_posns:
            inserts_no_dels_same_target.add(target_evt_str)
        if len(insert_posns) >= 2:
            many_inserts_same_target.add(target_evt_str)
        # An insertion overlaps a deletion that starts at insertion start_pos + 1.
        if delete_posns.isdisjoint(pos + 1 for pos in insert_posns):
            del_ins_no_simul_same_target.add(target_evt_str)

print("# insert in a target without any deletions:", len(inserts_no_dels_same_target))
print("# 2+-insertions in a target:", len(many_inserts_same_target))
print("# inserts in a target without overlapping deletions:", len(del_ins_no_simul_same_target))

In [None]:
# Keep one event object per distinct deletion id (the first one encountered).
uniq_deletes = set()
uniq_delete_strs = set()
for bcode in cell_reads.uniq_barcodes:
    for evt in bcode.uniq_events:
        if evt.event_type == EventType.DELETE and evt.get_str_id() not in uniq_delete_strs:
            uniq_delete_strs.add(evt.get_str_id())
            uniq_deletes.add(evt)

# Pair every deletion (repeats across barcodes included) with the index of the
# target it is listed under; filtering and de-duplication happen in the summary cell.
all_delete_target_pos = np.array([
    (target_i, evt)
    for bcode in cell_reads.uniq_barcodes
    for target_i, target_evts in enumerate(bcode.events)
    for evt in target_evts
    if evt.event_type == EventType.DELETE
])

In [None]:
"""
Counting deletion events and positions

Interpretation: There are quite a lot of deletion positions possible. The total barcode is only 270 long.
We are starting deletes from almost every position.
"""
# Count the event ids whose string form marks them as deletions.
is_delete_id = [evt_id.startswith("EventType.DELETE") for evt_id in cell_reads.event_str_ids]
num_delete_evt = np.sum(is_delete_id)
print("# unique delete events:", num_delete_evt)

# Start positions of the unique deletions; `delete_pos` is reused by the histogram cell.
delete_pos = [del_evt.start_pos for del_evt in uniq_deletes]
num_uniq_delete_start = len(np.unique(delete_pos))
print("# unique deletion positions:", num_uniq_delete_start)

In [None]:
"""
Histogram of the lengths of deletions

Interpretation: Wide range of deletion lengths possible, though more often we delete short lengths.
We even have deletions of 222 bp.
"""
# Lengths of the unique deletion events, shown as a labeled histogram.
# plt.show() keeps the raw (counts, bins, patches) tuple out of the cell output.
del_len = [del_evt.event_len for del_evt in uniq_deletes]
fig, ax = plt.subplots()
ax.hist(del_len, bins=20)
ax.set_xlabel("Deletion length (bp)")
ax.set_ylabel("# unique deletion events")
ax.set_title("Lengths of unique deletion events")
plt.show()

In [None]:
"""
Plot: histogram of deletion start locations

Picture: deletion start locations for each unique deletion event.
You can spot 10 peaks, but it's a bit difficult. Deletions can start anywhere!
"""
# Histogram of the start positions of the unique deletion events. Axis labels
# are set so the figure stands alone, and plt.show() keeps the raw
# (counts, bins, patches) tuple out of the cell output.
fig, ax = plt.subplots()
ax.hist(delete_pos, bins=100)
ax.set_xlabel("Deletion start position")
ax.set_ylabel("# unique deletion events")
ax.set_title("Start positions of unique deletion events")
plt.show()

In [None]:
"""
Where are things deleted for each target, what is the usual position, what is the usual length?
The numbers below are median (min, max)

Interpretation: The median position for the deletion for each target is pretty evenly spaced.
Median deletion length is actually quite high!
"""
# One accumulator per target: count, start positions and lengths of the unique
# deletion events that START inside the target's window. 'uniq_strs' holds the
# ids already recorded for the target so each event is counted once per target.
del_target_summary = [
    {'count': 0, 'locations': [], 'lengths': [], 'uniq_strs': set()}
    for _ in range(NUM_BARCODE_V7_TARGETS)
]
for target_i, evt in all_delete_target_pos:
    # Window of target i: [START_BASE + SPACER * i, START_BASE + SPACER * (i + 1)],
    # inclusive on both ends.
    # NOTE(review): a start position exactly on a shared boundary satisfies the
    # windows of two neighbouring targets -- confirm the inclusive upper bound is intended.
    window_start = START_BASE + BARCODE_SPACER_LEN * target_i
    window_end = START_BASE + BARCODE_SPACER_LEN * (target_i + 1)
    if not (window_start <= evt.start_pos <= window_end):
        continue
    target_summary = del_target_summary[target_i]
    evt_id = evt.get_str_id()
    if evt_id in target_summary['uniq_strs']:
        # Already recorded for this target.
        continue
    target_summary['uniq_strs'].add(evt_id)
    target_summary["count"] += 1
    target_summary["locations"].append(evt.start_pos)
    target_summary["lengths"].append(evt.event_len)

# Report median (min, max) per target. np.min/np.max raise ValueError on an
# empty sequence, so a target with no deletion starting in its window is
# reported as "n/a" instead of aborting the whole cell.
for target_i, target_dict in enumerate(del_target_summary):
    print("Target", target_i)
    print("  Count:", target_dict['count'])
    for label, values in (("Location", target_dict['locations']), ("Lengths", target_dict['lengths'])):
        if values:
            print("  %s:" % label, np.median(values), "(", np.min(values), ",", np.max(values), ")")
        else:
            print("  %s:" % label, "n/a")

In [None]:
"""
Plot: deletion length histogram for deletion events that start at this target.

Interpretation: The deletion length profiles vary across the targets.
The deletion lengths can vary widely, some going up to 200+ bp.
Shorter deletions are preferred.
"""
# One stacked subplot per target, each showing the lengths collected for that
# target in `del_target_summary`.
plt.figure(figsize=(8, 14))
for target_i, target_dict in enumerate(del_target_summary):
    print("Target", target_i)
    target_ax = plt.subplot(NUM_BARCODE_V7_TARGETS, 1, target_i + 1)
    target_ax.hist(target_dict['lengths'], bins=30)

In [None]:
"""
How often do intertarget deletions occur?
e.g. do targets 0 and 9 often get cut together?

Interpretation: most deletions are in the same target. The next most common deletions occur across two targets.
There are very few very long inter-target deletions.
"""
# intertarget_pairs[i, j] counts the unique deletion events whose lowest listed
# target index is i and highest listed target index is j (i == j means the
# deletion is listed under a single target). Each deletion id is counted once,
# for the first barcode in which it is encountered.
uniq_deletion_strs = set()
intertarget_pairs = np.zeros((NUM_BARCODE_V7_TARGETS, NUM_BARCODE_V7_TARGETS))
for b in cell_reads.uniq_barcodes:
    # deletion id -> indices of the targets of this barcode that list the deletion
    deletions = dict()
    for target_idx, target_evts in enumerate(b.events):
        for evt in target_evts:
            if evt.event_type == EventType.DELETE:
                deletions.setdefault(evt.get_str_id(), []).append(target_idx)
    for del_evt, del_targets in deletions.items():
        if del_evt in uniq_deletion_strs:
            continue
        uniq_deletion_strs.add(del_evt)
        intertarget_pairs[min(del_targets), max(del_targets)] += 1

# imshow draws the first array axis (lowest target) on y and the second
# (highest target) on x; label both so the heatmap can be read unambiguously.
fig, ax = plt.subplots()
heatmap = ax.imshow(intertarget_pairs, cmap='hot')
ax.set_xlabel("Highest target index listing the deletion")
ax.set_ylabel("Lowest target index listing the deletion")
ax.set_title("Unique deletions by first and last target")
fig.colorbar(heatmap, ax=ax, label="# unique deletion events")
plt.show()

In [None]:
"""
Understanding double deletions in the same target

Interpretation: There are actually some double deletions within the same target.
I think it was because a longer deletion occurred but some of the same nucleotides
were patched back in so that it may be misinterpreted as two events instead of
a single one.
"""
# Signatures ("--"-joined event ids) of targets holding two or more deletions,
# plus the subset in which every deletion is shorter than half of TARGET_LEN.
many_dels_same_target = set()
many_short_dels_same_target = set()
short_del_cutoff = TARGET_LEN/2
for bcode in cell_reads.uniq_barcodes:
    for target_evts in bcode.events:
        deletion_lens = [evt.event_len for evt in target_evts if evt.event_type == EventType.DELETE]
        if len(deletion_lens) < 2:
            continue
        target_evt_str = "--".join(evt.get_str_id() for evt in target_evts)
        many_dels_same_target.add(target_evt_str)
        if all(length < short_del_cutoff for length in deletion_lens):
            many_short_dels_same_target.add(target_evt_str)

print("# 2+-deletions in a target:", len(many_dels_same_target))
print("# 2+-short-deletions (11 bp or less) in a target:", len(many_short_dels_same_target))

In [None]:
"""
Distribution of the number of targets disturbed for each barcode

Interpretation: Most barcodes have most targets modified by the time we sequence.
This is like how the paper mentioned that we have saturated the barcodes.
"""
# For each unique barcode, count the targets that carry at least one event,
# then tabulate how many barcodes fall on each count (the cell output).
all_num_disturbed = [
    sum(1 for target_evts in bcode.events if len(target_evts) >= 1)
    for bcode in cell_reads.uniq_barcodes
]
Counter(all_num_disturbed)