In [None]:
%matplotlib inline  

from collections import Counter
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

from allele import Allele
from allele_simulator import AlleleSimulator
from all_reads import CellReads, CellRead
from cell_state import CellTypeTree, CellState
from cell_state_simulator import CellTypeSimulator
from clt_simulator import CLTSimulator
from clt_observer import CLTObserver
from constants import BARCODE_V7, NUM_BARCODE_V7_TARGETS

In [None]:
# Barcode layout constants.
# NOTE(review): units look like base pairs, judging by the names — confirm.
START_BASE = 120
BARCODE_SPACER_LEN = 27
TARGET_LEN = 23

# Rates handed to CLTSimulator for cell birth and death.
birth_lambda = 1.2
death_lambda = 0.01

# Parameters handed to AlleleSimulator: one cut rate per target (ten
# identical entries), followed by the repair-process settings.
target_lambdas = [0.3] * 10
repair_lambdas = [1, 2]
repair_indel_probability = 0.5
repair_deletion_lambda = 8
repair_insertion_lambda = 0.5

# Third argument of clt_simulator.simulate().
time = 10

# Argument of CLTObserver when sampling the leaves of the simulated tree.
sampling_rate = 0.1

In [None]:
# Display names of the cell types; position i names the tree child created
# with cell_type=i below.
cell_types = ["brain", "eye"]

# Root node carries no cell type; each child is attached with rate 0.05.
cell_type_tree = CellTypeTree(cell_type=None, rate=0)
for child_cell_type in (0, 1):
    child_node = CellTypeTree(cell_type=child_cell_type, rate=0.05)
    cell_type_tree.add_child(child_node)

In [None]:
# All AlleleSimulator arguments are positional, so their order is significant.
# NOTE(review): repair_deletion_lambda is passed twice in a row — presumably
# two separate deletion parameters sharing one value; confirm against the
# AlleleSimulator signature.
allele_simulator = AlleleSimulator(
    np.array(target_lambdas),
    np.array(repair_lambdas),
    repair_indel_probability,
    repair_deletion_lambda,
    repair_deletion_lambda,
    repair_insertion_lambda,
)

# Cell-type transitions are driven by the tree built in the previous cell.
cell_type_simulator = CellTypeSimulator(cell_type_tree)

# The lineage-tree simulator combines birth/death rates with both simulators.
clt_simulator = CLTSimulator(
    birth_lambda, death_lambda, cell_type_simulator, allele_simulator
)

In [None]:
# Simulate one cell lineage tree from a default-constructed allele and a cell
# state built from the cell-type tree, using the configured `time`.
root_allele = Allele()
root_cell_state = CellState(categorical=cell_type_tree)
clt = clt_simulator.simulate(root_allele, root_cell_state, time)

In [None]:
# Observe the leaves of the simulated tree without requesting the pruned tree.
observer = CLTObserver(sampling_rate)
obs_leaves = observer.observe_leaves(clt, give_pruned_clt=False)

# Turn every observed leaf into a CellRead: its allele's event encoding plus
# the categorical part of its cell state.
all_reads = []
for obs in obs_leaves:
    event_encoding = obs.allele.get_event_encoding()
    all_reads.append(CellRead(event_encoding, obs.cell_state.categorical_state))
print(len(all_reads))

# Maps the string keys used by CellReads to human-readable cell type names.
cell_type_dict = {
    "0": cell_types[0],
    "1": cell_types[1],
    "0,1": "0,1",
}
cell_reads = CellReads(all_reads, cell_type_dict)

In [None]:
"""
distributions of the number of (visible) events in each cell type
"""
df = pd.DataFrame([(cell_reads.organ_dict[x.organ.get_gen_name()], len(x.events)) for x in cell_reads.reads],
                  columns=('cell type', 'number of events'))
plt.figure(figsize=(20, 3))
sns.boxplot(x='cell type', y='number of events', data=df, color='lightgrey')

In [None]:
"""
distributions of the number of exhausted targets in each cell type
"""
df = pd.DataFrame([(cell_reads.organ_dict[x.organ.get_gen_name()], len(x.allele.get_used_targets())) for x in cell_reads.reads],
                  columns=('cell type', 'number of exhausted targets'))
plt.figure(figsize=(20, 3))
sns.violinplot(x='cell type', y='number of exhausted targets', data=df, color='lightgrey')

In [None]:
"""
Really basic data characteristics
"""
num_cells = len(cell_reads.reads)
print("# of cells:", num_cells)
print("# of unique alleles (alleles):", len(cell_reads.uniq_alleles))

In [None]:
"""
Organ data distribution
"""
organ_allele_dict = dict()
for b in cell_reads.reads:
    organ= cell_reads.organ_dict[b.organ.get_gen_name()]
    if organ not in organ_allele_dict:
        organ_allele_dict[organ] = []
    organ_allele_dict[organ].append(b)
    
for organ, alleles in organ_allele_dict.items():
    allele_strs = [".".join([str(evt) for evt in b.events]) for b in alleles]
    uniq_alleles = set(allele_strs)
    num_organ_cells = len(alleles)
    print("# of %s cells: %d (%f%%)" % (organ, num_organ_cells, (100 * num_organ_cells)/num_cells))
    print("  # of unique alleles:", len(uniq_alleles))
        

In [None]:
# Collect the distinct insertion events (events with del_len == 0) across all
# unique alleles. Distinctness is decided by the event's string form.
uniq_inserts = set()
uniq_insert_strs = set()
for uniq_allele in cell_reads.uniq_alleles:
    for evt in uniq_allele.events:
        if evt.del_len != 0:
            continue
        evt_key = str(evt)
        if evt_key not in uniq_insert_strs:
            uniq_insert_strs.add(evt_key)
            uniq_inserts.add(evt)

In [None]:
# Pair every insertion event (del_len == 0) with its min_target index.
# Duplicates are kept here; the per-target summary cell de-duplicates them.
all_insert_target_pos = [
    (evt.min_target, evt)
    for uniq_allele in cell_reads.uniq_alleles
    for evt in uniq_allele.events
    if evt.del_len == 0
]

In [None]:
"""
Interpretation: There aren't that many insertion locations possible
"""
num_insert_evt = np.sum([evt.del_len == 0 for evt in cell_reads.uniq_events])
print("# unique insertion events:", num_insert_evt)
insert_pos = [ins.start_pos for ins in uniq_inserts]
num_uniq_insert_start = np.unique(insert_pos).size
print("# unique insertion positions:", num_uniq_insert_start)

In [None]:
"""
A summary table of the length of unique insertion events

Interpretation: There is actually a wide range of insertion lengths (up to 48 bp!).
Most insertions are short. However some insertion lengths seem to be more favorable.
Even though we only look at unique insertions, there are a large number of
unique insertions that are all length 12, 15, 20, and 23.
"""
insert_len = [ins.insert_len for ins in uniq_inserts]
Counter(insert_len)

In [None]:
"""
Where are things inserted for each target, what is the usual position, what is the usual length?
The numbers below are median (min, max)
"""
insert_target_summary = [
    {'count': 0, 'locations': [], 'lengths': [], 'uniq_strs': set()} for i in range(NUM_BARCODE_V7_TARGETS)
]
for target_i, evt in all_insert_target_pos:
    if str(evt) in insert_target_summary[target_i]['uniq_strs']:
        continue
    else:
        insert_target_summary[target_i]['uniq_strs'].add(str(evt))
        insert_target_summary[target_i]["count"] += 1
        insert_target_summary[target_i]["locations"].append(evt.start_pos)
        insert_target_summary[target_i]["lengths"].append(evt.insert_len)

for target_i, target_dict in enumerate(insert_target_summary):
    print("Target", target_i)
    print("  Count:", target_dict['count'])
    if target_dict['count'] > 0:
        locs = target_dict['locations']
        print("  Location:", np.median(locs), "(", np.min(locs), ",", np.max(locs), ")")
        lens = target_dict['lengths']
        print("  Lengths:", np.median(lens), "(", np.min(lens), ",", np.max(lens), ")")

In [None]:
"""
Plot: histogram of insertion positions
"""
plt.hist(insert_pos, bins=50, log=True)

In [None]:
# Collect the distinct deletion events (events with del_len > 0) across all
# unique alleles. Adding to a set already ignores events that are present.
uniq_deletes = set()
for uniq_allele in cell_reads.uniq_alleles:
    for evt in uniq_allele.events:
        if evt.del_len > 0:
            uniq_deletes.add(evt)

# Pair every deletion event with its min_target index (duplicates kept).
all_delete_target_pos = [
    (evt.min_target, evt)
    for uniq_allele in cell_reads.uniq_alleles
    for evt in uniq_allele.events
    if evt.del_len > 0
]

In [None]:
"""
Counting deletion events and positions

Interpretation: There are quite a lot of deletion positions possible. The total allele is only 270 long.
We are starting deletes from almost every position.
"""
num_delete_evt = np.sum([evt.del_len == 0 for evt in cell_reads.uniq_events])
print("# unique delete events:", num_delete_evt)
delete_pos = [deletion.start_pos for deletion in uniq_deletes]
num_uniq_delete_start = np.unique(delete_pos).size
print("# unique deletion start positions:", num_uniq_delete_start)
delete_end_pos = [deletion.start_pos + deletion.del_len - 1 for deletion in uniq_deletes]
num_uniq_delete_end = np.unique(delete_end_pos).size
print("# unique deletion end positions:", num_uniq_delete_end)

In [None]:
"""
Histogram of the lengths of deletions
"""
# A summary table of the length of unique deletion events
del_len = [ins.del_len for ins in uniq_deletes]
plt.hist(del_len, bins=20)

In [None]:
"""
Plot: histogram of deletion start locations

Picture: deletion start locations for each unique deletion event.
You can spot 10 peaks, but it's a bit difficult. Deleions can start anywhere!
"""
plt.hist(delete_pos, bins=100)

In [None]:
"""
Plot: histogram of deletion end locations
"""
plt.hist(delete_end_pos, bins=100)

In [None]:
"""
Where are things deleted for each target, what is the usual position, what is the usual length?
The numbers below are median (min, max)

Interpretation: The median position for the deletion for each target is pretty evenly spaced.
Median deletion length is actually quite high!
"""
del_target_summary = [
    {'count': 0, 'locations': [], 'lengths': [], 'uniq_evts': set()}
    for i in range(NUM_BARCODE_V7_TARGETS)
]
for target_i, evt in all_delete_target_pos:
    del_target_summary[target_i]['uniq_evts'].add(evt)
    del_target_summary[target_i]["count"] += 1
    del_target_summary[target_i]["locations"].append(evt.start_pos)
    del_target_summary[target_i]["lengths"].append(evt.del_len)

for target_i, target_dict in enumerate(del_target_summary):
    print("Target", target_i)
    print("  Count:", target_dict['count'])
    if target_dict['count'] > 0:
        locs = target_dict['locations']
        print("  Location:", np.median(locs), "(", np.min(locs), ",", np.max(locs), ")")
        lens = target_dict['lengths']
        print("  Lengths:", np.median(lens), "(", np.min(lens), ",", np.max(lens), ")")

In [None]:
"""
Plot: deletion length histogram for deletion events that start at this target.

Interpretation: The deletion length profiles vary across the targets.
The deletion lengths can vary widely, some going up to 200+ bp.
Shorter deletions are preferred.
"""
plt.figure(figsize=(8,14))
for target_i, target_dict in enumerate(del_target_summary):
    print("Target", target_i)
    plt.subplot(NUM_BARCODE_V7_TARGETS, 1, target_i + 1, xlim = [0, 250])
    plt.hist(target_dict['lengths'], bins=30)

In [None]:
"""
How often do intertarget deletions occur?
e.g. do targets 0 and 9 often get cut together?

Interpretation: most deletions are in the same target. The next most common deletions occur across two targets.
There are very few very long inter-target deletions.
"""
uniq_deletion_strs = set()
intertarget_pairs = np.zeros((NUM_BARCODE_V7_TARGETS, NUM_BARCODE_V7_TARGETS))
for b in cell_reads.uniq_alleles:
    for evt in b.events:
        if evt.del_len > 0:
            uniq_deletion_strs.add(evt)
            intertarget_pairs[evt.min_target, evt.max_target] += 1
plt.imshow(intertarget_pairs, cmap='hot', interpolation='none')
plt.colorbar()

In [None]:
"""
Average number of targets disturbed for each allele

Interpretation: Most alleles have most targets modified by the time we sequence.
This is like how the paper mentioned that we have saturated the alleles.
"""
all_num_disturbed = []
for b in cell_reads.uniq_alleles:
    disturbed_targs = [0 for i in range(10)]
    for evt in b.events:
        for targ in range(evt.min_target, evt.max_target + 1):
            disturbed_targs[targ] = 1
    num_disturbed = sum(disturbed_targs)
    all_num_disturbed.append(num_disturbed)
Counter(all_num_disturbed)