In [None]:
%matplotlib inline

import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from umi_tools import UMIClusterer
from simulation_tools import *


Generate a simulated amplified library.

In [None]:
# Seed the global RNGs so that Restart & Run All regenerates the same library.
# NOTE(review): assumes amplify_library draws from the global `random` / `numpy.random`
# generators -- confirm against simulation_tools.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# `amplified` is a dict of Counters (one Counter of observed sequences per cluster).
amplified = amplify_library(
  unique_seqs=30000,
  length=30,
  sample_size=900000,
  cycles=8,
  bias_mean=0,
  bias_sigma=0.1,
  dup_mean=0.8,
  dup_sigma=0.05,
  mut_rate=0.0001,
  error_rate=0.001,
)
# Flatten the per-cluster Counters into one list with an entry for every individual read.
amplified_expanded = [seq for seq_group in amplified.values() for seq in seq_group.elements()]
# Uncomment to peek at the first few clusters / reads:
# print([amplified[center] for center in list(amplified)[:5]])
# print(amplified_expanded[:20])

Check the quality of the simulated library.

In [None]:
print(f'Total sequences: {len(amplified_expanded)}')

# Number of distinct sequences stored in each cluster (len of a Counter = number of keys).
unique_per_cluster = [len(cluster) for cluster in amplified.values()]
print(f'Number of clusters with no sequences: {sum(unique == 0 for unique in unique_per_cluster)}')

# Total number of reads in each cluster.
cluster_sizes = [cluster.total() for cluster in amplified.values()]

# Histogram bin edges: every bin is half-open except the last one, which is closed.
# Using edges 0..max+1 gives each integer value its own bin; with edges 0..max the
# largest value would be lumped into the same bin as max-1.
# `default=0` keeps the cell from raising on an empty library.
plt.hist(cluster_sizes, bins=range(max(cluster_sizes, default=0) + 2))
plt.title('Distribution of sequences per cluster')
plt.xlabel('Sequences per cluster')
plt.ylabel('Number of clusters')
plt.show()

plt.hist(unique_per_cluster, bins=range(max(unique_per_cluster, default=0) + 2))
plt.title('Distribution of unique sequences per cluster')
plt.xlabel('Unique sequences per cluster')
plt.ylabel('Number of clusters')
plt.show()

Define function to cluster sequences with umi_tools.

In [None]:
def cluster_seqs(seqs, method, threshold, consensus=False):
  """Cluster sequences with umi_tools and return them as a dict of Counters.

  Args:
    seqs: dict of Counters (the amplify_library output). The sequences of all
      Counters are pooled before clustering; the input keys are not used.
    method: forwarded to ``UMIClusterer`` as ``cluster_method``.
    threshold: forwarded to the clusterer call as ``threshold``.
    consensus: selects the key under which each cluster is stored.
      False -> the first sequence of the cluster as returned by UMIClusterer
      (intended to be the highest-frequency sequence of the cluster --
      NOTE(review): relies on umi_tools' ordering, confirm).
      True -> ``get_consensus`` of all reads in the cluster.

  Returns:
    dict mapping cluster key -> Counter of the sequences in that cluster
    (same data structure as the input). If several clusters resolve to the
    same key (possible when ``consensus=True``), their Counters are merged
    instead of the later one silently replacing the earlier one.
  """
  # UMIClusterer is fed bytes keys, so pool the counts under encoded sequences.
  pooled_counts = Counter()
  for seq_group in seqs.values():
    for seq, count in seq_group.items():
      if count > 0:  # same filtering as Counter.elements(): skip non-positive counts
        pooled_counts[seq.encode()] += count

  clusterer = UMIClusterer(cluster_method=method)
  clustered = clusterer(pooled_counts, threshold=threshold)

  cluster_counts = {}

  for cluster in clustered:
    # Per-cluster counts, keyed by the decoded (str) sequence.
    cluster_counter = Counter()
    for seq in cluster:
      cluster_counter[seq.decode()] += pooled_counts[seq]

    if consensus:
      # get_consensus receives one list entry per read.
      key = get_consensus(list(cluster_counter.elements()))
    else:
      key = cluster[0].decode()

    if key in cluster_counts:
      # Two clusters share a key: add the counts rather than drop a cluster.
      cluster_counts[key].update(cluster_counter)
    else:
      cluster_counts[key] = cluster_counter

  return cluster_counts

Use `cluster_seqs` on simulated data.

In [None]:
# Cluster the simulated reads with the 'cluster' method and a threshold of 6.
clustered = cluster_seqs(amplified, method='cluster', threshold=6)

Compare the true counts to the counts recovered by clustering.

In [None]:
# Compare the simulated ground truth with the clustering result.
# `not_found` holds keys of `amplified` (the next cell indexes `amplified` with them).
diff, not_found = compare_counter_dicts(amplified, clustered)
print(len(not_found))
# Show only the entries of `diff` that are not exactly 1.0.
print(list(filter(lambda value: value != 1.0, diff)))

In [None]:
# For every true center that has no entry in `not_found`'s comparison, pair its true
# Counter with the Counter of the closest clustered center (by hamming_distance).
wrong_center_true_counts = {}
wrong_center_cluster_counts = {}
dists = []
# The candidate centers are identical for every query, so build the list once
# instead of once per loop iteration.
cluster_centers = list(clustered)
for seq in not_found:
  if amplified[seq].total() == 0:
    # This center has no reads at all, so there is nothing to match.
    continue
  closest, dist = find_closest_match(seq, cluster_centers, hamming_distance)
  dists.append(dist)
  wrong_center_true_counts[seq] = amplified[seq]
  wrong_center_cluster_counts[seq] = clustered[closest]

In [None]:
# Tally the distances between each missed center and its closest clustered center.
print(Counter(dists))
# Re-run the comparison, this time pairing each missed center with its closest cluster.
# NOTE: this rebinds `diff` and `not_found`, replacing the values from the earlier comparison.
diff, not_found = compare_counter_dicts(wrong_center_true_counts, wrong_center_cluster_counts)
print(len(not_found))
print(list(filter(lambda value: value != 1.0, diff)))