In [None]:
%matplotlib inline

import itertools
import csv
import matplotlib.pyplot as plt
from collections import Counter
from simulation_tools import *

Make simulated library.

In [None]:
# Build the simulated library. The cells below treat `amplified` as a mapping
# from each cluster center to a Counter of the sequences in that cluster.
amplified = amplify_library(
  unique_seqs=300000,
  length=30,
  sample_size=9000000,
  cycles=8,
  bias_mean=0,
  bias_sigma=0.1,
  dup_mean=0.8,
  dup_sigma=0.05,
  mut_rate=0.0001,
  error_rate=0.001,
)
# Flatten every cluster's Counter into one list with one entry per read
# (a sequence with count k appears k times).
amplified_expanded = list(
  itertools.chain.from_iterable(cluster.elements() for cluster in amplified.values())
)
# Uncomment to peek at a few clusters / reads:
# print([amplified[i] for i in itertools.islice(amplified, 5)])
# print(amplified_expanded[:20])

Check the quality of the simulated library.

In [None]:
print(f'Total sequences: {len(amplified_expanded)}')

# Number of distinct sequences in each cluster (len() of a Counter counts its keys).
unique_per_cluster = [len(amplified[center]) for center in amplified]
print(f'Number of clusters with no sequences: {sum(unique == 0 for unique in unique_per_cluster)}')

# Total number of reads in each cluster, duplicates included.
cluster_sizes = [cluster.total() for cluster in amplified.values()]

# Bin edges are the integers 0, 1, ..., max + 1, i.e. one bin per integer value.
# The extra edge matters: matplotlib closes the last bin on both sides, so
# stopping the edges at `max` would merge the two largest values into one bar.
plt.hist(cluster_sizes, bins=range(max(cluster_sizes, default=0) + 2))
plt.title('Distribution of sequences per cluster')
plt.xlabel('Sequences in cluster')
plt.ylabel('Number of clusters')
plt.show()

plt.hist(unique_per_cluster, bins=range(max(unique_per_cluster, default=0) + 2))
plt.title('Distribution of unique sequences per cluster')
plt.xlabel('Unique sequences in cluster')
plt.ylabel('Number of clusters')
plt.show()

Save the expanded library as a CSV file so it can be run through bartender.

In [None]:
# bartender expects two comma-separated columns per line; here they are the
# sequence followed by the read's position in amplified_expanded.
amplified_expanded_cols = [
  f'{seq},{i}'
  for i, seq in enumerate(amplified_expanded)
]
with open('bartender_data/bartender_test_seqs.txt', mode='w') as f:
  # Lines are newline-separated; no newline is written after the last one.
  f.write('\n'.join(amplified_expanded_cols))

# Uncomment to inspect the first lines of the file:
# ! head bartender_data/bartender_test_seqs.txt

Run the sequences through bartender.

In [None]:
# Cluster the simulated reads with the external `bartender_single_com` command.
#   -f  input file: the two-column text file written a few cells above
#   -o  output prefix: later cells read <prefix>_barcode.csv and <prefix>_cluster.csv
#   -d 6 / -z -1  clustering parameters
#       NOTE(review): presumably the maximum merge distance and the merge-test
#       threshold (with -1 disabling the test) — confirm against bartender's docs.
! bartender_single_com -f bartender_data/bartender_test_seqs.txt -o bartender_data/bartender_test_clustered -d 6 -z -1

# Uncomment to inspect the first lines of each output file:
# ! head bartender_data/bartender_test_clustered_barcode.csv
# ! head bartender_data/bartender_test_clustered_cluster.csv

Read the bartender clustering results into a format comparable to the original library (a dict of `Counter` objects).

In [None]:
def _read_csv_body(path):
  """Return the rows of the CSV file at `path` as tuples, without the header row or blank lines."""
  with open(path, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # drop the header row; tolerates a completely empty file
    return [tuple(row) for row in reader if row]


def read_bartender_results(barcode_fp, cluster_fp, consensus=False):
  """Load bartender's clustering output as a dict of Counters keyed by cluster center.

  Args:
    barcode_fp: Path to the barcode CSV. After the header row, each row is
      unpacked as (sequence, count, cluster id).
    cluster_fp: Path to the cluster CSV. After the header row, each row is
      unpacked as (cluster id, center, score, count).
    consensus: If False (default), clusters are keyed by the center listed in
      the cluster file. If True, each cluster is keyed by
      `get_consensus(...)` of its member sequences, each repeated `count`
      times; clusters that yield the same consensus are merged by summing
      their counts rather than overwriting one another.

  Returns:
    A tuple `(clusters_counter, center_counts)`:
      clusters_counter: dict mapping center -> Counter({sequence: count}).
      center_counts: dict mapping the center listed in the cluster file ->
        int count from that file. It is keyed by the file's centers even when
        `consensus=True`.

  Raises:
    KeyError: If a barcode row references a cluster id missing from the cluster file.
    ValueError: If a row has an unexpected number of columns or a non-integer count.
  """
  barcode_rows = _read_csv_body(barcode_fp)
  cluster_rows = _read_csv_body(cluster_fp)

  center_counts = {center: int(count) for _, center, _, count in cluster_rows}
  center_by_id = {cluster_id: center for cluster_id, center, _, _ in cluster_rows}

  # One (sequence, count) list per center, in the order centers appear in the cluster file.
  clusters = {center: [] for _, center, _, _ in cluster_rows}
  for seq, count, cluster_id in barcode_rows:
    clusters[center_by_id[cluster_id]].append((seq, int(count)))

  if consensus:
    clusters_counter = {}
    for members in clusters.values():
      # Most abundant sequences first, each repeated `count` times.
      # NOTE(review): the ordering presumably influences tie-breaking inside
      # get_consensus — confirm in simulation_tools.
      by_abundance = sorted(members, key=lambda pair: pair[1], reverse=True)
      expanded_seqs = [seq for seq, n in by_abundance for _ in range(n)]
      consensus_center = get_consensus(expanded_seqs)
      # Accumulate instead of assigning, so two clusters that collapse to the
      # same consensus keep all of their reads.
      clusters_counter.setdefault(consensus_center, Counter()).update(dict(members))
  else:
    clusters_counter = {center: Counter(dict(members)) for center, members in clusters.items()}

  return clusters_counter, center_counts


In [None]:
# Parse bartender's output twice: once keyed by the centers bartender reports,
# and once keyed by consensus centers recomputed from each cluster's members.
clusters_counter, center_counts = read_bartender_results(
  'bartender_data/bartender_test_clustered_barcode.csv',
  'bartender_data/bartender_test_clustered_cluster.csv',
)
clusters_counter_consensus, center_counts_consensus = read_bartender_results(
  'bartender_data/bartender_test_clustered_barcode.csv',
  'bartender_data/bartender_test_clustered_cluster.csv',
  consensus=True,
)
# Uncomment to peek at one parsed cluster / one center count:
# print([clusters_counter[i] for i in itertools.islice(clusters_counter, 1)])
# print([center_counts[i] for i in itertools.islice(center_counts, 1)])

Compare true counts to bartender clustering results.

In [None]:
# Compare the simulated library with bartender's clusters keyed by consensus center.
diff, not_found = compare_counter_dicts(amplified, clusters_counter_consensus)
print(len(not_found))
# Show only the entries of `diff` that are not exactly 1.0.
# NOTE(review): presumably 1.0 means the two clusters agree — confirm in simulation_tools.
print([value for value in diff if value != 1.0])

In [None]:
# Same comparison, this time against the centers bartender itself reported.
# This rebinds `diff` and `not_found`; the next cell iterates over this `not_found`.
diff, not_found = compare_counter_dicts(amplified, clusters_counter)
print(len(not_found))
# Show only the entries of `diff` that are not exactly 1.0.
print([value for value in diff if value != 1.0])

In [None]:
# For each library center listed in `not_found`, find the bartender center
# closest to it by Hamming distance and pair the two clusters under the
# library center's key, so they can be compared in the next cell.
wrong_center_true_counts = {}
wrong_center_cluster_counts = {}
dists = []

# The candidate list is the same for every lookup, so build it once rather
# than once per iteration.
bartender_centers = list(clusters_counter)

for seq in not_found:
  if amplified[seq].total() == 0:
    continue  # this library cluster holds no reads, so there is nothing to match
  closest, dist = find_closest_match(seq, bartender_centers, hamming_distance)
  dists.append(dist)
  wrong_center_true_counts[seq] = amplified[seq]
  wrong_center_cluster_counts[seq] = clusters_counter[closest]


In [None]:
# How far (by the distance recorded in `dists`) the nearest bartender center was.
print(Counter(dists))
# Both dicts share the library center as key, so each library cluster is
# compared with its nearest bartender cluster.
diff, not_found = compare_counter_dicts(wrong_center_true_counts, wrong_center_cluster_counts)
print(len(not_found))
# Show only the entries of `diff` that are not exactly 1.0.
print([value for value in diff if value != 1.0])