In [None]:
%matplotlib inline

import itertools
import csv
import matplotlib.pyplot as plt
from collections import Counter
from simulation_tools import *

Make simulated library.

In [None]:
# Simulate the amplified library. Downstream cells treat the result as a
# mapping whose values are Counter-like objects (they call .elements() / .total()).
amplified = amplify_library(
  unique_seqs=300_000,
  length=30,
  sample_size=9_000_000,
  cycles=8,
  bias_mean=0,
  bias_sigma=0.1,
  dup_mean=0.8,
  dup_sigma=0.05,
  mut_rate=0.0001,
  error_rate=0.005,
)

# Flatten every cluster into one list, repeating each sequence by its count.
amplified_expanded = list(
  itertools.chain.from_iterable(group.elements() for group in amplified.values())
)

# Optional debug peeks:
# print([amplified[i] for i in itertools.islice(amplified, 5)])
# print(amplified_expanded[:20])

Check library qualities.

In [None]:
# Headline numbers for the simulated library.
print(f'Total sequences: {len(amplified_expanded)}')
print(f'Unique sequences: {len(set(amplified_expanded))}')

# Number of distinct keys held by each cluster's counter.
unique_per_cluster = [len(cluster) for cluster in amplified.values()]
print(f'Number of clusters with no sequences: {sum(unique == 0 for unique in unique_per_cluster)}')

# Total number of sequences (counting duplicates) in each cluster.
cluster_sizes = [cluster.total() for cluster in amplified.values()]

# The data are integers, so use unit-width bins. Edges must run to max + 1:
# the final histogram bin is closed on the right, so stopping the edges at max
# would lump the values max - 1 and max into the same bar.
plt.hist(cluster_sizes, bins=range(max(cluster_sizes) + 2))
plt.title('Distribution of sequences per cluster')
plt.xlabel('Sequences per cluster')
plt.ylabel('Number of clusters')
plt.show()

plt.hist(unique_per_cluster, bins=range(max(unique_per_cluster) + 2))
plt.title('Distribution of unique sequences per cluster')
plt.xlabel('Unique sequences per cluster')
plt.ylabel('Number of clusters')
plt.show()

Save the expanded library as a text file (one sequence per line) so it can be run through starcode.

In [None]:
with open('starcode_data/starcode_test_seqs.txt', 'w') as f:
  f.write('\n'.join(amplified_expanded))

! head starcode_data/starcode_test_seqs.txt

Run sequences through starcode.

In [None]:
# Cluster the saved sequence file with the external `starcode` executable
# (invoked through the shell, so it must be on PATH). -i / -o are the input and
# output paths. The parser in this notebook reads the fourth tab-separated
# column of the output as comma-separated, 1-based sequence IDs, which is
# presumably what --print-clusters together with --seq-id produces.
# NOTE(review): -r 2 is assumed to be starcode's cluster-ratio setting — confirm with `starcode --help`.
! starcode -i starcode_data/starcode_test_seqs.txt -o starcode_data/starcode_test_clustered.txt --print-clusters --seq-id -r 2

# ! head starcode_data/starcode_test_clustered.txt

Read the starcode clustering results into a format comparable to the original library (a dict of `Counter` objects).

In [None]:
def read_starcode_results(cluster_fp, input_seqs):
  """Load a starcode cluster file into per-center count structures.

  Each non-empty line of the file is expected to be tab-separated, with the
  cluster center in column 1, its count in column 2 and a comma-separated list
  of 1-based sequence IDs in column 4 (column 3 is not used here).

  Args:
    cluster_fp: Path to the starcode output file.
    input_seqs: Sequences in the order they were given to starcode; the
      sequence IDs in the file index into this list (ID 1 -> input_seqs[0]).

  Returns:
    A tuple ``(center_counts, cluster_counts)`` where ``center_counts`` maps
    each center to the integer count from column 2, and ``cluster_counts``
    maps each center to a Counter of the input sequences assigned to it.

  Raises:
    IndexError: If a non-empty row has fewer than four columns or a sequence
      ID lies beyond the end of ``input_seqs``.
    ValueError: If a count or sequence ID is not an integer.
  """
  center_counts = {}
  cluster_counts = {}
  # newline='' is what the csv module asks for so it can handle line endings itself.
  with open(cluster_fp, 'r', newline='') as f:
    for row in csv.reader(f, delimiter='\t'):
      if not row:
        # Blank lines (e.g. a trailing newline at end of file) carry no cluster.
        continue
      center = row[0]
      center_counts[center] = int(row[1])
      # Sequence IDs start at 1, so shift down to index input_seqs.
      members = [input_seqs[int(seq_id) - 1] for seq_id in row[3].split(',')]
      cluster_counts[center] = Counter(members)
  return center_counts, cluster_counts

In [None]:
# Parse the starcode output, resolving its sequence IDs against the expanded library.
center_counts, cluster_counts = read_starcode_results(
  cluster_fp='starcode_data/starcode_test_clustered.txt',
  input_seqs=amplified_expanded,
)

Compare true counts to starcode clustering results.

In [None]:
print(f'Total clusters found: {len(center_counts)}')

# Compare the simulated clusters with the clusters recovered by starcode.
diff, not_found = compare_counter_dicts(amplified, cluster_counts)
print(f'Number of true cluster centers not found: {len(not_found)}')

# Keep only the entries of diff that are not exactly 1.0
# (per the plot title these are clustered/true count ratios).
cluster_size_change = [ratio for ratio in diff if ratio != 1.0]
print(f'Number of clusters with incorrect size: {len(cluster_size_change)}')

plt.hist(cluster_size_change)
plt.title('Distribution of incorrect clustered/true count ratio')
plt.show()

In [None]:
# For each true center that was not found among the starcode centers, look up
# the closest reported center and pair the two clusters under the TRUE center's
# key, so both dicts can be compared key-by-key afterwards.
wrong_center_true_counts = {}
wrong_center_cluster_counts = {}
dists = []

# The candidate centers are the same for every query, so build the list once
# instead of on every loop iteration.
# NOTE(review): assumes find_closest_match does not mutate its candidate list — confirm.
found_centers = list(cluster_counts)

for seq in not_found:
  if amplified[seq].total() == 0:
    # Skip true clusters that contain no sequences.
    continue
  closest, dist = find_closest_match(seq, found_centers, hamming_distance)
  dists.append(dist)
  wrong_center_true_counts[seq] = amplified[seq]
  wrong_center_cluster_counts[seq] = cluster_counts[closest]


In [None]:
# Tally of the distances between each missed true center and its closest reported center.
print(Counter(dists))

# NOTE: this rebinds `diff` and `not_found`, replacing the values from the
# full-library comparison; re-running earlier cells that read them afterwards
# will see these narrower results.
diff, not_found = compare_counter_dicts(wrong_center_true_counts, wrong_center_cluster_counts)
print(len(not_found))
print([ratio for ratio in diff if ratio != 1.0])