In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt

Load data

In [None]:
# Directory containing the per-sample count arrays; defined once so the location
# can be changed in a single place.
COUNTS_DIR = '../data/counts/small_libs_directional_clust_t3'

# NOTE(review): the PCR files map input -> 1, 3 and assembled -> 5, 7, whereas the
# LIG files map input -> 1, 5 and assembled -> 3, 7. Confirm this matches the sample sheet.
small_input_PCR_1 = np.load(f'{COUNTS_DIR}/GP_PCR_1_counts.npy')
small_input_PCR_2 = np.load(f'{COUNTS_DIR}/GP_PCR_3_counts.npy')
small_assembled_PCR_1 = np.load(f'{COUNTS_DIR}/GP_PCR_5_counts.npy')
small_assembled_PCR_2 = np.load(f'{COUNTS_DIR}/GP_PCR_7_counts.npy')
small_input_LIG_1 = np.load(f'{COUNTS_DIR}/GP_LIG_1_counts.npy')
small_input_LIG_2 = np.load(f'{COUNTS_DIR}/GP_LIG_5_counts.npy')
small_assembled_LIG_1 = np.load(f'{COUNTS_DIR}/GP_LIG_3_counts.npy')
small_assembled_LIG_2 = np.load(f'{COUNTS_DIR}/GP_LIG_7_counts.npy')

# Samples grouped by replicate; the order here must match the label lists below.
rep1_encoded = [small_input_PCR_1, small_assembled_PCR_1, small_input_LIG_1, small_assembled_LIG_1]
rep2_encoded = [small_input_PCR_2, small_assembled_PCR_2, small_input_LIG_2, small_assembled_LIG_2]

# Human-readable names, positionally aligned with rep1_encoded / rep2_encoded.
rep1_labels = ['small_input_PCR_1', 'small_assembled_PCR_1', 'small_input_LIG_1', 'small_assembled_LIG_1']
rep2_labels = ['small_input_PCR_2', 'small_assembled_PCR_2', 'small_input_LIG_2', 'small_assembled_LIG_2']

Check distributions of counts

In [None]:
# Each observation is indexed as (sequence, count) with the count stored as bytes;
# only the count is needed for the histogram. A distinct name is used so this list
# is not confused with the per-common-sequence `rep1_counts` built further down.
rep1_raw_counts = [[int(obs[1].decode()) for obs in sample] for sample in rep1_encoded]
for label, sample_counts in zip(rep1_labels, rep1_raw_counts):
  # Bin edges run from 0 to 299, so counts above 299 are not drawn.
  plt.hist(sample_counts, bins=range(300))
  # Label every figure so the four samples can be told apart.
  plt.title(label)
  plt.xlabel('Count per sequence')
  plt.ylabel('Number of sequences')
  plt.show()

Convert each sample to a dict with sequences as keys and the count of each sequence as values.

In [None]:
def make_seq_dict(samples, min_count=0):
  """Decode each sample into a ``{sequence: count}`` dict.

  Args:
    samples: Iterable of samples. Each sample is an iterable of observations
      where ``obs[0]`` is the sequence and ``obs[1]`` is its count, both
      stored as bytes.
    min_count: Observations whose own count is below this value are skipped.
      The threshold is applied per observation, before counts are combined.

  Returns:
    A list with one dict per sample, in the same order as ``samples``.
  """
  seq_counts = []
  for sample in samples:
    decoded_sample = {}
    for obs in sample:
      decoded_seq = obs[0].decode()
      decoded_count = int(obs[1].decode())
      # Sequences of exactly 30 characters lose their first and last character.
      # NOTE(review): presumably this strips flanking bases so they match the
      # 28-character form - confirm.
      if len(decoded_seq) == 30:
        decoded_seq = decoded_seq[1:-1]
      if decoded_count >= min_count:
        # Trimming can map several observations onto the same key; add their
        # counts together instead of letting the last one overwrite the rest.
        decoded_sample[decoded_seq] = decoded_sample.get(decoded_seq, 0) + decoded_count
    seq_counts.append(decoded_sample)
  return seq_counts


# Decode both replicates using the default min_count.
rep1, rep2 = (make_seq_dict(encoded) for encoded in (rep1_encoded, rep2_encoded))

Find sequences that are common to all samples.

In [None]:
def get_common_seqs(rep):
  """Return the sequences that are present in every sample of a replicate.

  Also prints the number of entries in each sample as it is visited.

  Args:
    rep: Iterable of ``{sequence: count}`` dicts, one per sample.

  Returns:
    Set of keys shared by all samples; an empty set when ``rep`` has no samples.
  """
  rep_seqs = []
  for sample in rep:
    rep_seqs.append(set(sample))
    # Per-sample size, for comparison with the size of the intersection.
    print(len(sample))
  if not rep_seqs:
    # set.intersection() requires at least one set; with no samples there is
    # nothing in common.
    return set()
  return set.intersection(*rep_seqs)

# Sequences observed in every rep1 sample; get_common_seqs also prints each sample's size.
common = get_common_seqs(rep1)
# Number of sequences shared by all rep1 samples.
print(len(common))

Make a list of counts for each common sequence for each sample.

In [None]:
# One list of counts per sample, each ordered like iteration over `common`
# (iterating an unmodified set repeatedly yields the same order).
# Built from rep1 itself so it works for any number of samples, not a fixed four.
rep1_counts = [[sample[seq] for seq in common] for sample in rep1]

Make dataframe containing sequences and their corresponding counts for each sample.

In [None]:
# One row per common sequence: the sequence followed by its count in each sample.
column_names = ['Sequence', *rep1_labels]
data = zip(common, *rep1_counts)
df = pd.DataFrame(data, columns=column_names)

Calculate fractional counts to normalize for number of reads present in the sample.

In [None]:
def add_fractions(samples, frame=None):
  """Add a ``<sample>_fraction`` column for each named count column.

  Each fraction is the row's count divided by the total of that column.

  Args:
    samples: Names of the count columns to normalize.
    frame: DataFrame to modify in place. Defaults to the notebook-level ``df``.

  Returns:
    None. The new columns are written onto ``frame``.
  """
  if frame is None:
    frame = df
  for sample in samples:
    counts = frame[sample]
    # Series.sum() is vectorized; the built-in sum() iterates element by element.
    frame[f'{sample}_fraction'] = counts / counts.sum()

# Adds a '<label>_fraction' column to df for every rep1 sample column.
add_fractions(rep1_labels)

Calculate Gibson assembly efficiency for each library preparation method.

In [None]:
# Efficiency per sequence: fraction in the assembled sample divided by fraction in the input sample.
df['PCR_efficiency'] = df['small_assembled_PCR_1_fraction'].div(df['small_input_PCR_1_fraction'])
df['LIG_efficiency'] = df['small_assembled_LIG_1_fraction'].div(df['small_input_LIG_1_fraction'])

Look at correlation between library preparation methods.

In [None]:
# Assembled-sample fractions: PCR on x, LIG on y, one point per common sequence.
df.plot(kind='scatter', x='small_assembled_PCR_1_fraction', y='small_assembled_LIG_1_fraction')

In [None]:
# Pearson correlation (the pandas default) between the assembled-sample fractions.
df['small_assembled_PCR_1_fraction'].corr(other=df['small_assembled_LIG_1_fraction'], method='pearson')

In [None]:
# Input-sample fractions: PCR on x, LIG on y, one point per common sequence.
df.plot(kind='scatter', x='small_input_PCR_1_fraction', y='small_input_LIG_1_fraction')

In [None]:
# Pearson correlation (the pandas default) between the input-sample fractions.
df['small_input_PCR_1_fraction'].corr(other=df['small_input_LIG_1_fraction'], method='pearson')

In [None]:
# Distribution of per-sequence efficiency for the PCR library, in 100 bins.
pcr_efficiency = df['PCR_efficiency']
pcr_efficiency.hist(bins=100)

In [None]:
# Distribution of per-sequence efficiency for the LIG library, in 100 bins.
lig_efficiency = df['LIG_efficiency']
lig_efficiency.hist(bins=100)

Look at correlation of Gibson assembly efficiency between library prep methods.

In [None]:
# Pearson correlation (the pandas default) between the two efficiency columns.
df['PCR_efficiency'].corr(other=df['LIG_efficiency'], method='pearson')

In [None]:
# Per-sequence efficiency: PCR on x, LIG on y.
df.plot(kind='scatter', x='PCR_efficiency', y='LIG_efficiency')