In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt

Load data

In [None]:
# Load the four saved count arrays. Later cells call .decode() on the first
# two fields of every record, treating them as (sequence, count).
input_1 = np.load('../data/counts/large_libs_directional_clust_t3/GP_PCR_2_counts.npy')
input_2 = np.load('../data/counts/large_libs_directional_clust_t3/GP_PCR_4_counts.npy')
assembled_1 = np.load('../data/counts/large_libs_directional_clust_t3/GP_PCR_6_counts.npy')
assembled_2 = np.load('../data/counts/large_libs_directional_clust_t3/GP_PCR_8_counts.npy')

# Group the arrays by replicate; each list is ordered [input, assembled].
rep1_encoded = [input_1, assembled_1]
rep2_encoded = [input_2, assembled_2]

# Labels in the same order as the arrays in the lists above.
rep1_labels = ['input_1', 'assembled_1']
rep2_labels = ['input_2', 'assembled_2']

Check distributions of counts

In [None]:
# Integer bin edges 0..199 shared by every histogram in this cell.
count_bins = range(200)

# Replicate 1: decode the count (second field of each record) for every sample,
# then draw one histogram per sample.
rep1_counts = []
for encoded_sample in rep1_encoded:
  rep1_counts.append([int(record[1].decode()) for record in encoded_sample])
for sample in rep1_counts:
  plt.hist(sample, bins=count_bins)
  plt.show()

# Replicate 2: same procedure.
rep2_counts = []
for encoded_sample in rep2_encoded:
  rep2_counts.append([int(record[1].decode()) for record in encoded_sample])
for sample in rep2_counts:
  plt.hist(sample, bins=count_bins)
  plt.show()

Convert data to dict with sequences as keys and counts of the sequence as values.

In [None]:
def make_seq_dict(samples):
  """Convert encoded samples into per-sample ``{sequence: count}`` dicts.

  Args:
    samples: Iterable of samples. Each sample is an iterable of records
      whose first element is the sequence and whose second element is the
      count; both elements must support ``.decode()`` (e.g. ``bytes``).

  Returns:
    A list with one dict per sample, in input order, mapping each decoded
    sequence to its integer count.

  Sequences that are exactly 30 characters long have their first and last
  character removed. When several records of one sample end up under the
  same key (for example after that trimming), their counts are added
  together rather than the later record overwriting the earlier one.
  """
  seq_counts = []
  for sample in samples:
    decoded_sample = {}
    for obs in sample:
      decoded_seq = obs[0].decode()
      decoded_count = int(obs[1].decode())
      # Drop the first and last character of 30-character sequences
      # (presumably one flanking base on each side -- TODO confirm).
      if len(decoded_seq) == 30:
        decoded_seq = decoded_seq[1:-1]
      # Accumulate so that key collisions do not silently discard reads.
      decoded_sample[decoded_seq] = decoded_sample.get(decoded_seq, 0) + decoded_count
    seq_counts.append(decoded_sample)
  return seq_counts


# Decode each replicate into a list of {sequence: count} dicts, one dict per sample.
rep1 = make_seq_dict(rep1_encoded)
rep2 = make_seq_dict(rep2_encoded)

Find sequences that are common to all samples.

In [None]:
def get_common_seqs(rep):
  """Return the set of sequences that occur in every sample of a replicate.

  Args:
    rep: Iterable of dicts keyed by sequence, one dict per sample.

  Returns:
    A set with the keys shared by all samples. An empty ``rep`` yields an
    empty set.

  Side effects:
    Prints the number of keys of each sample, in order.
  """
  rep_seqs = []
  for sample in rep:
    rep_seqs.append(set(sample))
    print(len(sample))
  if not rep_seqs:
    # set.intersection() needs at least one set to operate on; with no
    # samples there is nothing in common.
    return set()
  return set.intersection(*rep_seqs)

# Sequences present in every sample of replicate 1, followed by how many there are.
common = get_common_seqs(rep1)
print(len(common))

In [None]:
# Sequences found in the first sample of replicate 1 that are not shared by
# every sample.
first_sample = rep1[0]
removed = set(first_sample).difference(common)

# Pair each of those sequences with its count in that first sample.
removed_counts = [(seq, int(first_sample[seq])) for seq in removed]

# Print up to 20 of them, highest count first.
removed_sorted = sorted(removed_counts, key=lambda pair: pair[1], reverse=True)
print(removed_sorted[:20])

Make a list of counts for each sample, keeping only the common sequences that have more than 20 reads in at least one sample.

In [None]:
# One list of counts per sample of replicate 1 (two samples).
# NOTE: only sequences passing the filter below are appended, so these lists
# line up with the filtered subset of `common`, not with `common` itself.
rep1_counts = [[], []]
for seq in common:
  # Keep the sequence if at least one sample has more than 20 reads for it.
  passes_filter = any(sample[seq] > 20 for sample in rep1)
  if not passes_filter:
    continue
  for idx, sample in enumerate(rep1):
    rep1_counts[idx].append(sample[seq])

len(rep1_counts[0])

Make dataframe containing sequences and their corresponding counts for each sample.

In [None]:
# Build the rows directly from the sequences that pass the >20-read filter
# (more than 20 reads in at least one sample) so that every sequence is paired
# with its own counts. Zipping the full `common` set against the filtered
# count lists pairs them purely by position and therefore assigns counts to
# the wrong sequences as soon as a single sequence fails the filter.
kept_seqs = [seq for seq in common if any(sample[seq] > 20 for sample in rep1)]
data = [(seq, *(sample[seq] for sample in rep1)) for seq in kept_seqs]
df = pd.DataFrame(data, columns=['Sequence'] + rep1_labels)

Calculate fractional counts to normalize for number of reads present in the sample.

In [None]:
def add_fractions(samples):
  """Add a ``<sample>_fraction`` column to the global ``df`` for each sample.

  Each new column holds the sample's counts divided by the sum of that
  sample's column.

  Args:
    samples: Iterable of column names present in ``df``.
  """
  for name in samples:
    counts = df[name]
    df[f'{name}_fraction'] = counts / sum(counts)

# Add a '<label>_fraction' column to df for each replicate-1 sample.
add_fractions(rep1_labels)

In [None]:
def add_CPM(samples):
  """Add a ``<sample>_CPM`` column to the global ``df`` for each sample.

  Each new column holds the sample's counts multiplied by 1,000,000 and
  divided by the sum of that sample's column.

  Args:
    samples: Iterable of column names present in ``df``.
  """
  for name in samples:
    counts = df[name]
    df[f'{name}_CPM'] = (counts * 1_000_000) / sum(counts)

# Add a '<label>_CPM' column to df for each replicate-1 sample.
add_CPM(rep1_labels)

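Calculate the Gibson assembly efficiency of each sequence as the ratio of its CPM in the assembled sample to its CPM in the input sample, together with the log2 of that ratio.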

In [None]:
# Per-sequence ratio of assembled CPM to input CPM.
df['efficiency'] = df['assembled_1_CPM'].div(df['input_1_CPM'])
# log2 of that ratio.
df['logFC'] = np.log2(df['efficiency'])

In [None]:
# Histogram of the per-sequence fold change, with the x axis limited to 0-4.
ax = df['efficiency'].hist(bins=300)
ax.set_xlim([0, 4])
ax.set_xlabel('fold change')
ax.set_ylabel('count')
ax.grid(None)
# Write the current figure to an SVG file in the working directory.
plt.savefig("fc.svg")

In [None]:
# Histogram of the per-sequence log2 fold change, with the x axis limited to -2..2.
ax = df['logFC'].hist(bins=150)
ax.set_xlim([-2, 2])
ax.set_xlabel('log2(FC)')
ax.set_ylabel('count')
ax.grid(None)
# Write the current figure to an SVG file in the working directory.
plt.savefig("logfc.svg")

In [None]:
def GC(seq):
  """Return the fraction of bases in ``seq`` that are G or C.

  Args:
    seq: Sequence of single-character bases (typically a ``str``).

  Returns:
    A float between 0 and 1. Matching is case-insensitive, and an empty
    sequence yields 0.0 instead of raising ``ZeroDivisionError``.
  """
  if len(seq) == 0:
    return 0.0
  gc_bases = {'G', 'C', 'g', 'c'}
  return sum(base in gc_bases for base in seq) / len(seq)

In [None]:
# GC fraction of every sequence, stored as a new column.
df['GC'] = df['Sequence'].map(GC)
# Preview the first ten values.
df['GC'].head(10)

In [None]:
# Scatter plot of log2 fold change (y) against GC fraction (x), one point per row of df.
df.plot.scatter(x='GC', y='logFC')

In [None]:
# Correlation between log2 fold change and GC fraction, using Series.corr with its default method.
df['logFC'].corr(df['GC'])

In [None]:
# Rows whose log2 fold change is below -7.
df_outliers = df.loc[df['logFC'] < -7]

In [None]:
# Show the sequence, log2 fold change and raw input/assembled counts of those rows.
df_outliers[['Sequence', 'logFC', 'input_1', 'assembled_1']]