In [1]:
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from GibsonPrediction.utils import get_fold_change, contains_RE_site, read_fastq

%matplotlib inline

In [2]:
# Load the per-sequence count table and derive the fold-change columns in one chain.
# NOTE(review): the two `1` arguments to get_fold_change are presumably pseudocounts — confirm
# against GibsonPrediction.utils. Only unpickle files from a trusted source.
df = (
    pd.read_pickle("../data/20230414_pipeline_run/output/counts/PCR_large_rep1_count_df.pkl")
    .assign(fold_change=lambda counts: get_fold_change(counts, 1, 1))
    .assign(log2_FC=lambda counts: np.log2(counts['fold_change']))
)

In [3]:
# Sequences with more than 100 input reads but fewer than 20 assembled reads,
# ordered from the lowest log2 fold change upwards.
df.query("input_count > 100 and assembled_count < 20").sort_values(by='log2_FC')

Unnamed: 0,sequence,input_count,assembled_count,fold_change,log2_FC
477226,TAGAAGCCGTCCGTGGGAGTCCTGATACAG,513,0,0.001961,-8.993884
144825,GATGGGCTATAGTGCTCTTCAAGTGCAGTG,410,0,0.002453,-8.671254
565822,TTCCAAGGGAGCGTTTCGGGGTCATCTTAG,377,0,0.002667,-8.550502
28942,GGGTGGTCTGCAGAGATGCGGTCCGTATCA,376,0,0.002674,-8.546680
108155,GGCAAGTAGGTGGCTGCGATTAGTTGACGG,316,0,0.003180,-8.296598
...,...,...,...,...,...
67910,TTGAGTCCATGCGCGGGGCGTGGTTTTTTC,140,18,0.135853,-2.879883
109048,CATGAGTGTGGTCAGTGGCCAGGATGGGCA,109,15,0.146643,-2.769619
347903,AGGAGGTCGTTGCTCTTCATGCAGTTGCCA,127,19,0.157527,-2.666331
293188,GGAAGCATTTCGGGGAAGAGCGGGATTAGT,121,19,0.165274,-2.597069


In [5]:
# Sequences with more than 20 input reads, ordered from the highest log2 fold change downwards.
df.query("input_count > 20").sort_values(by='log2_FC', ascending=False)

Unnamed: 0,sequence,input_count,assembled_count,fold_change,log2_FC
587121,AGGGTACCGCGCATGAAATCAGCCAGCTCG,21,61,2.841210,1.506505
268248,CATACACGAAACTTGGCCCGGCAAGGTCAT,21,60,2.795384,1.483046
168473,TTTTGCTTGTACTCATCTCAGCGTGGTCGC,28,79,2.781162,1.475688
564469,GCCTTAAAGGGGTACGGAGAGTGGGGACCA,22,60,2.673845,1.418916
541227,TGGAGATTTCCGTAGGCAGGCGTAGATATA,21,57,2.657906,1.410290
...,...,...,...,...,...
108155,GGCAAGTAGGTGGCTGCGATTAGTTGACGG,316,0,0.003180,-8.296598
28942,GGGTGGTCTGCAGAGATGCGGTCCGTATCA,376,0,0.002674,-8.546680
565822,TTCCAAGGGAGCGTTTCGGGGTCATCTTAG,377,0,0.002667,-8.550502
144825,GATGGGCTATAGTGCTCTTCAAGTGCAGTG,410,0,0.002453,-8.671254


In [6]:
# Flag every sequence for which contains_RE_site reports the 'GCTCTTC' site.
mask = df['sequence'].apply(lambda x: contains_RE_site(x, 'GCTCTTC'))

# Sort the flagged rows once and reuse the result for both the export and the preview.
df_cut_sorted = df[mask].sort_values('input_count', ascending=False)
df_cut_sorted.to_csv("cut.tsv", sep='\t')

# Only the last expression of a cell is rendered, so the preview has to come
# after the export; placed before it, the head() result was silently discarded.
df_cut_sorted.head()


In [7]:
# Keep only the sequences that were NOT flagged by `mask`, then show the ones with
# more than 100 input reads and fewer than 20 assembled reads, most depleted first.
df_no_cut = df.loc[~mask]
df_no_cut.query("input_count > 100 and assembled_count < 20").sort_values(by='log2_FC')

Unnamed: 0,sequence,input_count,assembled_count,fold_change,log2_FC
477226,TAGAAGCCGTCCGTGGGAGTCCTGATACAG,513,0,0.001961,-8.993884
565822,TTCCAAGGGAGCGTTTCGGGGTCATCTTAG,377,0,0.002667,-8.550502
28942,GGGTGGTCTGCAGAGATGCGGTCCGTATCA,376,0,0.002674,-8.546680
108155,GGCAAGTAGGTGGCTGCGATTAGTTGACGG,316,0,0.003180,-8.296598
86157,AGCGTGGGCGCACGCCAGTCTTTAGGCGTT,304,0,0.003305,-8.240925
...,...,...,...,...,...
388030,CTTCGGGAGTGTCAGTCAATGAATACGGAT,119,13,0.117620,-3.087795
434662,GTCTTCCCGATGCTTGGTGGCCTATCGCGC,104,13,0.134423,-2.895150
67910,TTGAGTCCATGCGCGGGGCGTGGTTTTTTC,140,18,0.135853,-2.879883
109048,CATGAGTGTGGTCAGTGGCCAGGATGGGCA,109,15,0.146643,-2.769619


In [8]:
# Read the gzipped FASTQ in text mode and keep only the sequence field of each
# (header, sequence, quality) record yielded by read_fastq.
fastq_path = "../data/HNJKLDSX3_2_1_TCTTCCTG-CGTTGGAA_150bp.concat.fastq.gz"
with gzip.open(fastq_path, mode='rt') as f:
    fq_seqs = [read for _header, read, _quality in read_fastq(f)]

In [13]:
# Uncut sequences with more than 100 input reads that were never observed assembled,
# ordered by ascending log2 fold change; the cell displays how many there are.
large_decrease_sorted = (
    df_no_cut
    .query("input_count > 100 and assembled_count == 0")
    .sort_values(by='log2_FC')
)
len(large_decrease_sorted)

70

In [14]:
def get_flanking_sites(N30, fq, flank_len=7):
    """Tally the bases found directly left and right of a sequence across reads.

    Each read is searched for the first occurrence of ``N30``; reads that do
    not contain it are skipped. For every hit, the ``flank_len`` characters
    immediately before the match and the ``flank_len`` characters immediately
    after it are recorded as a ``(left, right)`` pair.

    A flank is shorter than ``flank_len`` (possibly empty) when the match lies
    closer than ``flank_len`` characters to the corresponding end of the read.

    Parameters
    ----------
    N30 : str
        Sequence to locate in each read. Its actual length is used to find
        the start of the right flank, so it does not have to be 30 long.
    fq : iterable of str
        Read sequences to search.
    flank_len : int, default 7
        Number of characters to capture on each side of the match.

    Returns
    -------
    collections.Counter
        Maps each ``(left_flank, right_flank)`` tuple to the number of reads
        in which it was observed.
    """
    target_len = len(N30)
    sites = Counter()
    for seq in fq:
        i = seq.find(N30)
        if i == -1:
            continue
        # Clamp the start at 0: a negative start index would wrap around to the
        # end of the read and silently produce a wrong (usually empty) flank.
        left = seq[max(0, i - flank_len): i]
        right_start = i + target_len
        right = seq[right_start: right_start + flank_len]
        sites[(left, right)] += 1
    return sites


In [15]:
# For every strongly depleted sequence, count the flank pairs observed around it in the reads.
# Each sequence triggers a full pass over fq_seqs, so this cell scales with rows x reads.
large_decrease_sorted['flanking sites'] = large_decrease_sorted['sequence'].map(
    lambda n30: get_flanking_sites(n30, fq_seqs)
)

In [18]:
# Persist the depleted-sequence table, including its 'flanking sites' column, as CSV.
large_decrease_sorted.to_csv(path_or_buf='large_decrease_flanking.csv')

In [20]:
# Spot check: flank pairs around the sequence with the lowest log2 fold change.
get_flanking_sites(N30='TAGAAGCCGTCCGTGGGAGTCCTGATACAG', fq=fq_seqs)

Counter({('GCTCTTC', 'GAAGAGC'): 500,
         ('GCTCATC', 'GAAGAGC'): 1,
         ('GCTCTTC', 'GAATAGC'): 2,
         ('GCACTTC', 'GAAGAGC'): 2,
         ('TCTCTTC', 'GAAGAGC'): 1})

In [24]:
# Spot check: flank pairs around another zero-assembled sequence (377 input reads).
get_flanking_sites(N30='TTCCAAGGGAGCGTTTCGGGGTCATCTTAG', fq=fq_seqs)

Counter({('GCTCTTC', 'GAAGAGC'): 368,
         ('GCTCTTC', 'GAAGGGC'): 1,
         ('GCTCTTA', 'GAAGAGC'): 2,
         ('GCTCTTC', 'GAATAGC'): 1})