In [1]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import statsmodels.stats.multitest as smm
from scipy import stats
import seaborn as sns

from Bio import Seq

# Turn off pandas' chained-assignment check (mode.chained_assignment = None).
pd.set_option('mode.chained_assignment', None)

# Column-width display setting that makes pandas tables easier to view.
pd.set_option('display.max_colwidth', 0)

Change the following parameters to your output directory (where you want output files to be written), your metafile, and TNseq output. 

If these files aren't in the same directory as this notebook, you need to use the full path. For example, the full path of 'barseq_output' would be '/usr2/people/clairedubin/barseq/barseq_output'

Also change the control and experimental temperatures if needed.

In [2]:
# Directory that holds the BarSeq FASTQ files.
FASTQ_directory = "/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo"


## Barseq metafile

You'll need to create your own metafile - the easiest way is to edit it in Excel. A sample metafile is located at: /usr2/people/clairedubin/barseq/Kluyv_BarSeq_metadata.txt

    Fastq: path to barseq FASTQ for each competition
    SampleName: unique name for each competition, MUST include ctrl_temp or exp_temp as specified above
    DualIndex: index from sequencing (in read name lines of FASTQ and the sequencing info file that Adam sends)
    
    UsePrecounted: True if you've already run the Barseq counting script on a competition, False otherwise
    
    poolfile: path to Tnseq output file, should be the same for each competition
    output_dir: path to output directory
    minRandom: number of bases between start of read and DualIndex, usually in sequence of 1,2,3,4,1,2,3,4 for each sample
    maxRandom: same as minRandom
    
    BeforeBarcode: sequence preceding all barcodes
    AfterBarcode: sequence following all barcodes
    BarcodeLengths: lengths of barcodes to search for, with expected length of barcode first

    

In [3]:
# List the FASTQ files in FASTQ_directory (IPython shell escape; {FASTQ_directory}
# is interpolated from the Python variable before the command runs).
!ls {FASTQ_directory}/*fastq

/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT085_S277_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT086_S278_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT087_S279_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT088_S280_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT089_S281_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT090_S282_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT091_S283_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT092_S284_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT093_S285_L003_R1_001.fastq
/usr2/people/calabel1/NovaSeq_Cinco_d

In [4]:
#format: sample_name, fastq_file, dual_index, offset
#add or delete more lines as needed
#
# fastq_file is resolved against FASTQ_directory below, so changing the
# directory in one place updates every sample. An absolute path may also be
# given here; os.path.join then uses it unchanged.
_sample_table = [
    ('28C_1', 'FEBA_BS_365_IT085_S277_L003_R1_001.fastq', 'GCCTTA', '1'),
    ('28C_2', 'FEBA_BS_365_IT086_S278_L003_R1_001.fastq', 'GCTCCA', '2'),
    ('28C_3', 'FEBA_BS_365_IT087_S279_L003_R1_001.fastq', 'GGCACA', '3'),
    ('28C_4', 'FEBA_BS_365_IT088_S280_L003_R1_001.fastq', 'GGCCTG', '4'),
    ('28C_5', 'FEBA_BS_365_IT089_S281_L003_R1_001.fastq', 'TCTACC', '1'),
    ('28C_6', 'FEBA_BS_365_IT090_S282_L003_R1_001.fastq', 'TGAATG', '2'),
    ('42C_1', 'FEBA_BS_365_IT091_S283_L003_R1_001.fastq', 'TGCCAT', '3'),
    ('42C_2', 'FEBA_BS_365_IT092_S284_L003_R1_001.fastq', 'TGCTGG', '4'),
    ('42C_3', 'FEBA_BS_365_IT093_S285_L003_R1_001.fastq', 'TGGCGC', '1'),
    ('42C_4', 'FEBA_BS_365_IT094_S286_L003_R1_001.fastq', 'TTCGAA', '2'),
    ('42C_5', 'FEBA_BS_365_IT095_S287_L003_R1_001.fastq', 'TTCTCC', '3'),
    ('42C_6', 'FEBA_BS_365_IT096_S288_L003_R1_001.fastq', 'AGGTTT', '4'),
]

# Final per-sample records used by the cells below:
# (sample_name, full fastq path, dual_index, offset) -- all strings.
sample_info = [
    (name, os.path.join(FASTQ_directory, fastq_file), dual_index, offset)
    for name, fastq_file, dual_index, offset in _sample_table
]

In [7]:
# Peek at the first reads of one FASTQ; the dual index appears at the end of
# each read-name line (e.g. "1:N:0:GCCTTA").
!head /usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT085_S277_L003_R1_001.fastq

@K00364:269:HNCLMBBXY:3:1101:29488:1349 1:N:0:GCCTTA
TNTTCCGGTCGACCTGCAGCGTACGCGTTCAGACCTTCCGTAATCAGAGAC
+
A#<<AFF<-A<AJJJAJJ<JF7AJ7FFF<AF--7<--FA-FJJFJ<<A-AF
@K00364:269:HNCLMBBXY:3:1101:2077:1367 1:N:0:GCCTTA
GATTCCGGTCGACCTGCAGCGTACGTGGGTCCGTATGGAAGTCAAAGAGAC
+
AAAAFJ-FAJJFJJFJJJJJJFJJ<JJF7-7F77AF--F<JF-FJJJJFFJ
@K00364:269:HNCLMBBXY:3:1101:2808:1367 1:N:0:GCCTTA
GATTCCGGTCGACCTGCAGCGTACGATGAGGGTGCTGAGTTGGCAAGAGAC


In [5]:
# Sample names whose barcodes were already counted in an earlier run go here;
# leave empty when nothing has been counted yet.
precounted_samples = list()

In [6]:
# Settings shared by every replicate.

# File names for the generated metafile and the counting log.
metafile_name, logfile_name = "BarSeq_metafile.txt", "BarSeq_counting.log"

# TnSeq pool file and the directory that receives the BarSeq outputs.
poolfile = "30_to_41_pools_poolfile_combined_annotated.csv"
output_dir = "TNSeq_barseq_outputs"

# Sequences flanking every barcode, and the barcode lengths to search for
# (expected length listed first).
before_barcode, after_barcode = "GTCGACCTGCAGCGTACG", "AGAGACCTC"
barcode_lengths = "20,19,18,21"

In [8]:
# Header of the tab-separated metafile; each row written below supplies its
# values in exactly this column order.
metafile_columns = ['FileIndex','SampleName', 'Fastq', 'DualIndex', 'Poolfile', 'OutputDir', 'minRandom', 'maxRandom',
                   'BeforeBarcode', 'AfterBarcode', 'BarcodeLengths', 'UsePrecounted']

# A precounted sample still needs its Fastq/DualIndex/offset columns, and those
# only exist in sample_info. Reject unknown names before the file is opened so
# a typo cannot silently drop a sample from the metafile.
known_sample_names = {entry[0] for entry in sample_info}
missing_precounted = [str(name) for name in precounted_samples if name not in known_sample_names]
if missing_precounted:
    raise ValueError('precounted_samples not found in sample_info: ' + ', '.join(missing_precounted))

with open(metafile_name, 'w') as f:

    #write column names
    f.write('\t'.join(metafile_columns)+'\n')

    #write exactly one line per sample
    for sample_name, fastq, dual_index, offset in sample_info:

        # Mark the row as precounted when its name was listed in precounted_samples.
        use_precounted = 'True' if sample_name in precounted_samples else 'False'

        #DUAL INDICES ARE BACKWARDS FROM WHAT THE FLOWCELL OUTPUT SAYS
        # sample_name fills both FileIndex and SampleName; offset fills both
        # minRandom and maxRandom.
        to_write = '\t'.join([sample_name, sample_name, fastq, dual_index[::-1], poolfile, output_dir, offset, offset,
                             before_barcode, after_barcode, barcode_lengths, use_precounted])
        f.write(to_write+'\n')




### Check metafile

In [9]:
# Read the tab-separated metafile back in and display it for a visual check.
pd.read_table(metafile_name)

Unnamed: 0,FileIndex,SampleName,Fastq,DualIndex,Poolfile,OutputDir,minRandom,maxRandom,BeforeBarcode,AfterBarcode,BarcodeLengths,UsePrecounted
0,28C_1,28C_1,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT085_S277_L003_R1_001.fastq,ATTCCG,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,1,1,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
1,28C_2,28C_2,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT086_S278_L003_R1_001.fastq,ACCTCG,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,2,2,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
2,28C_3,28C_3,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT087_S279_L003_R1_001.fastq,ACACGG,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,3,3,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
3,28C_4,28C_4,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT088_S280_L003_R1_001.fastq,GTCCGG,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,4,4,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
4,28C_5,28C_5,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT089_S281_L003_R1_001.fastq,CCATCT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,1,1,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
5,28C_6,28C_6,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT090_S282_L003_R1_001.fastq,GTAAGT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,2,2,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
6,42C_1,42C_1,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT091_S283_L003_R1_001.fastq,TACCGT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,3,3,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
7,42C_2,42C_2,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT092_S284_L003_R1_001.fastq,GGTCGT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,4,4,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
8,42C_3,42C_3,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT093_S285_L003_R1_001.fastq,CGCGGT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,1,1,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False
9,42C_4,42C_4,/usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_IT094_S286_L003_R1_001.fastq,AAGCTT,30_to_41_pools_poolfile_combined_annotated.csv,TNSeq_barseq_outputs,2,2,GTCGACCTGCAGCGTACG,AGAGACCTC,20191821,False


## Barseq counting

Might take a few hours!

In [10]:
# Run the barcode-counting script through the shell, passing the metafile (-m)
# and log file (-l) names interpolated from the Python variables set earlier.
!python3 /usr2/people/clairedubin/barseq/latest_pipeline/RBseq_Count_BarCodes_v2.py -m {metafile_name} -l {logfile_name}

2022-05-27 12:13:31 RBseq_Count_BarCodes.py
2022-05-27 12:13:31 Version: 1.1.4
2022-05-27 12:13:31 Release Date: July 16, 2020
2022-05-27 12:13:31 Options passed:  metafile:BarSeq_metafile.txt  logFile:BarSeq_counting.log  minQual:10  matchBefore:6  matchAfter:6  quietMode:False 
2022-05-27 12:13:31 Logging status updates in BarSeq_counting.log
2022-05-27 12:13:31 Loading TnSeq library metadata from BarSeq_metafile.txt
2022-05-27 12:13:31 Setting output directory as: TNSeq_barseq_outputs (from first line of metadata file)
2022-05-27 12:13:31 Loading mapped barcodes in mutant pool from: 30_to_41_pools_poolfile_combined_annotated.csv (from first line of metadata file)
2022-05-27 12:13:33 Read 270652 barcodes from 30_to_41_pools_poolfile_combined_annotated.csv
2022-05-27 12:13:33 Finding barcodes in fastqs and counting occurances
2022-05-27 12:13:33 ---------------------
2022-05-27 12:13:33   Mapping reads from /usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_Mayo/FEBA_BS_365_I

2022-05-27 12:25:17   Total barcodes seen (incudes sequencing errors): 363194
2022-05-27 12:25:17   Matching barcodes to poolfile
2022-05-27 12:25:17     Number of barcodes from poolfile seen: 12082
2022-05-27 12:25:17     Reads with barcodes from poolfile: 343329
2022-05-27 12:25:17   Most abundant barcode: ATTATGCCTGGTGTGGTGTC seen 1409 times.
2022-05-27 12:25:17   Number of reads that differ from this barcode by one base pair (likely sequencing errors):30
2022-05-27 12:25:17   Estimated sequencing error rate for barcodes: 2.1%
2022-05-27 12:25:17   Barcodes seen once (highly inflated by sequencing errors): 162757
2022-05-27 12:25:17   Barcodes seen twice (slightly inflated by sequencing errors): 7985
2022-05-27 12:25:17   Barcodes seen three times or more: 192452
2022-05-27 12:25:17   Chao estimate of population size (ones^2/2*twos): 1700000
2022-05-27 12:25:17 ---------------------
2022-05-27 12:25:17   Mapping reads from /usr2/people/calabel1/NovaSeq_Cinco_de_Mayo/BarSeq_Cinco_de_

2022-05-27 12:33:53       55164 reads without expected sequence before the barcode region.
2022-05-27 12:33:53       12143 reads without expeced sequence after the barcode region.
2022-05-27 12:33:53       183 reads with noncompliant barcdes. (Contains Ns, etc).
2022-05-27 12:33:53       187 reads with quality scores less than 10
2022-05-27 12:33:53   Saving barcode counts to TNSeq_barseq_outputs/countsFiles/42C_2.counts
2022-05-27 12:33:54   Total barcodes seen (incudes sequencing errors): 424325
2022-05-27 12:33:54   Matching barcodes to poolfile
2022-05-27 12:33:54     Number of barcodes from poolfile seen: 31819
2022-05-27 12:33:54     Reads with barcodes from poolfile: 219540
2022-05-27 12:33:54   Most abundant barcode: CTATGCGCCGAGGTTGCTTA seen 1337 times.
2022-05-27 12:33:54   Number of reads that differ from this barcode by one base pair (likely sequencing errors):27
2022-05-27 12:33:54   Estimated sequencing error rate for barcodes: 2.0%
2022-05-27 12:33:54   Barcodes seen onc

2022-05-27 12:38:58   Total barcodes seen (incudes sequencing errors): 573962
2022-05-27 12:38:58   Matching barcodes to poolfile
2022-05-27 12:38:58     Number of barcodes from poolfile seen: 51530
2022-05-27 12:38:58     Reads with barcodes from poolfile: 447034
2022-05-27 12:38:58   Most abundant barcode: GTGAGCCCCAAAGGCAAGCA seen 2685 times.
2022-05-27 12:38:58   Number of reads that differ from this barcode by one base pair (likely sequencing errors):41
2022-05-27 12:38:58   Estimated sequencing error rate for barcodes: 1.5%
2022-05-27 12:38:58   Barcodes seen once (highly inflated by sequencing errors): 215536
2022-05-27 12:38:58   Barcodes seen twice (slightly inflated by sequencing errors): 37525
2022-05-27 12:38:58   Barcodes seen three times or more: 320901
2022-05-27 12:38:58   Chao estimate of population size (ones^2/2*twos): 620000
2022-05-27 12:38:58 ---------------------
2022-05-27 12:38:58 Saving summary statistics for fastqs to: TNSeq_barseq_outputs/fastqSummaryStats.t