In [1]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import statsmodels.stats.multitest as smm
from scipy import stats
import seaborn as sns

from Bio import Seq

#do not warn on chained assignment (option set to None)
pd.set_option('mode.chained_assignment', None)

#column-width display option, kept so wide cells such as long file paths are easier to view
#(presumably 0 turns truncation off -- confirm for the installed pandas version)
pd.options.display.max_colwidth = 0

Change the following parameters to your output directory (where you want output files to be written), your metafile, and TNseq output. 

If these files aren't in the same directory as this notebook, you need to use the full path. For example, the full path of 'barseq_output' would be '/usr2/people/clairedubin/barseq/barseq_output'

Also change the control and experimental temperatures if needed.

In [2]:
#where the BarSeq FASTQs are
#(this directory is interpolated into the `ls` shell command in the next code cell)
FASTQ_directory = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs'


## Barseq metafile

You'll need to create your own metafile - the easiest way is to edit it in Excel. A sample metafile is located at: /usr2/people/clairedubin/barseq/Kluyv_BarSeq_metadata.txt

    Fastq: path to barseq FASTQ for each competition
    SampleName: unique name for each competition, MUST include ctrl_temp or exp_temp as specified above
    DualIndex: index from sequencing (in read name lines of FASTQ and the sequencing info file that Adam sends)
    
    UsePrecounted: True if you've already run the Barseq counting script on a competition, False otherwise
    
    poolfile: path to Tnseq output file, should be the same for each competition
    output_dir: path to output directory
    minRandom: number of bases between start of read and DualIndex, usually in sequence of 1,2,3,4,1,2,3,4 for each sample
    maxRandom: same as minRandom
    
    BeforeBarcode: sequence preceding all barcodes
    AfterBarcode: sequence following all barcodes
    BarcodeLengths: lengths of barcodes to search for, with expected length of barcode first

    

In [3]:
#list every file in FASTQ_directory whose name ends in "fastq"
!ls {FASTQ_directory}/*fastq

/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB006_S1_L001_R1_001.fastq
/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB007_S1_L001_R1_001.fastq
/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB008_S1_L001_R1_001.fastq
/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB009_S1_L001_R1_001.fastq


In [4]:
#format: sample_name, fastq_path, dual_index, offset
#add or delete more lines as needed
#
#Each FASTQ path is built from FASTQ_directory, so changing that one variable
#re-points every sample; only the file name is spelled out per line.
#All four fields are strings because they are joined with tabs when the metafile is written.

sample_info = [('OORB006_1', os.path.join(FASTQ_directory, 'OORB006_S1_L001_R1_001.fastq'), 'ATCACG', '1'),
               ('OORB007_1', os.path.join(FASTQ_directory, 'OORB007_S1_L001_R1_001.fastq'), 'CGATGT', '2'),
               ('OORB008_1', os.path.join(FASTQ_directory, 'OORB008_S1_L001_R1_001.fastq'), 'ATCACG', '1'),
               ('OORB009_1', os.path.join(FASTQ_directory, 'OORB009_S1_L001_R1_001.fastq'), 'CGATGT', '2'),
               ]

In [5]:
#print the first lines of one FASTQ to inspect the read layout and the index shown in the header lines
!head /usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB006_S1_L001_R1_001.fastq

@M02248:478:000000000-DMCTD:1:1101:15779:1330 1:N:0:NTCACG
AGCACTAGTCGACCTGCAGCGTACGTTCTACTTCTTAGCCAACTAAGAGACCTCGTGGACATCAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAA
+
CCCDDFFFFFCCGGGGGGGGGGGGGHHHGHHHHHHHHHHHGHGHHHHHHHHGHHGHHGGHHHHHHHHHHGGGGHHHGHHHGHHHGHHHHHHHHHHHHHHHHHGGHFGHHGGHHHHGGGHGHHHHHHHHHFHHGGGCD>--
@M02248:478:000000000-DMCTD:1:1101:15674:1330 1:N:0:NTCACG
CGCACTAGTCGACCTGCAGCGTACGGTGCTCATAACAGACGGGGTAGAGACCTCGTGGACATCAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAA
+
BCCCDDFFFFDCGGGGGGGGGGGGGGGHGHHHHHHHHHHHGGGGGGGHHHHGHHGGHGGHHHHHHHHHHGGGGHHHHHHHGHHHGHHHHHHHHHHGGHHHHHGGHAGHHGGHHHHGGGHGHHHHHHHHGHGHGG@D@-<-
@M02248:478:000000000-DMCTD:1:1101:15877:1332 1:N:0:NTCACG
CGCACTAGTCGACCTGCAGCGTACGTAATTTGGCACACATACCACAGAGACCTCGTGGACATCAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAA


In [6]:
#if there are any samples where you already counted barcodes, add the sample names here
#(a list of sample-name strings; it is iterated by the metafile-writing cell below,
# and an empty list means no sample is treated as precounted)
precounted_samples = []

In [11]:
#these variables should be the same for each replicate

#metafile written by the next cell, and log file passed to the counting script
metafile_name = 'BarSeq_metafile_12_04_2023.txt'
logfile_name = 'BarSeq_counting_12_04_2023.log'
#TnSeq output file; copied into the Poolfile column of every metafile row
poolfile = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv'
#copied into the OutputDir column of every metafile row
output_dir = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023'
#sequences flanking the barcode (BeforeBarcode / AfterBarcode columns)
#the current values are the reverse complements of the previous after/before values kept below
before_barcode = 'GTCGACCTGCAGCGTACG' #bottom strand sequence was used
#previous value: before_barcode = 'GATGTCCACGAGGTCTCT'
after_barcode =  'AGAGACCTCGTGGACATC' #bottom strand sequence was used
#previous value: after_barcode = 'CGTACGCTGCAGGTCGAC'
#comma-separated barcode lengths to search for (BarcodeLengths column), expected length first
barcode_lengths = '20,19,18,21'

In [12]:
metafile_columns = ['FileIndex','SampleName', 'Fastq', 'DualIndex', 'Poolfile', 'OutputDir', 'minRandom', 'maxRandom',
                   'BeforeBarcode', 'AfterBarcode', 'BarcodeLengths', 'UsePrecounted']

#Write the tab-separated metafile: one header line, then one row per entry of sample_info.
#UsePrecounted is 'True' for samples named in precounted_samples and 'False' for all others.

#a precounted name that matches no entry of sample_info cannot be given a row
#(there is no FASTQ / index / offset for it), so fail loudly instead of silently leaving it out
known_sample_names = {info[0] for info in sample_info}
unknown_precounted = [name for name in precounted_samples if name not in known_sample_names]
if unknown_precounted:
    raise ValueError('precounted_samples contains names that are not in sample_info: '
                     + ', '.join(map(str, unknown_precounted)))

with open(metafile_name, 'w') as f:

    #write column names
    f.write('\t'.join(metafile_columns)+'\n')

    #write a line for each pool
    for sample_name, fastq, dual_index, offset in sample_info:

        use_precounted = 'True' if sample_name in precounted_samples else 'False'

        #DUAL INDICES ARE BACKWARDS FROM WHAT THE FLOWCELL OUTPUT SAYS, so the index is reversed here
        #sample_name fills both FileIndex and SampleName; offset fills both minRandom and maxRandom
        to_write = '\t'.join([sample_name, sample_name, fastq, dual_index[::-1], poolfile, output_dir, offset, offset,
                             before_barcode, after_barcode, barcode_lengths, use_precounted])
        f.write(to_write+'\n')




### Check metafile

In [13]:
#read the tab-separated metafile back in and display it, to check the columns and rows written above
pd.read_csv(metafile_name, sep='\t')

Unnamed: 0,FileIndex,SampleName,Fastq,DualIndex,Poolfile,OutputDir,minRandom,maxRandom,BeforeBarcode,AfterBarcode,BarcodeLengths,UsePrecounted
0,OORB006_1,OORB006_1,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB006_S1_L001_R1_001.fastq,GCACTA,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023,1,1,GTCGACCTGCAGCGTACG,AGAGACCTCGTGGACATC,20191821,False
1,OORB007_1,OORB007_1,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB007_S1_L001_R1_001.fastq,TGTAGC,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023,2,2,GTCGACCTGCAGCGTACG,AGAGACCTCGTGGACATC,20191821,False
2,OORB008_1,OORB008_1,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB008_S1_L001_R1_001.fastq,GCACTA,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023,1,1,GTCGACCTGCAGCGTACG,AGAGACCTCGTGGACATC,20191821,False
3,OORB009_1,OORB009_1,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/BarSeq_FASTQs/OORB009_S1_L001_R1_001.fastq,TGTAGC,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv,/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023,2,2,GTCGACCTGCAGCGTACG,AGAGACCTCGTGGACATC,20191821,False


## Barseq counting

Might take a few hours!

In [14]:
#run the barcode-counting script: -m is the metafile written above, -l is the log file
!python3 /usr2/people/clairedubin/barseq/latest_pipeline/RBseq_Count_BarCodes_v2.py -m {metafile_name} -l {logfile_name}

2023-12-04 12:01:30 RBseq_Count_BarCodes.py
2023-12-04 12:01:30 Version: 1.1.4
2023-12-04 12:01:30 Release Date: July 16, 2020
2023-12-04 12:01:30 Options passed:  metafile:BarSeq_metafile_12_04_2023.txt  logFile:BarSeq_counting_12_04_2023.log  minQual:10  matchBefore:6  matchAfter:6  quietMode:False 
2023-12-04 12:01:30 Logging status updates in BarSeq_counting_12_04_2023.log
2023-12-04 12:01:30 Loading TnSeq library metadata from BarSeq_metafile_12_04_2023.txt
2023-12-04 12:01:30 Setting output directory as: /usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_nMainLocation_12_04_2023 (from first line of metadata file)
2023-12-04 12:01:30 Loading mapped barcodes in mutant pool from: /usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNseq_single_position_nMainLocation_annotated_file.csv (from first line of metadata file)
2023-12-04 1

2023-12-04 12:01:55   Total barcodes seen (incudes sequencing errors): 4657
2023-12-04 12:01:55   Matching barcodes to poolfile
2023-12-04 12:01:55     Number of barcodes from poolfile seen: 82
2023-12-04 12:01:55     Reads with barcodes from poolfile: 122514
2023-12-04 12:01:55   Most abundant barcode: GGTATCGGAATTGAAGATCT seen 129357 times.
2023-12-04 12:01:55   Number of reads that differ from this barcode by one base pair (likely sequencing errors):2434
2023-12-04 12:01:55   Estimated sequencing error rate for barcodes: 1.8%
2023-12-04 12:01:55   Barcodes seen once (highly inflated by sequencing errors): 3430
2023-12-04 12:01:55   Barcodes seen twice (slightly inflated by sequencing errors): 347
2023-12-04 12:01:55   Barcodes seen three times or more: 880
2023-12-04 12:01:55   Chao estimate of population size (ones^2/2*twos): 17000
2023-12-04 12:01:55 ---------------------
2023-12-04 12:01:55 Saving summary statistics for fastqs to: /usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_seq

### Convert the tab-separated annotated file to a comma-separated CSV

In [None]:
#load the pool file, parsing it as tab-separated text even though its name ends in .csv
df=pd.read_csv(poolfile,sep='\t')

In [None]:
#write the table back out comma-separated (to_csv's default separator);
#index=False is the documented way to leave the row index out of the file
df.to_csv('/usr2/people/shollyt22/shollyt22//TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/Tnseq_11_04_2023_annotated_mod1.csv',index=False)

In [None]:
#reload the comma-separated copy written by the previous cell; this replaces the contents of df
df=pd.read_csv('/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/Tnseq_11_04_2023_annotated_mod1.csv')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated poolCount.txt table into a DataFrame
# NOTE(review): this path points at TNSeq_barseq_outputs_11_06_2023, which is not the output_dir
# configured earlier in this notebook -- confirm this is the run you mean to plot.
df = pd.read_csv('/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_11_06_2023/poolCount.txt', sep='\t')

# Select the columns you want to use for the scatter plot
x_column = 'OORB006_1'
y_column = 'OORB007_1'

# Create a scatter plot using the selected columns; the axis labels are built from
# the column names so they cannot drift from what is actually plotted
plt.scatter(df[x_column], df[y_column], color='black')
plt.xlabel(x_column + ' Abundance', fontsize=14)
plt.ylabel(y_column + ' Abundance', fontsize=14)
plt.title('Plot of TN_insertion abundances', fontsize=16, color='purple')
#plt.grid(True)  # Optionally, add grid lines

# Put the plotted columns in the file name so this figure gets its own PNG
# instead of sharing (and overwriting) one file name with the other scatter plots
plt.savefig('fitness_of_TN_abundances_09_01_2023_' + x_column + '_vs_' + y_column + '.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the tab-separated poolCount.txt table into a DataFrame
# NOTE(review): this path points at TNSeq_barseq_outputs_11_06_2023, which is not the output_dir
# configured earlier in this notebook -- confirm this is the run you mean to plot.
df = pd.read_csv('/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_11_06_2023/poolCount.txt', sep='\t')

# Select the columns you want to use for the scatter plot
x_column = 'OORB006_1'
y_column = 'OORB007_1'

# Create a scatter plot using the selected columns; the axis labels are built from
# the column names so they cannot drift from what is actually plotted
plt.scatter(df[x_column], df[y_column], color='black')
plt.xlabel(x_column + ' Abundance', fontsize=14)
plt.ylabel(y_column + ' Abundance', fontsize=14)
plt.title('Plot of TN_insertion abundances', fontsize=16, color='purple')

# Adjust the x and y-axis scale ranges (zoom in on the 0-60 region)
plt.xlim(0, 60)  # Adjust the range for the x-axis
plt.ylim(0, 60)  # Adjust the range for the y-axis

# Optionally, add grid lines
#plt.grid(True)

# Put the plotted columns and the zoom in the file name so this figure gets its own PNG
# instead of sharing (and overwriting) one file name with the other scatter plots
plt.savefig('fitness_of_TN_abundances_09_01_2023_' + x_column + '_vs_' + y_column + '_zoom_0_60.png', dpi=600, bbox_inches='tight')
plt.show()


In [None]:
# Load the tab-separated poolCount.txt table into a DataFrame
# NOTE(review): this path points at TNSeq_barseq_outputs_11_06_2023, which is not the output_dir
# configured earlier in this notebook -- confirm this is the run you mean to plot.
df = pd.read_csv('/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_11_06_2023/poolCount.txt', sep='\t')

# Select the columns you want to use for the scatter plot
x_column = 'OORB008_1'
y_column = 'OORB009_1'

# Create a scatter plot using the selected columns; the axis labels are built from
# the column names so they cannot drift from what is actually plotted
plt.scatter(df[x_column], df[y_column], color='black')
plt.xlabel(x_column + ' Abundance', fontsize=14)
plt.ylabel(y_column + ' Abundance', fontsize=14)
plt.title('Plot of TN_insertion abundances', fontsize=16, color='purple')
#plt.grid(True)  # Optionally, add grid lines

# Put the plotted columns in the file name so this figure gets its own PNG
# instead of sharing (and overwriting) one file name with the other scatter plots
plt.savefig('fitness_of_TN_abundances_09_01_2023_' + x_column + '_vs_' + y_column + '.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# Load the tab-separated poolCount.txt table into a DataFrame
# NOTE(review): this path points at TNSeq_barseq_outputs_11_06_2023, which is not the output_dir
# configured earlier in this notebook -- confirm this is the run you mean to plot.
df = pd.read_csv('/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_Temp_study_11_04_2023/TNSeq_mapping_output_new_genome_without_vector_seq_11_04_2023/TNSeq_barseq_outputs_11_06_2023/poolCount.txt', sep='\t')

# Select the columns you want to use for the scatter plot
x_column = 'OORB006_1'
y_column = 'OORB008_1'

# Create a scatter plot using the selected columns; the axis labels are built from
# the column names so they cannot drift from what is actually plotted
plt.scatter(df[x_column], df[y_column], color='black')
plt.xlabel(x_column + ' Abundance', fontsize=14)
plt.ylabel(y_column + ' Abundance', fontsize=14)
plt.title('Plot of TN_insertion abundances', fontsize=16, color='purple')
#plt.grid(True)  # Optionally, add grid lines

# Put the plotted columns in the file name so this figure gets its own PNG
# instead of sharing (and overwriting) one file name with the other scatter plots
plt.savefig('fitness_of_TN_abundances_09_01_2023_' + x_column + '_vs_' + y_column + '.png', dpi=600, bbox_inches='tight')
plt.show()