This notebook runs system commands to process paired-end FASTQ files: mapping with Bowtie2, sorting and indexing with samtools, and conversion to BigWig format with bamCoverage.

In [1]:
# List all gzipped read files in the working directory with human-readable
# sizes. The pattern "*q.gz" matches both "*.fastq.gz" (raw reads) and
# "*.fq.gz" (the "_val_" files) names.
!ls -lh *q.gz

-rw-r--r-- 1 boycem4 GEU3302521 53M Nov 23  2020 chip_R1.fastq.gz
-rw-r--r-- 1 boycem4 GEU3302521 46M Nov 19 11:16 chip_R1_val_1.fq.gz
-rw-r--r-- 1 boycem4 GEU3302521 53M Nov 23  2020 chip_R2.fastq.gz
-rw-r--r-- 1 boycem4 GEU3302521 46M Nov 19 11:16 chip_R2_val_2.fq.gz


In [2]:
# Settings for subsequent commands.
# Every later cell reads these variables, so this cell must be run first.

# select the sample to be processed
# (input files are expected to be named <sample>_R1.fastq.gz / <sample>_R2.fastq.gz):
sample = 'chip'

# prefix for output files (<out_prefix>.bam, <out_prefix>.bw, ...);
# can be the same as sample name
out_prefix = sample

# reference genome (FASTA); only read when the bowtie2 index has to be built
reference = 'C_glabrata.fa'

# base name for bowtie2 index files (<bowtie_index>.1.bt2, ...)
bowtie_index = 'C_glabrata'

# number of threads to use in parallel
# (passed to bowtie2 -p, samtools sort -@ and bamCoverage -p)
threads = 1

# execute stepwise or as a pipeline:
#   'stepwise' - run mapping, SAM->BAM conversion and sorting as separate
#                commands, writing intermediate files
#   'pipeline' - pipe the mapper output straight through samtools into the
#                final BAM file
#processing = 'stepwise'
processing = 'pipeline'

In [3]:
import os.path

# make sure the mapping index files are in place
for i in range(1,5) :
    bowtie_index_file = f'{bowtie_index}.{i}.bt2'

    if not os.path.isfile(bowtie_index_file) :
        
        print(f'file {bowtie_index_file} does not exist')
        
        # try to generate the index
        job = f'bowtie2-build {reference} {bowtie_index}'
        print(f'job: {job}')
        !time {job}
        
        break

In [4]:
# Input files for the mapper.
# Paired-end data comes as two files, one per mate (R1 and R2):
infile1 = f'{sample}_R1.fastq.gz'
infile2 = f'{sample}_R2.fastq.gz'

# Trimmed reads can be used instead, if they are available:
#infile1 = f'{sample}_R1_val_1.fq.gz'
#infile2 = f'{sample}_R2_val_2.fq.gz'

# Stop right away if an input file is missing, naming the first one not found
first_missing = next(
    (path for path in (infile1, infile2) if not os.path.isfile(path)),
    None,
)
if first_missing is not None :
    raise Exception(f'file {first_missing} does not exist')

In [5]:
if processing == 'stepwise' :
    
    # run bowtie2 mapper and save output in SAM format
    job = f'bowtie2 -x {bowtie_index} -1 {infile1} -2 {infile2} --mm -p {threads} > {out_prefix}.sam'
    print(f'job: {job}')
    !time {job}
    
    # convert SAM into BAM format, so that it can be sorted
    job = f'samtools view -b {out_prefix}.sam > {out_prefix}.bam'
    print(f'job: {job}')
    !time {job}
    
    # sort BAM file (requires BAM file as input)
    job = f'samtools sort -@ {threads} {out_prefix}.bam > {out_prefix}.sorted.bam'
    print(f'job: {job}')
    !time {job}
    
    # replace unsorted BAM file with sorted one
    job = f'mv {out_prefix}.sorted.bam {out_prefix}.bam'
    print(f'job: {job}')
    !time {job}
    
else :
    
    # run bowtie2 mapper and save compressed and sorted output in {out_prefix}.bam in one go
    job = f'bowtie2 -x {bowtie_index} -1 {infile1} -2 {infile2} --mm -p {threads} | samtools view -b | samtools sort -@ {threads} > {out_prefix}.bam'
    print(f'job: {job}')
    !time {job}

job: bowtie2 -x C_glabrata -1 chip_R1.fastq.gz -2 chip_R2.fastq.gz --mm -p 1 | samtools view -b | samtools sort -@ 1 > chip.bam
1062114 reads; of these:
  1062114 (100.00%) were paired; of these:
    321209 (30.24%) aligned concordantly 0 times
    666617 (62.76%) aligned concordantly exactly 1 time
    74288 (6.99%) aligned concordantly >1 times
    ----
    321209 pairs aligned concordantly 0 times; of these:
      68375 (21.29%) aligned discordantly 1 time
    ----
    252834 pairs aligned 0 times concordantly or discordantly; of these:
      505668 mates make up the pairs; of these:
        466911 (92.34%) aligned 0 times
        25174 (4.98%) aligned exactly 1 time
        13583 (2.69%) aligned >1 times
78.02% overall alignment rate

real	1m59.210s
user	2m46.422s
sys	0m2.590s


In [6]:
# List all the SAM and BAM files in the directory.
# In 'pipeline' mode no SAM file is written, so ls reports an error for the
# '*.sam' pattern; the BAM files are still listed.
!ls -lh *.bam *.sam

ls: cannot access '*.sam': No such file or directory
-rw-r--r-- 1 boycem4 GEU3302521 87M Nov 19 12:19  chip.bam


In [7]:
# create an index for the BAM file to allow direct access at certain positions
job = f'samtools index {out_prefix}.bam'
print(f'job: {job}')
!time {job}

job: samtools index chip.bam

real	0m1.289s
user	0m1.251s
sys	0m0.036s


In [8]:
# create a BAM file from reads mapped to chrA only
job = f'samtools view -b {out_prefix}.bam chrA > {out_prefix}A.bam'
print(f'job: {job}')
!time {job}

# index this BAM file
job = f'samtools index {out_prefix}A.bam'
print(f'job: {job}')
!time {job}

job: samtools view -b chip.bam chrA > chipA.bam

real	0m0.528s
user	0m0.498s
sys	0m0.016s
job: samtools index chipA.bam

real	0m0.081s
user	0m0.079s
sys	0m0.000s


In [9]:
# generate BigWig files from BAM files
job = f'/usr/local/deeptools3/bin/bamCoverage -p {threads} -b {out_prefix}.bam -o {out_prefix}.bw'
print(f'job: {job}')
!time {job}

job: /usr/local/deeptools3/bin/bamCoverage -p 1 -b chip.bam -o chip.bw
bamFilesList: ['chip.bam']
binLength: 50
numberOfSamples: None
blackListFileName: None
skipZeroOverZero: False
bed_and_bin: False
genomeChunkSize: None
defaultFragmentLength: read length
numberOfProcessors: 1
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 50
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000

real	0m12.448s
user	0m12.114s
sys	0m0.068s


In [10]:
# List all BigWig files (*.bw) in the working directory with human-readable sizes:
!ls -lh *.bw

-rw-r--r-- 1 boycem4 GEU3302521 931K Nov 19 12:19 chip.bw
