In [None]:
#################################
# Raw FASTQ files preprocessing #
#################################
# 
# (Latest) library structure (may be changed):
# [Alu primer - 12 bp][Alu sequence - 6 bp][Flank][Adapter 1 - 10 bp][Adapter 2, barcode - 9 bp][Adapter3 - 12 bp]
# 
# R1: alu primer mate
# R2: adapter mate
# 
# 
# Steps:
# - separate reads into good and bad, depending on mistakes in the primer and the adapters
#   and on wrong elements found in the flanks
#     (the barcodes of good reads are kept in the good R2 file, and the location of the mistake
#      is kept in the bad R1 file)

# Import module (main code)
import trimmR

# Variable parameters
#################################
# Standard-library helper, used below to expand '~' in the folder paths.
import os

# Number of permissible errors:
mist = 1
# Primer, ad1 = Adapter 1, ad2 = Adapter 3 (aka Green)
# NOTE(review): ad1 below is 11 bp and ad2 is 8 bp, while the library structure
# described at the top of this cell lists Adapter 1 as 10 bp and Adapter 3 as
# 12 bp - confirm which one is current.
primer = 'GAGCCACCGCGC'
ad1 = 'GCGTGCTGCGG'
ad2 = 'AGGGCGGT'
# Length of barcode
barlen = 9
# List of wrong elements in flank
elem_remove = ['ACGT']

# Input FASTQ files folder path.
# '~' is expanded explicitly: Python's own file functions (open, os.listdir, ...)
# treat it as a literal directory name instead of the home folder.
inputdir = os.path.expanduser('~/data')
# Output folder for processed FASTQ files.
outputdir = os.path.expanduser('~/data/processed')
#################################

# Main function
trimmR.main(inputdir, outputdir, mist, primer, ad1, ad2, barlen, elem_remove)

In [None]:
#####################################
# Filtered FASTQ files to SAM files #
#####################################
# 
# You can see more options for bwa mem on http://bio-bwa.sourceforge.net/bwa.shtml
#
# Steps:
# - run bwa mem (mapping to the human genome) on all paired FASTQ files in the folder.

# Import module (main code)
import bwamemR

# Variable parameters
#################################
# Standard-library helper, used below to expand '~' in the paths.
import os

# '~' is expanded explicitly in every path below: Python's own file functions
# (open, os.listdir, ...) treat it as a literal directory name instead of the
# home folder.

# Input FASTQ files folder path.
inputdir = os.path.expanduser('~/data')
# Output folder for processed SAM files.
outputdir = os.path.expanduser('~/data/processed')
# Path to the human genome indexed by `bwa index`: the folder followed by the
# name of the indexed genome without extension ('name' here).
refway = os.path.expanduser('~/data/name')
# Main part of the bwa mem command. Extra bwa mem options can be appended here.
memline = 'bwa mem'
#################################

# Main function
bwamemR.main(inputdir, refway, outputdir, memline)

In [None]:
#################################
# SAM files filtering           #
#################################
# 
# Steps:
# - filter SAM files. Reads must be mapped, paired and not duplicated. GOOD reads are written
#    to a table with the column names:
#         ID CHR STRAND START END READ1 READ2 BARCODE ALU CIGAR_R1 CIGAR_R2 MDFLAG_R1 MDFLAG_R2
#    and BAD reads are written to a new SAM file.

# Import module (main code)
import samfilterR

# Variable parameters
#################################
# Standard-library helper, used below to expand '~' in the folder paths.
import os

# '~' is expanded explicitly in both paths: Python's own file functions
# (open, os.listdir, ...) treat it as a literal directory name instead of the
# home folder.

# Input SAM files folder path.
inputdir = os.path.expanduser('~/data')
# Output folder for processed SAM files.
outputdir = os.path.expanduser('~/data/processed')
#################################

# Main function
samfilterR.main(inputdir, outputdir)

In [None]:
#################################
# Tables clustering             #
#################################
# 
# Steps:
# - cluster the reads in the tables by distance. Two new tables are created from each one, with headers:
#       1) CLUSTER_ID CHR STRAND START END READ1_BEST ID_LIST CIGAR_BEST MDFLAG_BEST
#       2) CLUSTER_ID BARCODE_LIST ALU_LIST

# Import module (main code)
import bigtableR

# Variable parameters
#################################
# Standard-library helper, used below to expand '~' in the folder paths.
import os

# '~' is expanded explicitly in both paths: Python's own file functions
# (open, os.listdir, ...) treat it as a literal directory name instead of the
# home folder.

# Input folder path.
# NOTE(review): this comment used to say "SAM files", but the step description
# of this cell talks about clustering reads in tables - confirm the expected input.
inputdir = os.path.expanduser('~/data')
# Output folder for the processed files.
outputdir = os.path.expanduser('~/data/processed')
# Distance (in bp) for clustering reads
perm_gap = 1000000
#################################

# Main function
bigtableR.main(inputdir, outputdir, perm_gap)