In [None]:
#################################
# Raw FASTQ files preprocessing #
#################################
# 
# (Latest) library structure (may be changed):
# [Alu primer - 12 bp][Alu sequence - 6 bp][Flank][Adapter 1 - 10 bp][Adapter 2, barcode - 9 bp][Adapter3 - 12 bp]
# 
# R1: alu primer mate
# R2: adapter mate
# 
# 
# Steps:
# - separate reads into good and bad, depending on mistakes in the primer and the adapters
#   and on wrong elements found in the flanks
#     (barcodes of good reads are kept in the good R2 file; the location of the mistake is kept in the bad R1 file)

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import trimmR
importlib.reload(trimmR)  # re-execute trimmR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# The number of permissible errors (both thresholds are passed to trimmR.main).
# NOTE(review): which sequence each threshold applies to is not visible in this cell - confirm in trimmR.
mist1 = 1
mist2 = 2
# Primer, ad1 = Adapter 1, ad2 = Adapter 3 (aka Green)
# NOTE(review): ad1 below is 11 bp and ad2 is 8 bp, while the library structure comment at the
# top of this cell lists Adapter 1 as 10 bp and Adapter 3 as 12 bp - confirm which one is current.
primer = 'GAGCCACCGCGC'
ad1 = 'GCGTGCTGCGG'
ad2 = 'AGGGCGGT'
# Length of barcode (bp)
barlen = 9
# List of wrong elements in flank
elem_remove = ['AGCT']

# Input FASTQ files folder path
# NOTE(review): Python does not expand '~' on its own; this only works if trimmR expands it
# (e.g. via os.path.expanduser) - confirm, or use an absolute path.
inputdir = '~/data'
# Output folder for processed FASTQ files
outputdir = '~/data/processed'
#################################

# Main function: run the preprocessing stage with the parameters above
trimmR.main(inputdir, outputdir, mist1, mist2, primer, ad1, ad2, barlen, elem_remove)

In [None]:
#################################
# FASTQ to SAM by bwa           #
#################################
# 
# You can see more options for bwa mem on http://bio-bwa.sourceforge.net/bwa.shtml
#
# Steps:
# - run bwa mem (mapping to the human genome) on all paired FASTQ files in the input folder.

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import bwamemR
importlib.reload(bwamemR)  # re-execute bwamemR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# Input FASTQ files folder path
# NOTE(review): the preprocessing cell writes its FASTQ files to '~/data/processed'; point
# inputdir there if this step is meant to consume that output - confirm.
# NOTE(review): Python does not expand '~' on its own; this only works if bwamemR (or the
# shell it invokes) expands it - confirm, or use an absolute path.
inputdir = '~/data'
# Output folder for processed SAM files
outputdir = '~/data/processed'
# Path to the human genome indexed by `bwa index`: folder plus the name of the
# indexed genome without extension
refway = '~/data/name'
# Main part of the bwa mem command line. Extra bwa mem options can be appended here
memline = 'bwa mem'
#################################

# Main function: run the mapping stage with the parameters above
bwamemR.main(inputdir, refway, outputdir, memline)

In [None]:
#################################
# SAM files filtering           #
#################################
# 
# Steps:
# - filter SAM files. Reads must be mapped, paired and not duplicated. GOOD reads are written to a table with
#    column names:
#         ID CHR STRAND START END READ1 READ2 BARCODE ALU CIGAR_R1 CIGAR_R2 MDFLAG_R1 MDFLAG_R2
#    and BAD reads are written to a new SAM file.

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import samfilterR
importlib.reload(samfilterR)  # re-execute samfilterR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# Input SAM files folder path
# NOTE(review): Python does not expand '~' on its own; this only works if samfilterR expands it
# (e.g. via os.path.expanduser) - confirm, or use an absolute path.
inputdir = '~/data'
# Output folder for processed SAM files and tables
outputdir = '~/data/processed'
#################################

# Main function: run the SAM filtering stage with the parameters above
samfilterR.main(inputdir, outputdir)

In [None]:
#################################
# Tables clustering             #
#################################
# 
# FOR ONE FILE PER CLUSTERING
#
# Steps:
# - cluster reads from a table by distance. Two new tables are created from each input table, with headers:
#       1) CLUSTER_ID CHR STRAND START END READ1_BEST CIGAR_BEST MDFLAG_BEST NUM_BARCODES NUM_READS
#       2) CLUSTER_ID ID_LIST BARCODE_LIST ALU_LIST

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import bigtableR
importlib.reload(bigtableR)  # re-execute bigtableR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# Input files folder path
# NOTE(review): Python does not expand '~' on its own; this only works if bigtableR expands it
# (e.g. via os.path.expanduser) - confirm, or use an absolute path.
inputdir = '~/data'
# Output folder for processed cluster tables
outputdir = '~/data/processed'
# Distance (in bp) for clustering reads
window = 20
#################################

# Main function: run the per-file clustering stage with the parameters above
bigtableR.main(inputdir, outputdir, window)

In [None]:
#################################
# Tables MEGAclustering         #
#################################
# 
# FOR ALL FILES
#
# Steps:
# - cluster reads from the tables by distance. A MEGAtable is created from all bigtables, with headers:
#       1) MEGACLUSTER_ID CHR STRAND START END READ1_BEST CIGAR_BEST MDFLAG_BEST +
#                           2 additional columns for every file: FILENAME_NUM_BARCODES FILENAME_NUM_READS

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import megatableR
importlib.reload(megatableR)  # re-execute megatableR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# Input files folder path
# NOTE(review): Python does not expand '~' on its own; this only works if megatableR expands it
# (e.g. via os.path.expanduser) - confirm, or use an absolute path.
inputdir = '~/data'
# Output folder for the processed megacluster table
outputdir = '~/data/processed'
# Distance (in bp) for megaclustering reads
window = 20
#################################

# Main function: run the all-files megaclustering stage with the parameters above
megatableR.main(inputdir, outputdir, window)

In [None]:
#################################
# Intersect with existing libs  #
#################################
#
# Steps:
# - intersect reads from the megatable by distance with existing libraries (like ALU). Additional columns are added to the MEGAtable:
#       Name_of_library1 Name_of_library2 etc. (for each library: if a sequence from the library intersects a read
#                                               from the megatable, Name_of_seq is written; otherwise 'NA')

# Import module (main code).
# importlib.reload is used instead of imp.reload: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12, where
# `import imp` raises ModuleNotFoundError.
import importlib
import intersectionR
importlib.reload(intersectionR)  # re-execute intersectionR so edits to it are picked up without a kernel restart

# Variable parameters
#################################
# Input megatable path
# NOTE(review): Python does not expand '~' on its own; this only works if intersectionR expands it
# (e.g. via os.path.expanduser) - confirm, or use absolute paths.
inputtable = '~/data/megatable.txt'
# Input rep_libraries folder path
inputlibrary = '~/data/rep_libraries'
# Output folder for the processed megacluster table with extra columns (Name_of_library)
outputdir = '~/data/processed'
# Distance (in bp) used when intersecting megatable reads with library sequences
window = 20
#################################

# Main function: run the library intersection stage with the parameters above
intersectionR.main(inputtable, inputlibrary, outputdir, window)