In [1]:
#################################
# Raw FASTQ files preprocessing #
#################################
#
# (Latest) library structure (may be changed):
# [Alu primer - 12 bp][Alu sequence - 6 bp][Flank][Adapter 1 - 10 bp][Adapter 2, barcode - 9 bp][Adapter3 - 12 bp]
#
# R1: Alu primer mate
# R2: adapter mate
#
# Steps:
# - separate reads into good and bad, depending on mistakes in the primer, in the adapters
#   and on wrong elements in the flanks
#   (barcodes of good reads are kept in the good R2 file, the location of the mistake in the bad R1 file)

# Import module (main code).
# Reloading picks up edits made to trimmR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import trimmR
importlib.reload(trimmR)

# Variable parameters
#################################
# The number of permissible errors
# NOTE(review): which sequence each of mist1 / mist2 applies to is decided inside trimmR.main - confirm there.
mist1 = 1
mist2 = 1
# Primer, ad1 = Adapter 1, ad2 = Adapter 3 (aka Green)
# NOTE(review): ad1 is 11 bp and ad2 is 8 bp here, while the structure above lists 10 bp and 12 bp - confirm.
primer = 'GAGCCACCGCGC'
ad1 = 'GCGTGCTGCGG'
ad2 = 'AGGGCGGT'
# Length of barcode
barlen = 9
# List of wrong elements in flank
elem_remove = ['AGCT']
# Shift for search
shift = 4
# Window in flanks for searching primers or adapters
search_win = 20

# Input FASTQ files folder path (e.g. '~/data')
inputdir = 'input'
# Output folder for processed FASTQ files (e.g. '~/data/processed')
outputdir = 'output'
#################################

# Main function
trimmR.main(
    inputdir, outputdir, shift,
    mist1, mist2, primer, ad1, ad2,
    barlen, elem_remove, search_win,
)

For index7: mistake(place-amount) = primer-31,ad-42,green-42,flank_simple-21,flank_strange-0;  
reads: 1000, good: 0.91, bad: 0.09

For new_: mistake(place-amount) = primer-240,ad-73,green-73,flank_simple-48,flank_strange-0;  
reads: 3750, good: 0.90, bad: 0.10



In [2]:
#################################
# Filtered FASTQ files to SAM   #
#################################
#
# You can see more options for bwa mem on http://bio-bwa.sourceforge.net/bwa.shtml
#
# Steps:
# - run bwa mem (mapping on human genome) with all paired FASTQ files in the folder.

# Import module (main code).
# Reloading picks up edits made to bwamemR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import bwamemR
importlib.reload(bwamemR)

# Variable parameters
#################################
# Input FASTQ files folder path (e.g. '~/data'); here the output folder of the preprocessing step
inputdir = 'output'
# Output folder for processed SAM files (e.g. '~/data/processed')
outputdir = 'foo1'
# Path of the human genome indexed by `bwa index` (e.g. '~/data/name').
# It is passed to bwa mem as the reference argument exactly as written.
refway = 'chr1/chr1.fa'
# Main part of the bwa mem command line. You can add options here.
memline = 'bwa mem'
#################################

# Main function
bwamemR.main(inputdir, refway, outputdir, memline)

[['new_R1_good.fq', 'new_R2_good.fq'], ['index7.R1_good.fq', 'index7.R2_good.fq']]
bwa mem chr1/chr1.fa output/new_R1_good.fq output/new_R2_good.fq > foo1/new_.sam
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 6792 sequences (522981 bp)...
[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 255, 0, 0)
[M::mem_pestat] skip orientation FF as there are not enough pairs
[M::mem_pestat] analyzing insert size distribution for orientation FR...
[M::mem_pestat] (25, 50, 75) percentile: (111, 176, 211)
[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 411)
[M::mem_pestat] mean and std.dev: (167.25, 70.72)
[M::mem_pestat] low and high boundaries for proper pairs: (1, 511)
[M::mem_pestat] skip orientation RF as there are not enough pairs
[M::mem_pestat] skip orientation RR as there are not enough pairs
[M::mem_process_seqs] Processed 6792 reads in 3.316 CPU sec, 3.268 real sec
[main] Version: 0.7.13-r1126
[main] CMD: bwa mem chr1/chr1.fa

In [6]:
#################################
# SAM files filtering           #
#################################
#
# Steps:
# - filter SAM files. Reads must be mapped, paired and not duplicated. GOOD reads are written to a table
#   with column names:
#         ID CHR STRAND START END READ1 READ2 BARCODE ALU CIGAR_R1 CIGAR_R2 MDFLAG_R1 MDFLAG_R2
#   and BAD reads are written to a new SAM file.

# Import module (main code).
# Reloading picks up edits made to samfilterR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import samfilterR
importlib.reload(samfilterR)

# Variable parameters
#################################
# Input SAM files folder path (e.g. '~/data')
inputdir = 'foo1'
# Output folder for processed SAM files and tables (e.g. '~/data/processed')
outputdir = 'foo2'
#################################

# Main function
samfilterR.main(inputdir, outputdir)

In [22]:
#################################
# Tables clustering             #
#################################
#
# FOR ONE FILE PER CLUSTERING
#
# Steps:
# - cluster reads from tables by distance. Creates 2 new tables from 1, with headers:
#       1) CLUSTER_ID CHR STRAND START END READ1_BEST CIGAR_BEST MDFLAG_BEST NUM_BARCODES NUM_READS
#       2) CLUSTER_ID ID_LIST BARCODE_LIST ALU_LIST

# Import module (main code).
# Reloading picks up edits made to bigtableR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import bigtableR
importlib.reload(bigtableR)

# Variable parameters
#################################
# Input files folder path (e.g. '~/data')
inputdir = 'foo2'
# Output folder for processed cluster tables (e.g. '~/data/processed')
outputdir = 'foo3'
# Distance (in bp) for clustering reads
window = 20
#################################

# Main function
bigtableR.main(inputdir, outputdir, window)

Done new__table

Done index7_table



In [23]:
#################################
# Tables MEGAclustering         #
#################################
#
# FOR ALL FILES
#
# Steps:
# - cluster reads from tables by distance. Creates a MEGAtable from all bigtables, with headers:
#       1) MEGACLUSTER_ID CHR STRAND START END READ1_BEST CIGAR_BEST MDFLAG_BEST +
#          2 additional columns for every file: FILENAME_NUM_BARCODES FILENAME_NUM_READS

# Import module (main code).
# Reloading picks up edits made to megatableR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import megatableR
importlib.reload(megatableR)

# Variable parameters
#################################
# Input files folder path (e.g. '~/data')
inputdir = 'foo3'
# Output folder for the processed megacluster table (e.g. '~/data/processed')
outputdir = 'foo4'
# Distance (in bp) for megaclustering reads
window = 20
# Standard Alu sequence (6 bp)
standart_alu = 'CCGGCC'
#################################

# Main function
megatableR.main(inputdir, outputdir, window, standart_alu)

{'CCGGCC': 1}
{'CCGGCC': 3}
{'CCGGCC': 2}
{'CCGGCC': 12}
{'CTGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 5}
{'CCGGCC': 3}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 3}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCT': 1}
{'CCGGCC': 4}
{'CCGGCC': 3}
{'CCGGCC': 4}
{'CCGACT': 1}
{'CCGGCT': 4}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 4}
{'CCGGCC': 1}
{'CCGTCT': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 3}
{'CCTGCC': 7}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 5}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCTGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 3}
{'CCGGCC': 1}
{'CCGGCC': 9}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCAGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 2}
{'CCGGCC': 3}
{'CCGGCC': 6}
{'CCGGCC': 2}
{'CCGGCC': 1}
{'CCGGCC': 5}
{'CCGGCC': 5}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCGGCC': 3}
{'CCGGCC': 1}
{'CCGGCC': 1}
{'CCG

In [56]:
#################################
# Intersect with existing libs  #
#################################
#
# Steps:
# - intersect reads from the megatable by distance with existing libraries (like ALU). Adds extra columns
#   to the MEGAtable: Name_of_library1 Name_of_library2 etc. (for each library: if a sequence from the
#   library intersects reads from the megatable, Name_of_seq is written, otherwise 'NA')

# Import module (main code).
# Reloading picks up edits made to intersectionR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import intersectionR
importlib.reload(intersectionR)

# Variable parameters
#################################
# NOTE: `inputdir` is not passed to intersectionR.main(); the assignment is kept only so that the
# notebook-level variable holds the same value as before after this cell has run.
inputdir = '~/data'
# Input megatable file path
inputtable = 'foo4/megatable.txt'
# Input rep_libraries folder path (e.g. '~/data/rep_libraries')
inputlibrary = 'alu_rep'
# Output folder for the processed megacluster table with extra columns (Name_of_library)
# (e.g. '~/data/processed')
outputdir = 'foo5'
# Distance (in bp) for intersecting megatable reads with library sequences
window = 20
#################################

# Main function
intersectionR.main(inputtable, inputlibrary, outputdir, window)

In [57]:
#################################
# Recovery reads by FASTQ_good  #
#################################
#
# Steps:
# - recover reads from the good FASTQ files (using the columns FILENAME and READNAME of the table)

# Import module (main code).
# Reloading picks up edits made to readfromfqR.py without restarting the kernel.
# `imp` was removed in Python 3.12, so importlib.reload is used instead of imp.reload.
import importlib
import readfromfqR
importlib.reload(readfromfqR)

# Variable parameters
#################################
# NOTE: `inputdir` is not passed to readfromfqR.main(); the assignment is kept only so that the
# notebook-level variable holds the same value as before after this cell has run.
inputdir = '~/data'
# Input table file path (megatable intersected with libraries)
inputtable = 'foo5/megatable_inter_lib.txt'
# Input folder with the good FASTQ files (e.g. '~/data/fq_good_folder')
inputlibrary = 'output'
# Output folder for the processed table (e.g. '~/data/processed')
outputdir = 'foo5'
#################################

# Main function
readfromfqR.main(inputtable, inputlibrary, outputdir)

['MEGACLUSTER_ID', 'FILENAME', 'READNAME', 'CHR', 'POS', 'STRAND', 'ALU_BEST', 'ALU_AMOUNT', 'ALU_HAMMING', 'READ1_BEST', 'READ2_BEST', 'TLEN', 'CIGAR_BEST', 'MDFLAG_BEST', 'new__NUM_READS', 'new__NUM_BARCODES', 'index7_NUM_READS', 'index7_NUM_BARCODES', 'Alu_hg19']
['MEGACLUSTER_ID', 'FILENAME', 'READNAME', 'CHR', 'POS', 'STRAND', 'ALU_BEST', 'ALU_AMOUNT', 'ALU_HAMMING', 'READ1_BEST', 'READ2_BEST', 'TLEN', 'CIGAR_BEST', 'MDFLAG_BEST', 'new__NUM_READS', 'new__NUM_BARCODES', 'index7_NUM_READS', 'index7_NUM_BARCODES', 'Alu_hg19', 'READ1', 'READ2']
