From 21e34fa62e6131f2dd953afc2b5680be0f78d95d Mon Sep 17 00:00:00 2001 From: "Michael T. Wolfinger" Date: Thu, 27 Nov 2014 14:30:20 +0100 Subject: [PATCH] -a --- Changes | 6 +- README | 4 ++ lib/Bio/ViennaNGS.pm | 86 +++++++++++++++++++++++++++-- scripts/assembly_hub_constructer.pl | 84 ---------------------------- 4 files changed, 88 insertions(+), 92 deletions(-) delete mode 100755 scripts/assembly_hub_constructer.pl diff --git a/Changes b/Changes index 4de4c76..fd8b6bc 100644 --- a/Changes +++ b/Changes @@ -57,12 +57,14 @@ Revision history for Perl extension ViennaNGS. - everything is exported via @EXPORT_OK now - updated README -0.10 +0.10 Thu Nov 27 2014 - requires Perl >= 5.12.0 - added sortbed() + - added uniquify_bam() + - added scripts/bam_uniq.pl - added scripts/sj_visualizer.pl - refactored scripts/motiffinda.pl - - updated scripts/assembly_hub_constructer.pl + - removed development version of scripts/assembly_hub_constructer.pl - made scripts/gff2bed.pl work with OO Bio::ViennaNGS::AnnoC - removed get_stranded_subsequence (now lives in Bio::ViennaNGS::Fasta) - updated POD diff --git a/README b/README index 2f1ac65..1f377f3 100644 --- a/README +++ b/README @@ -13,6 +13,10 @@ statistics. Optionally create BED output, as well as normalized bedGraph and bigWig files for coverage visualization in genome browsers (see dependencies on third-patry tools below). +bam_uniq.pl +Extract unique and multi mapping reads from BAM alignment files and create +a separate BAM file for both uniqe (.uniq.) and multi (.mult.) mappers. + gff2bed.pl: Convert RefSeq GFF3 annotation files to BED12 format. Individual BED12 files are created for each feature type (CDS/tRNA/rRNA/etc.). Tested with diff --git a/lib/Bio/ViennaNGS.pm b/lib/Bio/ViennaNGS.pm index 2613184..b3ff11d 100644 --- a/lib/Bio/ViennaNGS.pm +++ b/lib/Bio/ViennaNGS.pm @@ -1,11 +1,11 @@ # -*-CPerl-*- -# Last changed Time-stamp: <2014-10-22 15:49:09 mtw> +# Last changed Time-stamp: <2014-11-27 14:21:31 mtw> package Bio::ViennaNGS; use 5.12.0; use Exporter; -use version; our $VERSION = qv('0.10_03'); +use version; our $VERSION = qv('0.10'); use strict; use warnings; use Bio::Perl 1.006924; @@ -19,7 +19,7 @@ use Carp; our @ISA = qw(Exporter); our @EXPORT = (); -our @EXPORT_OK = qw ( split_bam bam2bw bed2bw sortbed +our @EXPORT_OK = qw ( split_bam uniquify_bam bam2bw bed2bw sortbed bed2bigBed computeTPM featCount_data parse_multicov write_multicov totalreads ); @@ -291,6 +291,68 @@ sub split_bam { return @processed_files; } +# uniquify_bam ($bam,$dest,$log) +# Deconvolute BAM files into unique and multi mappers +# Returns array of filenames to .uniq. and .mult. BAM files +sub uniquify_bam { + my ($bamfile,$dest,$log) = @_; + my ($bam, $bn,$path,$ext,$read,$header); + my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult); + my ($count_all,$count_uniq,$count_mult) = (0)x3; + my @processed_files = (); + my $this_function = (caller(0))[3]; + + croak "ERROR [$this_function] Cannot find $bamfile\n" + unless (-e $bamfile); + croak "ERROR [$this_function] $dest does not exist\n" + unless (-d $dest); + + ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/); + + (undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0); + (undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0); + + $bam = Bio::DB::Bam->open($bamfile, "r"); + $fn_uniq = file($dest,$bn.".uniq".$ext); + $fn_mult = file($dest,$bn.".mult".$ext); + $header = $bam->header; # TODO: modify header, leave traces ... + + $bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w') + or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; + $bam_mult = Bio::DB::Bam->open($tmp_mult,'w') + or croak "ERROR [$this_function] Cannot open temp file for writing: $!"; + $bam_uniq->header_write($header); + $bam_mult->header_write($header); + + while ($read = $bam->read1() ) { + $count_all++; + if ($read->aux_get("NH") == 1){ # uniquely mapped reads + $bam_uniq->write1($read); + $count_uniq++; + } + else { # multiply mapped reads + $bam_mult->write1($read); + $count_mult++; + } + } + + croak "ERROR [$this_function] Read counts don't match\n" + unless ($count_uniq + $count_mult == $count_all); + + rename ($tmp_uniq, $fn_uniq); + rename ($tmp_mult, $fn_mult); + push (@processed_files, $fn_uniq); + push (@processed_files, $fn_mult); + + if (defined $log){ + my $lf = file($dest,$log); + open(LOG, ">>", $lf) or croak $!; + printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n", + $count_all,$count_uniq,$count_mult; + close(LOG); + } +} + # bam2bw ( $bam,$chromsizes ) # Generate BedGraph and BigWig coverage from BAM via two third-party tools: # genomeCoverageBed from BEDtools @@ -577,6 +639,9 @@ analysis # split a single-end or paired-end BAM file by strands @result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile); + # extract unique and multi mappers from a BAM file + @result = uniquify_bam($bam_in,$outdir,$logfile); + # make bigWig from BAM bam2bw($bam_in,$chromsizes) @@ -600,10 +665,11 @@ analysis =head1 DESCRIPTION -ViennaNGS is a collection of utilities and subroutines for building -Next-Generation Sequencing (NGS) data analysis pipelines. +Bio::ViennaNGS is a collection of utilities and subroutines for +building efficient Next-Generation Sequencing (NGS) data analysis +pipelines. -=over +=over =item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log) @@ -633,6 +699,14 @@ separately, thus allowing for scenarios where eg. one read is a multi-mapper, whereas its associate mate is a unique mapper, resulting in an ambiguous alignment of the entire fragment. +=item uniquify_bam($bam,$dest,$log) + +Extract I and I mapping reads from BAM file +C<$bam>. New BAM files for unique and multi mappers are created in the +output folder C<$dest>, which are named B and +B, respectively. If defined, a logfile named +C<$log> is created in the output folder. + =item bam2bw($bam,$chromsizes) Creates BedGraph and BigWig coverage profiles from BAM files. These diff --git a/scripts/assembly_hub_constructer.pl b/scripts/assembly_hub_constructer.pl deleted file mode 100755 index f81ad92..0000000 --- a/scripts/assembly_hub_constructer.pl +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env perl -# -*-CPerl-*- -# Last changed Time-stamp: <2014-09-24 14:17:55 egg> -# -# Construct UCSC assembly hub -# Display of novel genome sequences on the UCSC Genome Hub. -# -# usage: assembly_hub_constructer.pl -i file.fa -a /output/path -b http://www.test.com/test/ -# -# *********************************************************************** -# * Copyright notice -# * -# * Copyright 2014 Michael Thomas Wolfinger -# * Florian Eggenhofer -# * All rights reserved -# * -# * This program is free software: you can redistribute it and/or modify -# * it under the terms of the GNU General Public License as published by -# * the Free Software Foundation, either version 3 of the License, or -# * (at your option) any later version. -# * -# * This program is distributed in the hope that it will be useful, -# * but WITHOUT ANY WARRANTY; without even the implied warranty of -# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# * GNU General Public License for more details. -# * -# * You should have received a copy of the GNU General Public License -# * along with this program. If not, see . -# * -# * This copyright notice MUST APPEAR in all copies of the script! -# *********************************************************************** - -use strict; -use warnings; -use Getopt::Long; -use Data::Dumper; -use File::Basename; -use Bio::ViennaNGS::UCSC qw( make_assembly_hub ); - -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# -#^^^^^^^^^^ Variables ^^^^^^^^^^^# -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# -my ($fasta_file_path,$assembly_hub_destination_path,$base_URL,$assembly_hub_return_value,$log_path); - -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# -#^^^^^^^^^^^^^^ Main ^^^^^^^^^^^^^# -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# -Getopt::Long::config('no_ignore_case'); -&usage() unless GetOptions("i=s" => \$fasta_file_path, - "a=s" => \$assembly_hub_destination_path, - "b=s" => \$base_URL, - "-help" => \&usage, - "v"); - -die "ERROR in [assembly_hub_constructer.pl]: no input fasta file provided" unless(defined $fasta_file_path); -die "ERROR in [assembly_hub_constructer.pl]: input fasta file not found" unless (-e $fasta_file_path); -die "ERROR in [assembly_hub_constructer.pl]: no assembly hub destination path provided" unless(defined $assembly_hub_destination_path); -die "ERROR in [assembly_hub_constructer.pl]: assembly hub destination path not writable" unless (-e $assembly_hub_destination_path); -die "ERROR in [assembly_hub_constructer.pl]: no URL (network location for upload to UCSC) provided" unless(defined $base_URL); -unless ($assembly_hub_destination_path =~ /\/$/){$assembly_hub_destination_path.= "/";} -$log_path = $assembly_hub_destination_path . "Log/"; -unless (-d $log_path){my $cmd = "mkdir -p $log_path"; system($cmd);} - -$assembly_hub_return_value = make_assembly_hub($fasta_file_path,$assembly_hub_destination_path,$base_URL,$log_path); - -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# -#^^^^^^^^^^^ Subroutines ^^^^^^^^^^# -#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^# - - -sub usage { - print < ($fasta_file_path) - -a output directory ($assembly_hub_destination_path) - -b URL - network location for upload to UCSC ($base_URL) - -help print this information - -EOF -exit; -}