From 21e34fa62e6131f2dd953afc2b5680be0f78d95d Mon Sep 17 00:00:00 2001
From: "Michael T. Wolfinger" <michael@wolfinger.eu>
Date: Thu, 27 Nov 2014 14:30:20 +0100
Subject: [PATCH] -a

---
 Changes                             |  6 +-
 README                              |  4 ++
 lib/Bio/ViennaNGS.pm                | 86 +++++++++++++++++++++++++++--
 scripts/assembly_hub_constructer.pl | 84 ----------------------------
 4 files changed, 88 insertions(+), 92 deletions(-)
 delete mode 100755 scripts/assembly_hub_constructer.pl

diff --git a/Changes b/Changes
index 4de4c76..fd8b6bc 100644
--- a/Changes
+++ b/Changes
@@ -57,12 +57,14 @@ Revision history for Perl extension ViennaNGS.
 	- everything is exported via @EXPORT_OK now
 	- updated README
 
-0.10  
+0.10  Thu Nov 27 2014
 	- requires Perl >= 5.12.0
 	- added sortbed()
+	- added uniquify_bam()
+	- added scripts/bam_uniq.pl
 	- added scripts/sj_visualizer.pl
 	- refactored scripts/motiffinda.pl
-	- updated scripts/assembly_hub_constructer.pl
+	- removed development version of scripts/assembly_hub_constructer.pl
 	- made scripts/gff2bed.pl work with OO Bio::ViennaNGS::AnnoC 
 	- removed get_stranded_subsequence (now lives in Bio::ViennaNGS::Fasta)
 	- updated POD
diff --git a/README b/README
index 2f1ac65..1f377f3 100644
--- a/README
+++ b/README
@@ -13,6 +13,10 @@ statistics. Optionally create BED output, as well as normalized bedGraph
 and bigWig files for coverage visualization in genome browsers (see
 dependencies on third-patry tools below).
 
+bam_uniq.pl
+Extract unique and multi mapping reads from BAM alignment files and create
+a separate BAM file for both uniqe (.uniq.) and multi (.mult.) mappers.
+
 gff2bed.pl:
 Convert RefSeq GFF3 annotation files to BED12 format. Individual BED12
 files are created for each feature type (CDS/tRNA/rRNA/etc.). Tested with
diff --git a/lib/Bio/ViennaNGS.pm b/lib/Bio/ViennaNGS.pm
index 2613184..b3ff11d 100644
--- a/lib/Bio/ViennaNGS.pm
+++ b/lib/Bio/ViennaNGS.pm
@@ -1,11 +1,11 @@
 # -*-CPerl-*-
-# Last changed Time-stamp: <2014-10-22 15:49:09 mtw>
+# Last changed Time-stamp: <2014-11-27 14:21:31 mtw>
 
 package Bio::ViennaNGS;
 
 use 5.12.0;
 use Exporter;
-use version; our $VERSION = qv('0.10_03');
+use version; our $VERSION = qv('0.10');
 use strict;
 use warnings;
 use Bio::Perl 1.006924;
@@ -19,7 +19,7 @@ use Carp;
 
 our @ISA = qw(Exporter);
 our @EXPORT = ();
-our @EXPORT_OK = qw ( split_bam bam2bw bed2bw sortbed
+our @EXPORT_OK = qw ( split_bam uniquify_bam bam2bw bed2bw sortbed
 		    bed2bigBed computeTPM featCount_data
 		    parse_multicov write_multicov totalreads );
 
@@ -291,6 +291,68 @@ sub split_bam {
   return @processed_files;
 }
 
+# uniquify_bam ($bam,$dest,$log)
+# Deconvolute BAM files into unique and multi mappers
+# Returns array of filenames to .uniq. and .mult. BAM files
+sub uniquify_bam {
+  my ($bamfile,$dest,$log) = @_;
+  my ($bam, $bn,$path,$ext,$read,$header);
+  my ($tmp_uniq,$tmp_mult,$fn_uniq,$fn_mult,$bam_uniq,$bam_mult);
+  my ($count_all,$count_uniq,$count_mult) = (0)x3;
+  my @processed_files = ();
+  my $this_function = (caller(0))[3];
+
+  croak "ERROR [$this_function] Cannot find $bamfile\n"
+    unless (-e $bamfile);
+  croak "ERROR [$this_function] $dest does not exist\n"
+    unless (-d $dest);
+
+  ($bn,$path,$ext) = fileparse($bamfile, qr /\..*/);
+
+  (undef,$tmp_uniq) = tempfile('BAM_UNIQ_XXXXXXX',UNLINK=>0);
+  (undef,$tmp_mult) = tempfile('BAM_MULT_XXXXXXX',UNLINK=>0);
+
+  $bam     = Bio::DB::Bam->open($bamfile, "r");
+  $fn_uniq = file($dest,$bn.".uniq".$ext);
+  $fn_mult = file($dest,$bn.".mult".$ext);
+  $header  = $bam->header; # TODO: modify header, leave traces ...
+
+  $bam_uniq = Bio::DB::Bam->open($tmp_uniq,'w')
+    or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
+  $bam_mult = Bio::DB::Bam->open($tmp_mult,'w')
+    or croak "ERROR [$this_function] Cannot open temp file for writing: $!";
+  $bam_uniq->header_write($header);
+  $bam_mult->header_write($header);
+
+  while ($read = $bam->read1() ) {
+    $count_all++;
+    if ($read->aux_get("NH") == 1){ # uniquely mapped reads
+      $bam_uniq->write1($read);
+      $count_uniq++;
+    }
+    else { # multiply mapped reads
+      $bam_mult->write1($read);
+      $count_mult++;
+    }
+  }
+
+  croak "ERROR [$this_function] Read counts don't match\n"
+    unless ($count_uniq + $count_mult == $count_all);
+
+  rename ($tmp_uniq, $fn_uniq);
+  rename ($tmp_mult, $fn_mult);
+  push (@processed_files, $fn_uniq);
+  push (@processed_files, $fn_mult);
+
+  if (defined $log){
+    my $lf = file($dest,$log);
+    open(LOG, ">>", $lf) or croak $!;
+    printf LOG "%15d reads total\n%15d unique reads\n%15d multiple reads\n",
+      $count_all,$count_uniq,$count_mult;
+    close(LOG);
+  }
+}
+
 # bam2bw ( $bam,$chromsizes )
 # Generate BedGraph and BigWig coverage from BAM via two third-party tools:
 # genomeCoverageBed from BEDtools
@@ -577,6 +639,9 @@ analysis
   # split a single-end  or paired-end BAM file by strands
   @result = split_bam($bam_in,$rev,$want_uniq,$want_bed,$destdir,$logfile);
 
+  # extract unique and multi mappers from a BAM file
+  @result = uniquify_bam($bam_in,$outdir,$logfile);
+
   # make bigWig from BAM
   bam2bw($bam_in,$chromsizes)
 
@@ -600,10 +665,11 @@ analysis
 
 =head1 DESCRIPTION
 
-ViennaNGS is a collection of utilities and subroutines for building
-Next-Generation Sequencing (NGS) data analysis pipelines.
+Bio::ViennaNGS is a collection of utilities and subroutines for
+building efficient Next-Generation Sequencing (NGS) data analysis
+pipelines.
 
-=over 
+=over
 
 =item split_bam($bam,$reverse,$want_uniq,$want_bed,$dest_dir,$log)
 
@@ -633,6 +699,14 @@ separately, thus allowing for scenarios where eg. one read is a
 multi-mapper, whereas its associate mate is a unique mapper, resulting
 in an ambiguous alignment of the entire fragment.
 
+=item uniquify_bam($bam,$dest,$log)
+
+Extract I<unique> and I<multi> mapping reads from BAM file
+C<$bam>. New BAM files for unique and multi mappers are created in the
+output folder C<$dest>, which are named B<basename.uniq.bam> and
+B<basename.mult.bam>, respectively. If defined, a logfile named
+C<$log> is created in the output folder.
+
 =item bam2bw($bam,$chromsizes)
 
 Creates BedGraph and BigWig coverage profiles from BAM files. These
diff --git a/scripts/assembly_hub_constructer.pl b/scripts/assembly_hub_constructer.pl
deleted file mode 100755
index f81ad92..0000000
--- a/scripts/assembly_hub_constructer.pl
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env perl
-# -*-CPerl-*-
-# Last changed Time-stamp: <2014-09-24 14:17:55 egg>
-#
-# Construct UCSC assembly hub
-# Display of novel genome sequences on the UCSC Genome Hub.
-#
-# usage: assembly_hub_constructer.pl -i file.fa -a /output/path -b http://www.test.com/test/
-#
-# ***********************************************************************
-# *  Copyright notice
-# *
-# *  Copyright 2014 Michael Thomas Wolfinger <michael@wolfinger.eu>
-# *                 Florian Eggenhofer <florian.eggenhofer@univie.ac.at>
-# *  All rights reserved
-# *
-# *  This program is free software: you can redistribute it and/or modify
-# *  it under the terms of the GNU General Public License as published by
-# *  the Free Software Foundation, either version 3 of the License, or
-# *  (at your option) any later version.
-# *
-# *  This program is distributed in the hope that it will be useful,
-# *  but WITHOUT ANY WARRANTY; without even the implied warranty of
-# *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# *  GNU General Public License for more details.
-# *
-# *  You should have received a copy of the GNU General Public License
-# *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-# *
-# *  This copyright notice MUST APPEAR in all copies of the script!
-# ***********************************************************************
-
-use strict;
-use warnings;
-use Getopt::Long;
-use Data::Dumper;
-use File::Basename;
-use Bio::ViennaNGS::UCSC qw( make_assembly_hub );
-
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-#^^^^^^^^^^ Variables ^^^^^^^^^^^#
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-my ($fasta_file_path,$assembly_hub_destination_path,$base_URL,$assembly_hub_return_value,$log_path);
-
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-#^^^^^^^^^^^^^^ Main ^^^^^^^^^^^^^#
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-Getopt::Long::config('no_ignore_case');
-&usage() unless GetOptions("i=s"           => \$fasta_file_path,
-			   "a=s"           => \$assembly_hub_destination_path,
-                           "b=s"           => \$base_URL,
-                           "-help"         => \&usage,
-                           "v");
-
-die "ERROR in [assembly_hub_constructer.pl]: no input fasta file provided" unless(defined $fasta_file_path);
-die "ERROR in [assembly_hub_constructer.pl]: input fasta file not found" unless (-e $fasta_file_path);
-die "ERROR in [assembly_hub_constructer.pl]: no assembly hub destination path provided" unless(defined $assembly_hub_destination_path);
-die "ERROR in [assembly_hub_constructer.pl]: assembly hub destination path not writable" unless (-e $assembly_hub_destination_path);
-die "ERROR in [assembly_hub_constructer.pl]: no URL (network location for upload to UCSC) provided" unless(defined $base_URL);
-unless ($assembly_hub_destination_path =~ /\/$/){$assembly_hub_destination_path.= "/";}
-$log_path = $assembly_hub_destination_path . "Log/";
-unless (-d $log_path){my $cmd = "mkdir -p  $log_path"; system($cmd);}
-
-$assembly_hub_return_value = make_assembly_hub($fasta_file_path,$assembly_hub_destination_path,$base_URL,$log_path);
-
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-#^^^^^^^^^^^ Subroutines ^^^^^^^^^^#
-#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^#
-
-
-sub usage {
-  print <<EOF;
-Calculate normalized expression values (TPM/RPKM) for multicov data
-
-usage: $0 [options]
-program specific options:                                default:
- -i      <file>                                          ($fasta_file_path)
- -a      <path>   output directory                       ($assembly_hub_destination_path)
- -b      <url>    URL - network location for upload to UCSC                            ($base_URL)
- -help            print this information
-
-EOF
-exit;
-}