From 568ba3b2cb321ffd0aa9beda8cbec3e701746bd8 Mon Sep 17 00:00:00 2001 From: Meg Staton Date: Thu, 23 Jan 2014 16:04:36 -0500 Subject: [PATCH] adding comments --- hwg_gssr_scripts/find_dup_begins.pl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/hwg_gssr_scripts/find_dup_begins.pl b/hwg_gssr_scripts/find_dup_begins.pl index 5120bbe..48c0994 100755 --- a/hwg_gssr_scripts/find_dup_begins.pl +++ b/hwg_gssr_scripts/find_dup_begins.pl @@ -27,7 +27,7 @@ # # Output: # ------ -# Four output files are produced: +# Four output files are produced, in the same directory as the input files: # # .uniq # A fastq file with forward sequences that do not share duplicated beginning @@ -58,8 +58,17 @@ # The length of the subsequence samples, currently set at 20, can be modified # with the $LEN variable below. # -# Main caveat: if the duplicated region is shifted by even one base in either -# direction, this script will not detect it. +# WARNING: This script puts the entire forward file into memory in a hash, so +# use with caution on large files. +# +# Needed improvements and caveat emptors: +# - if the duplicated region is shifted by even one base in either +# direction, this script will not detect it. +# +# - memory requirements are ridiculous, a better strategy is needed +# +# - the output files are put wherever the input files are because the path +# is not parsed off, could be fixed with File::Basename # # #--------------------------------------------------------------------------------- @@ -192,7 +201,7 @@ print "total pairs examined: $count\n"; print "pairs with dup beginning: $dups\n"; print "pairs with uniq beginning: $uniq2\n"; -print "pct: $pct\n"; +print "percent duplicates: $pct %\n"; ################################################################################