Permalink
Browse files

Rethinking things... changed output file description (not yet changed…

… in code), removed constraint about SSRs being a certain length from end, changed default parameter values.
  • Loading branch information...
1 parent 1493413 commit eb2f61996070fbbd188b1d0380f0324e02dd263b mestato committed Jul 7, 2015
Showing with 19 additions and 43 deletions.
  1. +19 −43 hwg_gssr_scripts/findSSRs_post_assembly.pl
@@ -36,53 +36,40 @@
# Seven output files are produced:
#
# <input-file-name>.ssr.fasta
-# A fasta file with sequences with a single identified SSR.
-#
-# <input-file-name>.ssr_multi_seqs.fasta
-# A fasta with sequences with more than one identified SSR.
+# A fasta file with sequences with an SSR. (Compound SSRs are not considered.)
#
# <input-file-name>.ssr_stats.txt
# A text file of statistics about the SSRs discovered.
#
# <input-file-name>.ssr_report.txt
# A tab-delimited file with each SSR. The columns are sequence name,
-# motif, number of repeats, start position and end position.
+# motif, number of repeats, start position, end position, compound (T/F).
#
# <input-file-name>.ssr_report.xlsx
# An Excel file with SSR results and stats
#
# <input-file-name>.di_primer_report.txt
-# A tab-delimited file with sequences with a 2-bp SSR motif. Columns are
-# sequence name, motif, start position, end position, left primer,
-# right primer, left primer Tm, right primer Tm, amplicon size, full
-# sequence, masked sequence
-#
# <input-file-name>.tri_primer_report.txt
-# A tab-delimited file with sequences with a 3-bp SSR motif. Columns are
-# sequence name, motif, start position, end position, left primer,
-# right primer, left primer Tm, right primer Tm, amplicon size, full
-# sequence, masked sequence
-#
# <input-file-name>.tetra_primer_report.txt
-# A tab-delimited file with sequences with a 4-bp SSR motif. Columns are
+# Tab-delimited files, one per SSR motif length (2, 3 and 4 bp). Columns are
# sequence name, motif, start position, end position, left primer,
-# right primer, left primer Tm, right primer Tm, amplicon size, full
-# sequence, masked sequence
-#
+# right primer, left primer Tm, right primer Tm, amplicon size
#
# Details:
# -------
# By default the script finds:
-# 2 bp motifs repeated from 8 to 40 times,
-# 3 bp motifs repeated from 7 to 30 times,
-# 4 bp motifs repeated from 6 to 20 times,
-#
-# The script only reports SSRs that are not within 15 bases of either
-# end of the sequence, in order to allow for primer design.
+# 2 bp motifs repeated from 8 to 200 times,
+# 3 bp motifs repeated from 7 to 133 times,
+# 4 bp motifs repeated from 6 to 100 times,
#
# These parameters may be changed in the "GLOBAL PARAMETERS" part of
# the script.
#
+# Compound SSRs are defined as any SSRs that abut or are less than 15 bases
+# apart. SSRs this close together are effectively compound for the purposes
+# of mapping, because it is unlikely that primers can be designed between
+# the repeat segments.
+#
use strict;
@@ -109,13 +96,9 @@
our $MIN_REPS_3bp = 7;
our $MIN_REPS_4bp = 6;
-our $MAX_REPS_2bp = 40;
-our $MAX_REPS_3bp = 30;
-our $MAX_REPS_4bp = 20;
-
-# SSRs at the beginning or end of a sequence prevents proper primers design.
-# This is how close we will allow an SSR to be to the ends of the sequence.
-our $LENGTH_FROM_END = 15;
+our $MAX_REPS_2bp = 200;
+our $MAX_REPS_3bp = 133;
+our $MAX_REPS_4bp = 100;
#------------
# PRIMER PARAMETERS
@@ -125,11 +108,11 @@
my $PRIMER_OPT_SIZE="20"; # default 20
my $PRIMER_MIN_SIZE="18"; # default 18
-my $PRIMER_MAX_SIZE="25"; # default 27
+my $PRIMER_MAX_SIZE="27"; # default 27
my $PRIMER_NUM_NS_ACCEPTED = "0"; # default 0
-my $PRIMER_PRODUCT_SIZE_RANGE = "100-200";
+my $PRIMER_PRODUCT_SIZE_RANGE = "100-450";
my $PRIMER_OPT_TM = "60.0";
my $PRIMER_MIN_TM = "55.0";
@@ -401,6 +384,7 @@ sub process_seq{
+###############################################################
sub quality_check_ssr{
my $contig_name = shift;
my $ssr = shift;
@@ -413,7 +397,6 @@ sub quality_check_ssr{
## CHECKS to see if this is a good ssr
my $flag_same_base = 0;
my $flag_already_seen = 0;
- my $flag_too_close_to_end = 0;
## Check #1
## ignore SSRs that are the same base repeated
@@ -436,14 +419,7 @@ sub quality_check_ssr{
$flag_already_seen = 1;
}
- # Check #3
- # Distance from end
- my $seqLen = length $seq;
- if($start_index >= $LENGTH_FROM_END && $end_index <= ($seqLen-$LENGTH_FROM_END)){
- $flag_too_close_to_end = 1;
- }
-
- if($flag_same_base && $flag_already_seen && $flag_too_close_to_end){
+ if($flag_same_base && $flag_already_seen){
return 1;
}
else{

0 comments on commit eb2f619

Please sign in to comment.