diff --git a/hwg_gssr_scripts/findSSRs_post_assembly.pl b/hwg_gssr_scripts/findSSRs_post_assembly.pl index b7c8f88..624757c 100755 --- a/hwg_gssr_scripts/findSSRs_post_assembly.pl +++ b/hwg_gssr_scripts/findSSRs_post_assembly.pl @@ -366,106 +366,137 @@ sub process_seq{ # holds the maximum number of repeats my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1; # the regular expression for looking for SSRs - my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,})"; + my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,$max_number_of_repeats})"; - ## LOOPC # run through the sequence and check for this motif spec while ($seq =~ /$regex/ig) { # Get the ssr and motif that were found my $ssr = $1; my $motif = lc $2; + my $start_index = $-[0]; + my $end_index = $+[0]; - ## initialize the repeats varaible to (motifLength - 1) - ## or 1 if motif length - 1 equals 0; - my $repeats = $motifLength - 1; - $repeats = 1 if(!$repeats); + if(quality_check_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq)){ + process_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq, $seq_masked, $out_fh, $p3_input_fh); - my $noRepeats = ((length $ssr)/$motifLength); - my $ssrEnd = pos $seq; - my $ssrStart = ($ssrEnd - (length $ssr) + 1); + } - ## LOOP D - check to make sure the motif isn't the same letter over and over again AND we don't have too many repeats + } + } +} - if (!($motif =~ /([gatc])\1{$repeats}/) && $noRepeats <= $max_number_of_repeats) { - ## LOOPE - # Only store the information if we have never - # seen this starting position before - # or anohter ssr starts within 2 bases of this one - if (!exists $seen{$contig_name."_ssr".$ssrStart} && - !exists $seen{$contig_name."_ssr".($ssrStart-1)} && - !exists $seen{$contig_name."_ssr".($ssrStart-2)} && - !exists $seen{$contig_name."_ssr".($ssrStart+1)} && - !exists $seen{$contig_name."_ssr".($ssrStart+2)} - ) { +sub quality_check_ssr{ + my $contig_name = shift; + my $ssr = shift; + my $motif = shift; + my $start_index = shift; + my $end_index = shift; + my $seq = shift; + + ##------------------------------------- + ## CHECKS to see if this is a good ssr + my $flag_same_base = 0; + my $flag_already_seen = 0; + my $flag_too_close_to_end = 0; + + ## Check #1 + ## ignore SSRs that are the same base repeated + if ($ssr !~ /^g+$/i && + $ssr !~ /^a+$/i && + $ssr !~ /^c+$/i && + $ssr !~ /^t+$/i ) { + $flag_same_base = 1; + } + + # Check #2 + # Make sure this isn't an already called SSR in disguise + # (a dinucleotide repeat posing as a tetranucleotide repeat, for instance) + if (!exists $SSR_STATS{$contig_name."_ssr".$start_index} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index-1)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index-2)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index+1)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index+2)} + ) { + $flag_already_seen = 1; + } + + # Check #3 + # Distance from end + my $seqLen = length $seq; + if($start_index >= $LENGTH_FROM_END && $end_index <= ($seqLen-$LENGTH_FROM_END)){ + $flag_too_close_to_end = 1; + } + + if($flag_same_base && $flag_already_seen && $flag_too_close_to_end){ + return 1; + } + else{ + return 0; + } - # LOOP F - # Check to make sure the SSR is not within $LENGTH_FROM_END bp of either end of the sequence - my $seqLen = length $seq; - if($ssrStart >= $LENGTH_FROM_END && $ssrEnd <= ($seqLen-$LENGTH_FROM_END)){ +} - ## FOUND A SSR TO REPORT - my $ssr_id = $contig_name."_ssr".$ssrStart; - $seen{$ssr_id} = 1; - #print "$contig_name\tSSR $ssr"; - #print "\tmotif $motif"; - #print "\tnoRepeats $noRepeats"; - #print "\tssrStart $ssrStart\n"; - if(exists $CONTIG_SSR_STARTS{$contig_name}){ - push @{ $CONTIG_SSR_STARTS{$contig_name} }, $ssrStart; - } - else{ - $CONTIG_SSR_STARTS{$contig_name} = [$ssrStart]; - } +sub process_ssr{ + my $contig_name = shift; + my $ssr = shift; + my $motif = shift; + my $start_index = shift; + my $end_index = shift; + my $seq = shift; + my $seq_masked = shift; + my $out_fh = shift; + my $p3_input_fh = shift; - $SSR_COUNT++; - #print "$contig_name\t$ssr_id\t$motif\t$noRepeats\t$ssrStart\t$ssrEnd\n"; - printf $out_fh ("$contig_name\t$motif\t$noRepeats\t$ssrStart\t$ssrEnd\n"); - # change from soft mask to hard mask - $seq_masked =~ s/[actg]/N/g; - _addToPrimer3InputFile($p3_input_fh, $contig_name, $ssrStart, $ssrEnd, $seq_masked); - - # Store SSR_STATS - $SSR_STATS{$ssr_id}{MOTIF} = $motif; - $SSR_STATS{$ssr_id}{START} = $ssrStart; - $SSR_STATS{$ssr_id}{END} = $ssrEnd; - $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motifLength; - $SSR_STATS{$ssr_id}{NO_REPEATS} = $noRepeats; - $SSR_STATS{$ssr_id}{SEQ} = $seq; - $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; - - my $motiflen = length $motif; - my $tmp = $MOTIFLEN{$motiflen}; - $tmp++; - $MOTIFLEN{$motiflen} = $tmp; - - # Increment motif count - foreach my $group (keys %MOTIFS) { - my $motifUC = uc($motif); - if($group =~ /\|$motifUC\|/){ - # If this group contains this motif - #print "Incrementing $group for $motif\n"; - my $tmp = $MOTIFS{$group}++; - $tmp++; - $MOTIFS{$group} = $tmp; - } - }# end foreach $group - } ## LOOP F + ##------------------------------------- + ## generate a few more stats and variables + my $motif_len = length $motif; + my $num_of_repeats = ($end_index-$start_index)/$motif_len; + my $ssr_id = $contig_name."_ssr".$start_index; - ## LOOPE - } + #print "$contig_name $ssr $motif\n"; + #print "Start: $start_index\n"; + #print "End: $end_index\n"; - # LOOPD - } + ##------------------------------------- + ## store in data structures - ## LOOPC - } # end while (sequence =~ /$regex/ig); + $SSR_COUNT++; + $MOTIFLEN{$motif_len}++; + + if(exists $CONTIG_SSR_STARTS{$contig_name}){ + push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index; + } + else{ + $CONTIG_SSR_STARTS{$contig_name} = [$start_index]; + } + + $SSR_STATS{$ssr_id}{MOTIF} = $motif; + $SSR_STATS{$ssr_id}{START} = $start_index; + $SSR_STATS{$ssr_id}{END} = $end_index; + $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motif_len; + $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats; + $SSR_STATS{$ssr_id}{SEQ} = $seq; + $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; + + foreach my $group (keys %MOTIFS) { + my $motifUC = uc($motif); + if($group =~ /\|$motifUC\|/){ + # If this group contains this motif + #print "Incrementing $group for $motif\n"; + $MOTIFS{$group}++; + } + } + + ##------------------------------------- + # print to text file of all repeats found and primer 3 input file + + printf $out_fh ("$contig_name\t$motif\t$num_of_repeats\t$start_index\t$end_index\n"); + _addToPrimer3InputFile($p3_input_fh, $contig_name, $start_index, $end_index, $seq_masked); - ## LOOPB - } # end for $index (0 .. (scalar @{$MOTIF_SPECS} - 1)) } @@ -477,6 +508,9 @@ sub _addToPrimer3InputFile{ my $ssrEnd = shift; my $seq = shift; + # change from soft mask to hard mask + $seq =~ s/[actg]/N/g; + my $len = $ssrEnd-$ssrStart; printf $p3_input_fh ("SEQUENCE_ID=$contig_name\_ssr$ssrStart\n");