diff --git a/hwg_gssr_scripts/findSSRs_post_assembly.pl b/hwg_gssr_scripts/findSSRs_post_assembly.pl index caf6956..f519ea4 100755 --- a/hwg_gssr_scripts/findSSRs_post_assembly.pl +++ b/hwg_gssr_scripts/findSSRs_post_assembly.pl @@ -230,21 +230,14 @@ sub main{ ##--------------------------------------------------------------- print "finding SSRs...\n"; - open(PI3, ">$p3_input") or die $!; - my $p3_input_fh = *PI3; process_file($fasta_file, $masked_file); - close PI3; - - collapse_compound_SSRs(); - # this subroutine accomplishes two things - # 1. adds a MULTI flag to the data hash indicating if the - # ssr is the only one in the sequence or one of many - # 2. prints a fasta file with sequences with a single ssr - # and another with sequences with multiple ssrs - print "identifying sequences with >1 SSR..."; - flag_multiSSRs($fasta_out); + collapse_compound_ssrs(); + flag_multiSSRs(); print "done.\n"; + #open(PI3, ">$p3_input") or die $!; + #my $p3_input_fh = *PI3; + #close PI3; # print "running primer3...\n"; # print "$PRIMER3 < $p3_input > $p3_output\n"; @@ -256,6 +249,7 @@ sub main{ ##--------------------------------------------------------------- ## Producing output - Fasta files and flat files + #($fasta_out); #print "printing output files..."; #create_primer_flat_files ($di_primer_out, $tri_primer_out, $tetra_primer_out); @@ -434,16 +428,9 @@ sub process_ssr{ my $num_of_repeats = ($end_index-$start_index)/$motif_len; my $ssr_id = $contig_name."_ssr".$start_index; - #print "$contig_name $ssr $motif\n"; - #print "Start: $start_index\n"; - #print "End: $end_index\n"; - ##------------------------------------- ## store in data structures - $SSR_COUNT++; - $MOTIFLEN{$motif_len}++; - if(exists $CONTIG_SSR_STARTS{$contig_name}){ push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index; } @@ -458,25 +445,115 @@ sub process_ssr{ $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats; $SSR_STATS{$ssr_id}{SEQ} = $seq; $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; + $SSR_STATS{$ssr_id}{COMPOUND} = "FALSE"; #assume its not until proven otherwise + - foreach my $group (keys %MOTIFS) { - my $motifUC = uc($motif); - if($group =~ /\|$motifUC\|/){ - # If this group contains this motif - #print "Incrementing $group for $motif\n"; - $MOTIFS{$group}++; +} + +####################################################### +sub collapse_compound_ssrs{ + foreach my $contig (keys %CONTIG_SSR_STARTS){ + my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; + if(@starts > 1){ + ## this contig has multiple ssrs + my $previous_start = -1; + my $previous_end = -1; + my $previous_ssr_id = ""; + + my $current_start = -1; + my $current_end = -1; + my $current_ssr_id = ""; + + foreach $current_start (sort {$a <=> $b} @starts){ + $current_ssr_id = $contig."_ssr".$current_start; + $current_end = $SSR_STATS{$current_ssr_id}{END}; + + if(too_close($previous_start, $previous_end, $current_start, $current_end)){ + collapse($contig, $previous_ssr_id, $current_ssr_id); + } + + $previous_start = $current_start; + $previous_end = $current_end; + $previous_ssr_id = $current_ssr_id; + } } } +} +################################################################ +sub too_close{ + my $previous_start = shift; + my $previous_end = shift; + my $current_start = shift; + my $current_end = shift; + + # if start is a -1, then go ahead and return ok + if($previous_start == -1){ + return 0; + } + # we want to know if they overlap, abut or are less than 15 bases apart + elsif($previous_end >= $current_start || + ($current_start - $previous_end) < 15){ + return 1; + } + else{ + return 0; + } } +################################################################ +sub collapse{ + my $contig = shift; + my $first_ssr_id = shift; + my $second_ssr_id = shift; + + ##fix SSR_STATS + $SSR_STATS{$first_ssr_id}{MOTIF} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{END} = $SSR_STATS{$second_ssr_id}{END}; + $SSR_STATS{$first_ssr_id}{MOTIF_LENGTH} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{NO_REPEATS} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{COMPOUND} = "TRUE"; #assume its not until proven otherwise + + delete $SSR_STATS{$second_ssr_id}; + print "deleting $second_ssr_id, part of compound ssr\n"; + + ##fix CONTIG_SSR_STARTS + # get rid of the start index for the second ssr (it is now part of the + # first ssr) + + my $index = 0; + my $second_ssr_start = $SSR_STATS{$second_ssr_id}{START}; + $index++ until $CONTIG_SSR_STARTS{$contig}[$index] == $second_ssr_start; + splice(@{$CONTIG_SSR_STARTS{$contig}}, $index, 1); -####################################################### -#sub collapse_compound_ssrs{ -#} -# -# +} + + +################################################################ +sub flag_multiSSRs{ + # adds a MULTI flag to the data hash indicating if the + # ssr is the only one in the sequence or one of many + + foreach my $contig (keys %CONTIG_SSR_STARTS){ + my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; + if(@starts == 1){ + ## this contig has only one ssr + my $start_index = $starts[0]; + my $ssr_id = $contig."_ssr".$start_index; + $SSR_STATS{$ssr_id}{MULTI} = "False"; + } + else{ + ## this contig has multiple ssrs + foreach my $start_index (@starts){ + my $ssr_id = $contig."_ssr".$start_index; + $SSR_STATS{$ssr_id}{MULTI} = "True"; + } + } + } + close FASTA; + +} ################################################################ -#sub flag_multiSSRs{ +#sub printFasta{ # my $fasta_out = shift; # # # this subroutine accomplishes two things @@ -733,6 +810,14 @@ sub process_ssr{ # # #} + #foreach my $group (keys %MOTIFS) { + # my $motifUC = uc($motif); + # if($group =~ /\|$motifUC\|/){ + # # If this group contains this motif + # #print "Incrementing $group for $motif\n"; + # $MOTIFS{$group}++; + # } + #} ## ## $di_worksheet->write("A$di_index", $contig, $formats->{text}); ## $di_worksheet->write("B$di_index", $motif, $formats->{text});