diff --git a/hwg_gssr_scripts/findSSRs_post_assembly.pl b/hwg_gssr_scripts/findSSRs_post_assembly.pl
index 6038dbf..b026165 100755
--- a/hwg_gssr_scripts/findSSRs_post_assembly.pl
+++ b/hwg_gssr_scripts/findSSRs_post_assembly.pl
@@ -34,7 +34,7 @@
 # Output:
 # ------
 # .ssr.fasta
-# A fasta file with sequences with a SSR. (Compound SSRs are not considered)
+# A fasta file with sequences with a SSR. (Sequences with compound SSRs are included)
 #
 # .ssr_stats.txt
 # A text file of statistics about the SSRs discovered.
@@ -221,13 +221,14 @@ sub main{
 
 	$p3_output = "$fasta_file.p3out.txt";
 	$ssr_out = "$fasta_file.ssr_report.txt";
-	$ssr_xlsx = "$fasta_file.ssr_report.xlsx";
 	$fasta_out = "$fasta_file.ssr.fasta";
 	$stats_out = "$fasta_file.ssr_stats.txt";
 	$di_primer_out = "$fasta_file.di_primer_report.txt";
 	$tri_primer_out = "$fasta_file.tri_primer_report.txt";
 	$tetra_primer_out = "$fasta_file.tetra_primer_report.txt";
 
+	$ssr_xlsx = "$fasta_file.ssr_report.xlsx";
+
 	##---------------------------------------------------------------
 	print "finding SSRs...\n";
 	process_file($fasta_file, $masked_file);
@@ -245,11 +246,9 @@ sub main{
 
 	##---------------------------------------------------------------
 	## Producing output - Fasta files and flat files
-	#($fasta_out);
-
-	#print "printing output files...";
-	#create_primer_flat_files ($di_primer_out, $tri_primer_out, $tetra_primer_out);
-
+	print "printing output files...";
+	create_flat_files($ssr_out, $di_primer_out, $tri_primer_out, $tetra_primer_out);
+	create_fasta_file($fasta_out);
 
 	##---------------------------------------------------------------
 	## Producing output - statistics
@@ -523,13 +522,13 @@ sub flag_multiSSRs{
 			## this contig has only one ssr
 			my $start_index = $starts[0];
 			my $ssr_id = $contig."_ssr".$start_index;
-			$SSR_STATS{$ssr_id}{MULTI} = "False";
+			$SSR_STATS{$ssr_id}{MULTI} = "FALSE";
 		}
 		else{
 			## this contig has multiple ssrs
 			foreach my $start_index (@starts){
 				my $ssr_id = $contig."_ssr".$start_index;
-				$SSR_STATS{$ssr_id}{MULTI} = "True";
+				$SSR_STATS{$ssr_id}{MULTI} = "TRUE";
 			}
 		}
 	}
@@ -652,49 +651,38 @@ sub parseP3_output{
 	print "total identical primers: $identical_primer_cnt\n";
 }
 
-################################################################
-#sub printFasta{
-#	my $fasta_out = shift;
-#
-#	# this subroutine accomplishes two things
-#	# 1. adds a MULTI flag to the data hash indicating if the
-#	#    ssr is the only one in the sequence or one of many
-#	# 2. prints a fasta file with sequences with a single ssr
-#	#    and another with sequences with multiple ssrs
-#
-#	open FASTA, ">$fasta_out";
-#
-#	foreach my $contig (keys %CONTIG_SSR_STARTS){
-#		my @starts = @{ $CONTIG_SSR_STARTS{$contig}};
-#		if(@starts == 1){
-#			## this contig has only one ssr
-#			my $start_index = $starts[0];
-#			my $ssr_id = $contig."_ssr".$start_index;
-#			$SSR_STATS{$ssr_id}{MULTI} = "False";
-#			#print "\t$ssr_id:FALSE\n";
-#			print FASTA ">$contig ".
-#				"($SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END})\n".
-#				"$SSR_STATS{$ssr_id}{SEQ}\n";
-#		}
-#		else{
-#			## this contig has multiple ssrs
-#			print FASTA ">$contig (";
-#			foreach my $start_index (@starts){
-#				my $ssr_id = $contig."_ssr".$start_index;
-#				$SSR_STATS{$ssr_id}{MULTI} = "True";
-#				#print "\t$ssr_id:TRUE\n";
-#				print FASTA "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} ";
-#			}
-#			#get the first ssr index just so we can get the sequence
-#			my $start_index = $starts[0];
-#			my $ssr_id = $contig."_ssr".$start_index;
-#			print FASTA ")\n ";
-#			print FASTA "$SSR_STATS{$ssr_id}{SEQ}\n";
-#		}
-#	}
-#	close FASTA;
-#
-#}
+sub create_flat_files{
+	# stub: only unpacks the four report paths; nothing is written yet
+	my $ssr_out = shift;
+	my $di_primer_out = shift;
+	my $tri_primer_out = shift;
+	my $tetra_primer_out = shift;
+}
+
+# Writes one record per contig in %CONTIG_SSR_STARTS to $fasta_out: a header listing every SSR as START-END ("*Compound" follows compound SSRs), then the sequence. Dies if the file cannot be opened or closed.
+sub create_fasta_file{
+	my $fasta_out = shift;
+	open(my $fasta_fh, '>', $fasta_out) or die "Cannot open $fasta_out for writing: $!\n";
+	foreach my $contig (keys %CONTIG_SSR_STARTS){
+		my @starts = @{ $CONTIG_SSR_STARTS{$contig}};
+		print $fasta_fh ">$contig (";
+		foreach my $start_index (@starts){
+			my $ssr_id = $contig."_ssr".$start_index;
+			print $fasta_fh "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} ";
+			# compare as strings: numeric == turns "TRUE", "FALSE" and undef all into 0, so it always matched
+			my $compound = $SSR_STATS{$ssr_id}{COMPOUND};
+			if(defined $compound && $compound eq 'TRUE'){
+				print $fasta_fh "*Compound ";
+			}
+		}
+		#use the first ssr of the contig just so we can look up the sequence
+		my $first_ssr_id = $contig."_ssr".$starts[0];
+		# NOTE(review): ")\n " leaves a leading space on the sequence line - confirm this is intended
+		print $fasta_fh ")\n ";
+		print $fasta_fh "$SSR_STATS{$first_ssr_id}{SEQ}\n";
+	}
+	close($fasta_fh) or die "Cannot close $fasta_out: $!\n";
+}
 
 ################################################################
 