Permalink
Newer
Older
100755 1108 lines (913 sloc) 34.8 KB
Jan 23, 2014 @mestato adding post assembly script
1 #!/usr/bin/env perl
2 ################################################################################
3 # Author: Meg Staton & Stephen Ficklin
4 #
5 # DESCRIPTION
6 # -----------
7 # This script identifies simple sequence repeats (SSRs) and calls primers from
8 # the sequences in a fastq formatted file.
9 #
10 # Dependencies:
11 # ------------
12 # Perl must have access to the packages:
13 # Getopt::Long
14 # Bio::SeqIO
15 # Excel::Writer::XLSX
16 # All are available from CPAN.
17 #
18 # Also path to the primer3 executable and primer3 config files must be specified
19 # in the global variables section of the script.
Jul 3, 2015 Fixing spaces.
20 #
Jan 23, 2014 @mestato adding post assembly script
21 # Usage:
22 # -----
23 # Usage: findSSRs.pl <arguments>
24 #
25 # The list of arguments includes:
Jul 3, 2015 Fixing spaces.
26 #
Jan 23, 2014 @mestato adding post assembly script
27 # -f|--fasta_file <fasta_file>
Jul 3, 2015 Fixing spaces.
28 # Required. The file of the sequences to be searched.
29 #
Mar 10, 2014 The usage instructions now make it clear that masked file is required…
30 # -m|--masked_file <masked_fasta_file>
31 # Required. A soft-masked version of the fasta file (soft masked means low
32 # complexity sequences are in lower case bases.)
Jul 3, 2015 Fixing spaces.
33 #
Jan 23, 2014 @mestato adding post assembly script
34 # Output:
35 # ------
36 # <input-file-name>.ssr.fasta
Jul 7, 2015 Fasta file output done.
37 # A fasta file with sequences with a SSR. (Sequences with compound SSRs are included)
Jan 23, 2014 @mestato adding post assembly script
38 #
39 # <input-file-name>.ssr_stats.txt
40 # A text file of statistics about the SSRs discovered.
41 #
42 # <input-file-name>.ssr_report.txt
Jul 7, 2015 Added number of repeats to ssr flat file
43 # A tab-delimited file with each SSR. The columns are SSR ID,
44 # motif, number of repeats, start position, end position.
Jan 23, 2014 @mestato adding post assembly script
45 #
46 # <input-file-name>.ssr_report.xlsx
47 # A excel file with SSR results and stats
48 #
49 # <input-file-name>.di_primer_report.txt
50 # <input-file-name>.tri_primer_report.txt
51 # <input-file-name>.tetra_primer_report.txt
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
52 # Tab-delimited files with sequences with a specified SSR motif length. Columns are
Jul 7, 2015 Added number of repeats to ssr flat file
53 # SSR ID, motif, number of repeats, start position, end position, left primer,
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
54 # right primer, left primer Tm, right primer Tm, amplicon size
Jan 23, 2014 @mestato adding post assembly script
55 #
Jul 3, 2015 Fixing spaces.
56 # Details:
Jan 23, 2014 @mestato adding post assembly script
57 # -------
58 # By default the script finds:
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
59 # 2 bp motifs repeated from 8 to 200 times,
60 # 3 bp motifs repeated from 7 to 133 times,
61 # 4 bp motifs repeated from 6 to 100 times,
Jan 23, 2014 @mestato adding post assembly script
62 #
Jul 3, 2015 Fixing spaces.
63 # These parameters may be changed in the "GLOBAL PARAMETERS" part of
Jan 23, 2014 @mestato adding post assembly script
64 # the script.
65 #
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
66 # Compound SSRs are defined as any SSRs that abut or are less than 15 bases
67 # apart. These are essentially compound SSRs for the purposes of mapping
68 # because it is unlikely that primers can be designed between the repeat
69 # segments.
70 #
Jan 23, 2014 @mestato adding post assembly script
71
72
73 use strict;
74
75 #-------------------------------------------------------------------------------
76 # DEPENDENCIES
77 #-------------------------------------------------------------------------------
78
79 use Getopt::Long;
80 use Bio::SeqIO;
81 use Excel::Writer::XLSX;
82
83 #-------------------------------------------------------------------------------
84 # GLOBAL PARAMETERS
85 #-------------------------------------------------------------------------------
86
87 #--------------
88 # REPEAT IDENTIFICATION PARAMETERS
Jul 3, 2015 Fixing spaces.
89 # Specify Motif Frequency
Jan 23, 2014 @mestato adding post assembly script
90 # Motifs that occur less frequently than indicated below will be ignored.
91 # A 0 indicates that this motif length will be ignored.
92
93 our $MIN_REPS_2bp = 8;
94 our $MIN_REPS_3bp = 7;
95 our $MIN_REPS_4bp = 6;
96
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
97 our $MAX_REPS_2bp = 200;
98 our $MAX_REPS_3bp = 133;
99 our $MAX_REPS_4bp = 100;
Jan 23, 2014 @mestato adding post assembly script
100
101 #------------
102 # PRIMER PARAMETERS
103
Mar 10, 2014 The usage instructions now make it clear that masked file is required…
104 my $PRIMER3 = "/lustre/projects/staton/software/primer3-2.3.6/src/primer3_core";
105 my $PRIMER3_CONFIG = "/lustre/projects/staton/software/primer3-2.3.6/src/primer3_config/";
Jan 23, 2014 @mestato adding post assembly script
106
107 my $PRIMER_OPT_SIZE="20"; # default 20
108 my $PRIMER_MIN_SIZE="18"; # default 18
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
109 my $PRIMER_MAX_SIZE="27"; # default 27
Jan 23, 2014 @mestato adding post assembly script
110
111 my $PRIMER_NUM_NS_ACCEPTED = "0"; # default 0
112
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
113 my $PRIMER_PRODUCT_SIZE_RANGE = "100-450";
Jan 23, 2014 @mestato adding post assembly script
114
Jul 3, 2015 Fixing spaces.
115 my $PRIMER_OPT_TM = "60.0";
Jan 23, 2014 @mestato adding post assembly script
116 my $PRIMER_MIN_TM = "55.0";
117 my $PRIMER_MAX_TM = "65.0";
118
119 my $PRIMER_MIN_GC = "40";
120 my $PRIMER_MAX_GC = "60";
121
122 my $PRIMER_MAX_POLY_X = "3";
123 my $PRIMER_GC_CLAMP = "2";
124
Jul 21, 2015 Added primer3 parameter to utilize masking.
125 my $PRIMER_LOWERCASE_MASKING = 1;
126
Jan 23, 2014 @mestato adding post assembly script
127 #-------------------------------------------------------------------------------
128 # GLOBAL HASHES
129 #-------------------------------------------------------------------------------
130 # This makes life much easier than passing a bunch of hash refs all over the place.
Jul 4, 2015 The method parseP3_output now deals solely with parsing primer3 outpu…
131 #
132 #-------------------------------------------------------------------------------
133 # Data structures:
134
135 ## CONTIG_SSR_STARTS structure:
136 ## key: contig_name
137 ## value: array of starts of SSRs in that contig
138 my %CONTIG_SSR_STARTS = ();
139
140 ## SSR_STATS structure:
141 ## key: ssr_id
142 ## value -> keys: MOTIF START END MOTIF_LENGTH NO_REPEATS
143 my %SSR_STATS = ();
144 my $SEQ_COUNT = 0;
Jan 23, 2014 @mestato adding post assembly script
145
Jul 4, 2015 The method parseP3_output now deals solely with parsing primer3 outpu…
146 # Set up the Motif specifications, based on the chosen motif types:.
147 my @MOTIF_SPECS;
148 push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']);
149 push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']);
150 push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']);
Jan 23, 2014 @mestato adding post assembly script
151
Jul 4, 2015 The method parseP3_output now deals solely with parsing primer3 outpu…
152 #-------------------------------------------------------------------------------
153 # Generating statistics
Jul 7, 2015 Statistics being calculated and output.
154 my $TIME = scalar localtime; # Get the current time
155 my $SEQ_w_SSRS = 0;
156 my $SSR_COUNT = 0;
157 my $SSR_COUNT_COMPOUND = 0;
158 my $SSR_COUNT_PRIMER = 0;
159
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
160 my %MOTIFLEN = (2 => 0,
161 3 => 0,
162 4 => 0);
Jul 7, 2015 Statistics being calculated and output.
163
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
164 my %MOTIFLEN_w_PRIMERS = (2 => 0,
165 3 => 0,
166 4 => 0);
Jan 23, 2014 @mestato adding post assembly script
167 my %MOTIFS = ('|AT|TA|' => 0,
168 '|AG|GA|CT|TC|' => 0,
169 '|AC|CA|TG|GT|' => 0,
170 '|GC|CG|' => 0,
171
172 '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0,
173 '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0,
174 '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0,
175
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
176 '|CCA|CAC|ACC|TGG|GTG|GGT|' => 0,
Jan 23, 2014 @mestato adding post assembly script
177 '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0,
178 '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0,
179
180 '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0,
181 '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0,
182 '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0,
183 '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0);
184
185
186
187 #-------------------------------------------------------------------------------
188 # EXECUTE
189 #-------------------------------------------------------------------------------
190 main();
191 #-------------------------------------------------------------------------------
192 # PUBLIC SUBROUTINES
193 #-------------------------------------------------------------------------------
194 # Function Name: main()
195
196 sub main{
Jul 7, 2015 cleaned up main method a bit
197
198 ##---------------------------------------------------------------
199 ## Get input parameters
Jan 23, 2014 @mestato adding post assembly script
200 my $fasta_file;
201 my $masked_file;
Mar 10, 2014 The usage instructions now make it clear that masked file is required…
202 my $project;
Jan 23, 2014 @mestato adding post assembly script
203
204 Getopt::Long::Configure ('bundling');
205 GetOptions('f|fasta_file=s' => \$fasta_file,
206 'm|masked_file=s' => \$masked_file,
Mar 10, 2014 The usage instructions now make it clear that masked file is required…
207 'p|project=s' => \$project);
Jan 23, 2014 @mestato adding post assembly script
208
209 ## Check that all required parameters have been included
210 if(!$fasta_file){ print "A fasta file is required.\n"; _printUsage(); exit;}
211 if(!$masked_file){ print "A masked file is required.\n"; _printUsage(); exit;}
212
213 ## Check that fasta file exists
214 if(! -e $fasta_file) { print "Fasta file $fasta_file does not exist\n"; exit; }
215 if(! -e $masked_file) { print "Masked file $masked_file does not exist\n"; exit; }
216
Jul 7, 2015 cleaned up main method a bit
217 ##---------------------------------------------------------------
218 ## Set up output files
219 my $p3_input = "$fasta_file.p3in.txt";
220 my $p3_output = "$fasta_file.p3out.txt";
221 my $ssr_out = "$fasta_file.ssr_report.txt";
222 my $fasta_out = "$fasta_file.ssr.fasta";
223 my $stats_out = "$fasta_file.ssr_stats.txt";
224 my $di_primer_out = "$fasta_file.di_primer_report.txt";
225 my $tri_primer_out = "$fasta_file.tri_primer_report.txt";
226 my $tetra_primer_out = "$fasta_file.tetra_primer_report.txt";
227 my $ssr_xlsx = "$fasta_file.ssr_report.xlsx";
Jul 7, 2015 Fasta file output done.
228
Jul 4, 2015 Changed name of function getContigHash to process_file. This makes mo…
229 ##---------------------------------------------------------------
Jan 23, 2014 @mestato adding post assembly script
230 print "finding SSRs...\n";
Jul 7, 2015 Back to basics. Control flow
231 process_file($fasta_file, $masked_file);
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
232 collapse_compound_ssrs();
233 flag_multiSSRs();
Jul 7, 2015 New subroutine flag_multiSSRs, to identify SSRs that are in contigs w…
234 print "done.\n";
235
Jul 7, 2015 parse primer3 output working again
236 ##---------------------------------------------------------------
Jul 7, 2015 primer3 input file being generated now
237 print "running primer3...\n";
238 addToPrimer3InputFile ($p3_input);
Jul 7, 2015 parse primer3 output working again
239 print "$PRIMER3 < $p3_input > $p3_output\n";
240 my $status = system("$PRIMER3 < $p3_input > $p3_output");
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
241 print "$status\n";
Jul 7, 2015 parse primer3 output working again
242 parseP3_output($p3_output);
243 print "done.\n";
Jul 7, 2015 Back to basics. Control flow
244
245 ##---------------------------------------------------------------
246 ## Producing output - Fasta files and flat files
Jul 7, 2015 Fasta file output done.
247 print "printing output files...";
248 create_flat_files($ssr_out, $di_primer_out, $tri_primer_out, $tetra_primer_out);
249 create_fasta_file($fasta_out);
Jul 7, 2015 Moved flag_multiSSRs above output_primer_flat_files b/c the latter de…
250
Jul 4, 2015 The method parseP3_output now deals solely with parsing primer3 outpu…
251 ##---------------------------------------------------------------
Jul 7, 2015 Separating out three types of output: fasta/flat files, statistics an…
252 ## Producing output - statistics
253
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
254 calculate_stats();
255 print_stats($stats_out);
Jul 7, 2015 New subroutine create_output_files now prints flat files
256
Jul 7, 2015 Separating out three types of output: fasta/flat files, statistics an…
257 ##---------------------------------------------------------------
258 ## Producing output - Excel
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
259 create_excel_file($ssr_xlsx, $project);
260 print "done.\n";
Mar 18, 2014 Added output fasta files with sequences in fasta format.
261
Jan 23, 2014 @mestato adding post assembly script
262 }
263
264 ###############################################################
265
Jul 4, 2015 Changed name of function getContigHash to process_file. This makes mo…
266 sub process_file{
Jan 23, 2014 @mestato adding post assembly script
267 my $fasta_file = $_[0]; # file name
268 my $masked_file = $_[1]; # file name
269
Jul 7, 2015 primer3 input file being generated now
270 my $seqio = Bio::SeqIO->new('-format' => 'fasta', -file => $fasta_file);
271 my $seqioM = Bio::SeqIO->new('-format' => 'fasta', -file => $masked_file);
Jan 23, 2014 @mestato adding post assembly script
272
273 # Get seq obj from io stream
Jul 7, 2015 primer3 input file being generated now
274 while(my $seqobj = $seqio->next_seq){
275 my $seqobjM = $seqioM->next_seq;
Jan 23, 2014 @mestato adding post assembly script
276
277 $SEQ_COUNT++;
278
Jul 7, 2015 primer3 input file being generated now
279 my $seqname = $seqobj->id; # get actual sequence as a string
280 my $seqnameM = $seqobjM->id; # get actual sequence as a string
Jan 23, 2014 @mestato adding post assembly script
281
Jul 7, 2015 primer3 input file being generated now
282 my $seqstr = $seqobj->seq(); # get actual sequence as a string
283 my $seqstrM = $seqobjM->seq(); # get actual sequence as a string
Jan 23, 2014 @mestato adding post assembly script
284
285 if($seqname ne $seqnameM){
286 die "masked sequence $seqnameM not in same order as regular sequence $seqname\n";
287 }
288
Jul 7, 2015 Back to basics. Control flow
289 process_seq($seqname, $seqstr, $seqstrM);
Jan 23, 2014 @mestato adding post assembly script
290 }
291
292 }
293
294 ###############################################################
295
296 sub process_seq{
297 my $contig_name = shift;
298 my $seq = shift;
299 my $seq_masked = shift;
300
301
302 my %seen; # used to keep track of start positions we've already seen
303 my $index; # used to iterate through the 2D motif specs array
304
305 ## LOOPB
306 # iterate through the motif specifications
307 for $index (0 .. (scalar @MOTIF_SPECS - 1)) {
308
309 # holds the motif size (1,2,3,4,5 or 6)
310 my $motifLength = $MOTIF_SPECS[$index][0];
311 # holds the minimum number of repeats
312 my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1;
313 # holds the maximum number of repeats
314 my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1;
315 # the regular expression for looking for SSRs
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
316 my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,$max_number_of_repeats})";
Jan 23, 2014 @mestato adding post assembly script
317
Jul 3, 2015 Fixing spaces.
318 # run through the sequence and check for this motif spec
Jan 23, 2014 @mestato adding post assembly script
319 while ($seq =~ /$regex/ig) {
320 # Get the ssr and motif that were found
321 my $ssr = $1;
322 my $motif = lc $2;
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
323 my $start_index = $-[0];
324 my $end_index = $+[0];
Jan 23, 2014 @mestato adding post assembly script
325
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
326 if(quality_check_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq)){
Jul 7, 2015 Back to basics. Control flow
327 process_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq, $seq_masked);
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
328
329 }
330
331 }
332 }
333 }
334
335
336
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
337 ###############################################################
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
338 sub quality_check_ssr{
339 my $contig_name = shift;
340 my $ssr = shift;
341 my $motif = shift;
342 my $start_index = shift;
343 my $end_index = shift;
344 my $seq = shift;
345
346 ##-------------------------------------
347 ## CHECKS to see if this is a good ssr
348 my $flag_same_base = 0;
349 my $flag_already_seen = 0;
350
351 ## Check #1
352 ## ignore SSRs that are the same base repeated
353 if ($ssr !~ /^g+$/i &&
354 $ssr !~ /^a+$/i &&
355 $ssr !~ /^c+$/i &&
356 $ssr !~ /^t+$/i ) {
357 $flag_same_base = 1;
358 }
359
360 # Check #2
361 # Make sure this isn't an already called SSR in disguise
362 # (a dinucleotide repeat posing as a tetranucleotide repeat, for instance)
363 if (!exists $SSR_STATS{$contig_name."_ssr".$start_index} &&
364 !exists $SSR_STATS{$contig_name."_ssr".($start_index-1)} &&
365 !exists $SSR_STATS{$contig_name."_ssr".($start_index-2)} &&
366 !exists $SSR_STATS{$contig_name."_ssr".($start_index+1)} &&
367 !exists $SSR_STATS{$contig_name."_ssr".($start_index+2)}
368 ) {
369 $flag_already_seen = 1;
370 }
371
Jul 7, 2015 Rethinking things... changed output file description (not yet changed…
372 if($flag_same_base && $flag_already_seen){
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
373 return 1;
374 }
375 else{
376 return 0;
377 }
378
379 }
380
381
382
Jul 7, 2015 Back to basics. Control flow
383 ######################################################
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
384 sub process_ssr{
385 my $contig_name = shift;
386 my $ssr = shift;
387 my $motif = shift;
388 my $start_index = shift;
389 my $end_index = shift;
390 my $seq = shift;
391 my $seq_masked = shift;
392
393 ##-------------------------------------
394 ## generate a few more stats and variables
395 my $motif_len = length $motif;
Jul 8, 2015 Last little bug - now correctly calculating number of repeats
396 my $num_of_repeats = ($end_index-$start_index)/$motif_len;
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
397 my $ssr_id = $contig_name."_ssr".$start_index;
398
399 ##-------------------------------------
400 ## store in data structures
401
402 if(exists $CONTIG_SSR_STARTS{$contig_name}){
403 push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index;
404 }
405 else{
406 $CONTIG_SSR_STARTS{$contig_name} = [$start_index];
407 }
408
409 $SSR_STATS{$ssr_id}{MOTIF} = $motif;
410 $SSR_STATS{$ssr_id}{START} = $start_index;
411 $SSR_STATS{$ssr_id}{END} = $end_index;
412 $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motif_len;
413 $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats;
414 $SSR_STATS{$ssr_id}{SEQ} = $seq;
415 $SSR_STATS{$ssr_id}{SEQM} = $seq_masked;
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
416 $SSR_STATS{$ssr_id}{COMPOUND} = 0; #assume its not until proven otherwise
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
417
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
418
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
419 }
420
421 #######################################################
422 sub collapse_compound_ssrs{
423 foreach my $contig (keys %CONTIG_SSR_STARTS){
424 my @starts = @{ $CONTIG_SSR_STARTS{$contig}};
425 if(@starts > 1){
426 ## this contig has multiple ssrs
427 my $previous_start = -1;
428 my $previous_end = -1;
429 my $previous_ssr_id = "";
430
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
431 foreach my $current_start (sort {$a <=> $b} @starts){
432 my $current_ssr_id = $contig."_ssr".$current_start;
433 my $current_end = $SSR_STATS{$current_ssr_id}{END};
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
434
435 if(too_close($previous_start, $previous_end, $current_start, $current_end)){
436 collapse($contig, $previous_ssr_id, $current_ssr_id);
437 }
438
439 $previous_start = $current_start;
440 $previous_end = $current_end;
441 $previous_ssr_id = $current_ssr_id;
442 }
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
443 }
444 }
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
445 }
Jul 3, 2015 Rewrote process_seq method, now is broken into process_seq, which cal…
446
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
447 ################################################################
448 sub too_close{
449 my $previous_start = shift;
450 my $previous_end = shift;
451 my $current_start = shift;
452 my $current_end = shift;
453
454 # if start is a -1, then go ahead and return ok
455 if($previous_start == -1){
456 return 0;
457 }
458 # we want to know if they overlap, abut or are less than 15 bases apart
459 elsif($previous_end >= $current_start ||
460 ($current_start - $previous_end) < 15){
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
461 #print "$previous_start - $previous_end, $current_start - $current_end\n";
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
462 return 1;
463 }
464 else{
465 return 0;
466 }
Jul 4, 2015 New subroutine, initiate_workbooks (mostly code from original parseP3…
467 }
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
468 ################################################################
469 sub collapse{
470 my $contig = shift;
471 my $first_ssr_id = shift;
472 my $second_ssr_id = shift;
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
473 my $second_ssr_start = $SSR_STATS{$second_ssr_id}{START};
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
474
475 ##fix SSR_STATS
476 $SSR_STATS{$first_ssr_id}{MOTIF} = "COMPOUND";
477 $SSR_STATS{$first_ssr_id}{END} = $SSR_STATS{$second_ssr_id}{END};
478 $SSR_STATS{$first_ssr_id}{MOTIF_LENGTH} = "COMPOUND";
479 $SSR_STATS{$first_ssr_id}{NO_REPEATS} = "COMPOUND";
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
480 $SSR_STATS{$first_ssr_id}{COMPOUND} = 1; #assume its not until proven otherwise
481
482 delete $SSR_STATS{$second_ssr_id}{MOTIF};
483 delete $SSR_STATS{$second_ssr_id}{START};
484 delete $SSR_STATS{$second_ssr_id}{END};
485 delete $SSR_STATS{$second_ssr_id}{MOTIF_LENGTH};
486 delete $SSR_STATS{$second_ssr_id}{NO_REPEATS};
487 delete $SSR_STATS{$second_ssr_id}{SEQ};
488 delete $SSR_STATS{$second_ssr_id}{SEQM};
489 delete $SSR_STATS{$second_ssr_id}{COMPOUND};
490 undef %{$SSR_STATS{$second_ssr_id}};
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
491 delete $SSR_STATS{$second_ssr_id};
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
492
493 print "\tdeleting $second_ssr_id, part of compound ssr\n";
494
495 if(exists $SSR_STATS{$second_ssr_id}){ print "\t still exists\n";}
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
496
497 ##fix CONTIG_SSR_STARTS
498 # get rid of the start index for the second ssr (it is now part of the
499 # first ssr)
500
501 my $index = 0;
502 $index++ until $CONTIG_SSR_STARTS{$contig}[$index] == $second_ssr_start;
503 splice(@{$CONTIG_SSR_STARTS{$contig}}, $index, 1);
Jul 4, 2015 New subroutine, initiate_workbooks (mostly code from original parseP3…
504
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
505 }
506
507
508 ################################################################
509 sub flag_multiSSRs{
510 # adds a MULTI flag to the data hash indicating if the
511 # ssr is the only one in the sequence or one of many
512
513 foreach my $contig (keys %CONTIG_SSR_STARTS){
514 my @starts = @{ $CONTIG_SSR_STARTS{$contig}};
515 if(@starts == 1){
516 ## this contig has only one ssr
517 my $start_index = $starts[0];
518 my $ssr_id = $contig."_ssr".$start_index;
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
519 $SSR_STATS{$ssr_id}{MULTI} = 0;
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
520 }
521 else{
522 ## this contig has multiple ssrs
523 foreach my $start_index (@starts){
524 my $ssr_id = $contig."_ssr".$start_index;
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
525 $SSR_STATS{$ssr_id}{MULTI} = 1;
Jul 7, 2015 Collapse compound ssrs and flag multi ssrs are now done.
526 }
527 }
528 }
529 close FASTA;
530
531 }
Jul 7, 2015 primer3 input file being generated now
532
533 ################################################################
534 sub addToPrimer3InputFile{
535 my $p3_file = shift;
536
537 open OUT, ">$p3_file";
538
539 foreach my $ssr_id (keys %SSR_STATS){
540 #skip compound SSRS
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
541 if($SSR_STATS{$ssr_id}{COMPOUND} == 0){
Jul 7, 2015 primer3 input file being generated now
542 my $ssrStart = $SSR_STATS{$ssr_id}{START};
543 my $ssrEnd = $SSR_STATS{$ssr_id}{END};
544 my $seq = $SSR_STATS{$ssr_id}{SEQM};
545
546 # change from soft mask to hard mask
547 $seq =~ s/[actg]/N/g;
548
549 my $len = $ssrEnd-$ssrStart;
550
551 printf OUT ("SEQUENCE_ID=$ssr_id\n");
552 printf OUT ("SEQUENCE_TEMPLATE=$seq\n");
553 printf OUT ("SEQUENCE_TARGET=$ssrStart,$len\n");
554 printf OUT ("PRIMER_TASK=generic\n");
555 printf OUT ("PRIMER_PICK_LEFT_PRIMER=1\n");
556 printf OUT ("PRIMER_PICK_INTERNAL_OLIGO=0\n");
557 printf OUT ("PRIMER_PICK_RIGHT_PRIMER=1\n");
558 printf OUT ("PRIMER_OPT_SIZE=$PRIMER_OPT_SIZE\n");
559 printf OUT ("PRIMER_MIN_SIZE=$PRIMER_MIN_SIZE\n");
560 printf OUT ("PRIMER_MAX_SIZE=$PRIMER_MAX_SIZE\n");
561 printf OUT ("PRIMER_NUM_NS_ACCEPTED=$PRIMER_NUM_NS_ACCEPTED\n");
562 printf OUT ("PRIMER_PRODUCT_SIZE_RANGE=$PRIMER_PRODUCT_SIZE_RANGE\n");
563 printf OUT ("PRIMER_OPT_TM=$PRIMER_OPT_TM\n");
564 printf OUT ("PRIMER_MIN_TM=$PRIMER_MIN_TM\n");
565 printf OUT ("PRIMER_MAX_TM=$PRIMER_MAX_TM\n");
566 printf OUT ("PRIMER_MIN_GC=$PRIMER_MIN_GC\n");
567 printf OUT ("PRIMER_MAX_GC=$PRIMER_MAX_GC\n");
568 printf OUT ("PRIMER_MAX_POLY_X=$PRIMER_MAX_POLY_X\n");
569 printf OUT ("PRIMER_GC_CLAMP=$PRIMER_GC_CLAMP\n");
570 printf OUT ("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=$PRIMER3_CONFIG\n");
Jul 21, 2015 Added primer3 parameter to utilize masking.
571 printf OUT ("PRIMER_LOWERCASE_MASKING=$PRIMER_LOWERCASE_MASKING\n");
Jul 7, 2015 primer3 input file being generated now
572 printf OUT ("=\n");
573 }
574 }
575 close OUT;
576 }
Jul 7, 2015 Back to basics. Control flow
577 ################################################################
Jul 7, 2015 parse primer3 output working again
578 sub parseP3_output{
579 my $p3_output = $_[0]; # file name
580
581 # We are going to keep track of a weird phenomenon only seen in one
582 # project - the generation of identical forward and reverse primers. The
583 # sequences from this project were overlapping paired ends that were joined.
584 # Apparently something went wrong and weird sequences were obtained, all of
585 # which yield the identical primers.
586 # This is not reported in the final stats, just as part of the standard output.
587 my $identical_primer_cnt = 0;
588
589 # The primers output file separates information about different sequences
590 # with an equal sign on a single line. So, we want to set the file line
591 # delimiter (for looping on the input file below) to a single equal sign
592 # followed by a line feed. This way were guranteed to have all the primer
593 # information together per line
594 local $/ = "=\n";
595
596 open (P3O, $p3_output) || die "could not open $_\n";
597
598 # Read in all of the lines of the input file
599 my $primer_record;
600 while ($primer_record = <P3O>) {
601 my $start = "";
602 my $seq_id = "";
603 my $ssr_id = "";
604 my $forward = "";
605 my $reverse = "";
606 my $product_size = "";
607 my $left_tm = "";
608 my $right_tm = "";
609
610 if ($primer_record =~ /SEQUENCE_ID=(\S+)/) {
611 $ssr_id = $1;
612 }
613 # get the primary primers only
614 if ($primer_record =~ /PRIMER_LEFT_0_SEQUENCE=(\S+)/) {
615 $forward = $1;
616 }
617 if ($primer_record =~ /PRIMER_RIGHT_0_SEQUENCE=(\S+)/) {
618 $reverse = $1;
619 }
620 if ($primer_record =~ /PRIMER_LEFT_0_TM=(\S+)/) {
621 $left_tm = $1;
622 }
623 if ($primer_record =~ /PRIMER_RIGHT_0_TM=(\S+)/) {
624 $right_tm = $1;
625 }
626 if ($primer_record =~ /PRIMER_PAIR_0_PRODUCT_SIZE=(\S+)/) {
627 $product_size = $1;
628 }
629
630 if(length $forward > 1){
631 if($forward eq $reverse){
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
632 print "\tFLAG: identical primer problem with $ssr_id\n";
Jul 7, 2015 parse primer3 output working again
633 $identical_primer_cnt++;
634 }
635 else{
636 $SSR_STATS{$ssr_id}{FORWARD} = $forward;
637 $SSR_STATS{$ssr_id}{REVERSE} = $reverse;
638 $SSR_STATS{$ssr_id}{PRODUCT_SIZE} = $product_size;
639 $SSR_STATS{$ssr_id}{LEFT_TM} = $left_tm;
640 $SSR_STATS{$ssr_id}{RIGHT_TM} = $right_tm;
641
642 }
643 }
644 }
645
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
646 print "\ttotal identical primers: $identical_primer_cnt\n";
Jul 7, 2015 parse primer3 output working again
647 }
648
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
649 ###################################################
Jul 7, 2015 Fasta file output done.
650 sub create_flat_files{
651 my $ssr_out = shift;
652 my $di_primer_out = shift;
653 my $tri_primer_out = shift;
654 my $tetra_primer_out = shift;
655
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
656 open OUTS, ">$ssr_out";
657 open OUT2, ">$di_primer_out";
658 open OUT3, ">$tri_primer_out";
659 open OUT4, ">$tetra_primer_out";
660
661 my $di_fh = *OUT2;
662 my $tri_fh = *OUT3;
663 my $tetra_fh = *OUT4;
664
665 ##printer headers
666 print OUTS join("\t", "SSR ID",
Jul 7, 2015 Added number of repeats to ssr flat file
667 "motif", "number of repeats", "start position", "end position");
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
668 print OUTS "\n";
669
670 print OUT2 join("\t", "SSR ID",
671 "motif", "number of repeats", "start position",
672 "end position", "forward primer", "reverse primer",
673 "forward Tm", "reverse Tm","product size" );
674 print OUT2 "\n";
675
676 print OUT3 join("\t", "SSR ID",
677 "motif", "number of repeats", "start position",
678 "end position", "forward primer", "reverse primer",
679 "forward Tm", "reverse Tm","product size" );
680 print OUT3 "\n";
681
682 print OUT4 join("\t", "SSR ID",
683 "motif", "number of repeats", "start position",
684 "end position", "forward primer", "reverse primer",
685 "forward Tm", "reverse Tm","product size" );
686 print OUT4 "\n";
687
688 foreach my $ssr_id (keys %SSR_STATS){
689 ## all ssrs including compound go in main ssr file
690 print OUTS join("\t",
691 $ssr_id,
692 $SSR_STATS{$ssr_id}{MOTIF},
Jul 7, 2015 Added number of repeats to ssr flat file
693 $SSR_STATS{$ssr_id}{NO_REPEATS},
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
694 $SSR_STATS{$ssr_id}{START},
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
695 $SSR_STATS{$ssr_id}{END},
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
696 );
697 print OUTS "\n";
698
699 # for primer flat files, only print SSRs with
700 # that have primers
701 if($SSR_STATS{$ssr_id}{COMPOUND} == 0 &&
702 $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/
703 ){
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
704 if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
705 _print_primer_flat_file_line($di_fh, $ssr_id);
706 }
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
707 elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
708 _print_primer_flat_file_line($tri_fh, $ssr_id);
709 }
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
710 elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
711 _print_primer_flat_file_line($tetra_fh, $ssr_id);
712 }
713 }
714 }
715 close OUTS;
716 close OUT2;
717 close OUT3;
718 close OUT4;
719
Jul 7, 2015 Fasta file output done.
720 }
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
721 ###################################################
722 sub _print_primer_flat_file_line{
723 my $fh = shift;
724 my $ssr_id = shift;
725
726 print $fh join("\t", $ssr_id,
727 $SSR_STATS{$ssr_id}{MOTIF},
728 $SSR_STATS{$ssr_id}{NO_REPEATS},
729 $SSR_STATS{$ssr_id}{START},
730 $SSR_STATS{$ssr_id}{END},
731 $SSR_STATS{$ssr_id}{FORWARD},
732 $SSR_STATS{$ssr_id}{REVERSE},
733 $SSR_STATS{$ssr_id}{LEFT_TM},
734 $SSR_STATS{$ssr_id}{RIGHT_TM},
735 $SSR_STATS{$ssr_id}{PRODUCT_SIZE},
736 );
737 print $fh "\n";
Jul 7, 2015 Fasta file output done.
738
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
739 }
740
741 ###################################################
Jul 7, 2015 Fasta file output done.
742 sub create_fasta_file{
743 my $fasta_out = shift;
744 open FASTA, ">$fasta_out";
745
746 foreach my $contig (keys %CONTIG_SSR_STARTS){
747 my @starts = @{ $CONTIG_SSR_STARTS{$contig}};
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
748 my $seq = "";
Jul 7, 2015 Fasta file output done.
749 print FASTA ">$contig (";
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
750 foreach my $start_index (sort {$a <=> $b} @starts){
Jul 7, 2015 Fasta file output done.
751 my $ssr_id = $contig."_ssr".$start_index;
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
752
753 #if($SSR_STATS{$ssr_id}{START} >= 0){
754 $seq = $SSR_STATS{$ssr_id}{SEQ};
Jul 7, 2015 Fasta file output done.
755 print FASTA "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} ";
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
756 if($SSR_STATS{$ssr_id}{COMPOUND} == 1){
Jul 7, 2015 Fasta file output done.
757 print FASTA "*Compound ";
758 }
759 }
760 #get the first ssr index just so we can get the sequence
Jul 7, 2015 Printing fasta files and other flat files done. Collapsing compound s…
761 print FASTA ")\n";
762 print FASTA "$seq\n";
Jul 7, 2015 Fasta file output done.
763
764 }
765 close FASTA;
766
767 }
Jul 7, 2015 Back to basics. Control flow
768 ################################################################
Jul 7, 2015 Statistics being calculated and output.
769 sub calculate_stats{
Jul 7, 2015 primer3 input file being generated now
770
Jul 7, 2015 Statistics being calculated and output.
771 $SEQ_w_SSRS = keys %CONTIG_SSR_STARTS;
772
773 foreach my $ssr_id (keys %SSR_STATS){
774 $SSR_COUNT++;
775
776 if($SSR_STATS{$ssr_id}{COMPOUND} == 1){
777 $SSR_COUNT_COMPOUND++;
778 }
779 else{
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
780 my $motif_len = $SSR_STATS{$ssr_id}{MOTIF_LENGTH} ;
781 #print "motif length is $motif_len\n";
782 $MOTIFLEN{$motif_len}++;
Jul 7, 2015 Statistics being calculated and output.
783
784 my $motifUC = uc($SSR_STATS{$ssr_id}{MOTIF});
785 foreach my $group (keys %MOTIFS) {
786 if($group =~ /\|$motifUC\|/){
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
787 #print "Incrementing $group for $motifUC\n";
Jul 7, 2015 Statistics being calculated and output.
788 $MOTIFS{$group}++;
789 }
790 }
791
792 if($SSR_STATS{$ssr_id}{FORWARD} =~ /\S/){
793 $SSR_COUNT_PRIMER++;
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
794 $MOTIFLEN_w_PRIMERS{$motif_len}++;
Jul 7, 2015 Statistics being calculated and output.
795 }
796 }
797 }
798
799 }
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
800 sub print_stats{
Jul 7, 2015 Statistics being calculated and output.
801 my $stats_out = $_[0]; # file name
802
803 open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n";
804
805 print OUTS 'SSR Summary Report\n';
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
806 print OUTS "Analysis of $SEQ_COUNT sequences\n";
Jul 7, 2015 Statistics being calculated and output.
807 print OUTS "$TIME\n";
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
808 print OUTS "\n";
Jul 7, 2015 Statistics being calculated and output.
809 print OUTS "Number of sequences with at least one SSR\t$SEQ_w_SSRS\n";
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
810 print OUTS "Number of SSRs identified\t$SSR_COUNT\n";
811 print OUTS "\n";
812 print OUTS "Number of compound SSRs*: $SSR_COUNT_COMPOUND\n";
813 print OUTS "Number of SSRs with primers**: $SSR_COUNT_PRIMER\n";
814 print OUTS "\n";
815 print OUTS "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n";
816 print OUTS "**No primers are designed for compound SSRs\n";
Jul 7, 2015 Statistics being calculated and output.
817 print OUTS "\n";
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
818 print OUTS "Parameters used for identifying SSRS:\n";
Jul 7, 2015 Statistics being calculated and output.
819 print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n";
820 print OUTS "--------------------------------------\n";
821 print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n";
822 print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n";
823 print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n";
824 print OUTS "\n";
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
825 print OUTS "Chart of motif pattern frequence (compound SSRs excluded)\n";
Jul 7, 2015 Statistics being calculated and output.
826 print OUTS "Motif Patterns\tNumber of SSRs Found\n";
827 print OUTS "--------------------------------------\n";
828 my $group;
829 foreach $group (sort {length $a <=> length $b} keys %MOTIFS){
830 $group =~ s/^|//;
831 $group =~ s/|$//;
832 print OUTS "$group\t$MOTIFS{$group}\n";
833 }
834 print OUTS "\n";
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
835 print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n";
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
836 print OUTS "Motif Pattern Length\tNumber of SSRs\n";
Jul 7, 2015 Statistics being calculated and output.
837 print OUTS "--------------------------------------\n";
838
839 foreach $group (sort keys %MOTIFLEN){
840 print OUTS "$group\t$MOTIFLEN{$group}\n";
841 }
842
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
843 print OUTS "\n";
844 print OUTS "SSRS with Primers \n";
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
845 print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n";
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
846 print OUTS "Motif Pattern Length\tNumber of SSRs\n";
Jul 7, 2015 Statistics being calculated and output.
847 print OUTS "--------------------------------------\n";
848
849 foreach $group (sort keys %MOTIFLEN_w_PRIMERS){
850 print OUTS "$group\t$MOTIFLEN_w_PRIMERS{$group}\n";
851 }
852
853
854 close OUTS;
855
856 }
Jul 7, 2015 parse primer3 output working again
857
Jul 7, 2015 Back to basics. Control flow
858 ################################################################
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
859
860 sub create_excel_file{
861 my $ssr_xlsx = shift;
862 my $project = shift;
863
864 # Create an excel workbook
865 my $workbook = Excel::Writer::XLSX->new("$ssr_xlsx");
866
867 # Setup the formats that will be necessary for the excel spreadsheet
868 my %header = (font => 'Calibri',
869 size => 12,
870 bold => 1,
871 color => 'black',
872 align => 'left',
873 text_wrap => 1);
874
875 my %text = (font => 'Calibri',
876 size => 12,
877 color => 'black',
878 align => 'left',
879 text_wrap => 1);
880
881 #add the formats to the workbook
882 my $header_format = $workbook->add_format(%header);
883 my $text_format = $workbook->add_format(%text);
884
Jul 7, 2015 Everything working including excel data sheets.
885 my $worksheet_stats = create_stats_worksheet($workbook, $header_format, $text_format, $project);
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
886
Jul 7, 2015 Everything working including excel data sheets.
887 build_data_worksheets($workbook, $header_format, $text_format);
888
889 $worksheet_stats->activate();
890 $worksheet_stats->select();
891 $workbook->close();
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
892
893
894 }
895
896 sub create_stats_worksheet{
897 my $workbook = shift;
898 my $header_format = shift;
899 my $text_format = shift;
900 my $project = shift;
901
902 my $worksheet = $workbook->add_worksheet("Summary");
903
904 ## set all cells to text format
905 ## only cells that need the header format will need to specify the format during write
906 ## set column widths
907 $worksheet->set_column('A:A', 75, $text_format);
908 $worksheet->set_column('B:B', 30, $text_format);
909 $worksheet->set_column('C:C', 30, $text_format);
910
911 $worksheet->write('A1', "SSR Summary Report for $project", $header_format);
912 $worksheet->write('A2', "Analysis of $SEQ_COUNT sequences");
913 $worksheet->write('A3', "$TIME");
914
915 $worksheet->write('A5', "Number of sequences with at least one SSR");
916 $worksheet->write('B5', "$SEQ_w_SSRS");
917
918 $worksheet->write('A6', "Number of SSRs identified");
919 $worksheet->write('B6', "$SSR_COUNT");
920
921 $worksheet->write('A8', "Number of compound SSRs*:");
922 $worksheet->write('B8', "$SSR_COUNT_COMPOUND");
923
924 $worksheet->write('A9', "Number of SSRs with primers**");
925 $worksheet->write('B9', "$SSR_COUNT_PRIMER");
926
927 $worksheet->write('A11', "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n");
928
929 $worksheet->write('A12', "**No primers are designed for compound SSRs\n");
930
931 $worksheet->write('A14', "Parameters used for identifying SSRS:\n", $header_format);
932
933 $worksheet->write('A15','Base Pairs in Motif', $header_format);
934 $worksheet->write('B15','Min # Reps', $header_format);
935 $worksheet->write('C15','Max # Reps', $header_format);
936
937 $worksheet->write('A16','2 (Dinucleotides)');
938 $worksheet->write('B16',"$MIN_REPS_2bp");
939 $worksheet->write('C16',"$MAX_REPS_2bp");
940
941 $worksheet->write('A17','3 (Trinucleotides)');
942 $worksheet->write('B17',"$MIN_REPS_3bp");
943 $worksheet->write('C17',"$MAX_REPS_3bp");
944
945 $worksheet->write('A18','4 (Tetranucleotides)');
946 $worksheet->write('B18',"$MIN_REPS_4bp");
947 $worksheet->write('C18',"$MAX_REPS_4bp");
948
949 ##----------------------------------------------------------
950 ##Chart of motif pattern frequence (compound SSRs excluded)
951
952 $worksheet->write('A20','Chart of motif pattern frequence (compound SSRs excluded)', $header_format);
953 $worksheet->write('A21','Motif Patterns', $header_format);
954 $worksheet->write('B21','Number of SSRs Found', $header_format);
955 my $group;
956 my $i = 21;
957 foreach $group (sort {length $a <=> length $b} keys %MOTIFS){
958 $group =~ s/^|//;
959 $group =~ s/|$//;
960 $i++;
961 $worksheet->write("A$i", $group);
962 $worksheet->write("B$i", $MOTIFS{$group});
963 }
964
965 ##----------------------------------------------------------
966 ## Chart of motif pattern length frequence (compound SSRs excluded)
967 $i++;
968 $i++;
969 $worksheet->write("A$i", "Chart of motif pattern length frequence (compound SSRs excluded)", $header_format);
970 $i++;
971 $worksheet->write("A$i",'Motif Pattern Length', $header_format);
972 $worksheet->write("B$i",'Number of SSRs Found', $header_format);
973 foreach $group (sort keys %MOTIFLEN){
974 $i++;
975 $worksheet->write("A$i", "$group bp");
976 $worksheet->write("B$i", $MOTIFLEN{$group});
977 }
978
979 ##----------------------------------------------------------
980 ## SSRs w primers:
981 ## Chart of motif pattern length frequence (compound SSRs excluded)
982 $i++;
983 $i++;
984 $worksheet->write("A$i",'SSRs with Primers', $header_format);
985 $i++;
986 $worksheet->write("A$i",'Chart of motif pattern length frequence (compound SSRs excluded)', $header_format);
987 $i++;
988 $worksheet->write("A$i",'Motif Pattern Length', $header_format);
989 $worksheet->write("B$i",'Number of SSRs Found', $header_format);
990 foreach $group (sort keys %MOTIFLEN_w_PRIMERS){
991 $i++;
992 $worksheet->write("A$i", "$group bp");
993 $worksheet->write("B$i", $MOTIFLEN_w_PRIMERS{$group});
994 }
995
Jul 7, 2015 Everything working including excel data sheets.
996 return $worksheet;
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
997 }
998
Jul 7, 2015 Everything working including excel data sheets.
999 ##############################################################
1000 sub build_data_worksheets{
1001 my $workbook = shift;
1002 my $header_format = shift;
1003 my $text_format = shift;
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
1004
Jul 7, 2015 Everything working including excel data sheets.
1005 my $di_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Dinucleotides");
1006 my $tri_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Trinucleotides");
1007 my $tetra_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Tetranucleotides");
1008
1009 my $di_index = 3;
1010 my $tri_index = 3;
1011 my $tetra_index = 3;
1012
1013 foreach my $ssr_id (keys %SSR_STATS){
1014 # for excel data files, only print SSRs
1015 # that have primers
1016 if($SSR_STATS{$ssr_id}{COMPOUND} == 0 &&
1017 $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/
1018 ){
1019 if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){
1020 _print_excel_file_line($di_worksheet, $di_index, $ssr_id);
1021 $di_index++;
1022 }
1023 elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){
1024 _print_excel_file_line($tri_worksheet, $tri_index, $ssr_id);
1025 $tri_index++;
1026 }
1027 elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){
1028 _print_excel_file_line($tetra_worksheet, $tetra_index, $ssr_id);
1029 $tetra_index++;
1030 }
1031 }
1032 }
1033
1034
1035 }
1036
1037 ##############################################################
1038 sub _initiate_worksheet{
1039 my $workbook = $_[0];
1040 my $header_format = $_[1];
1041 my $text_format = $_[2];
1042 my $name = $_[3];
1043
1044 my $worksheet = $workbook->add_worksheet($name);
1045 $worksheet->set_column('A:A', 60, $text_format);
1046 $worksheet->set_column('B:E', 10, $text_format);
1047 $worksheet->set_column('F:G', 30, $text_format);
1048 $worksheet->set_column('H:J', 10, $text_format);
1049
1050 $worksheet->write('A1', "$name with primers", $header_format);
1051 $worksheet->write('A2', 'SSR ID', $header_format);
1052 $worksheet->write('B2', 'Motif', $header_format);
1053 $worksheet->write('C2', '# Repeats', $header_format);
1054 $worksheet->write('D2', 'Start', $header_format);
1055 $worksheet->write('E2', 'End', $header_format);
1056 $worksheet->write('F2', 'Forward Primer', $header_format);
1057 $worksheet->write('G2', 'Reverse Primer', $header_format);
1058 $worksheet->write('H2', 'Forward Tm', $header_format);
1059 $worksheet->write('I2', 'Reverse Tm', $header_format);
1060 $worksheet->write('J2', 'Fragment Size', $header_format);
1061
1062 return $worksheet;
1063 }
Jul 7, 2015 Lots of fixes, excel summary sheet being produced.
1064
Jul 7, 2015 fixed lots of errors and bugs in stats and output files. excel not wo…
1065
Jan 23, 2014 @mestato adding post assembly script
1066 ################################################################
Jul 7, 2015 Everything working including excel data sheets.
1067 sub _print_excel_file_line{
1068 my $worksheet = shift;
1069 my $index = shift;
1070 my $ssr_id = shift;
1071
1072 $worksheet->write("A$index", $ssr_id);
1073 $worksheet->write("B$index", $SSR_STATS{$ssr_id}{MOTIF});
1074 $worksheet->write("C$index", $SSR_STATS{$ssr_id}{NO_REPEATS});
1075 $worksheet->write("D$index", $SSR_STATS{$ssr_id}{START});
1076 $worksheet->write("E$index", $SSR_STATS{$ssr_id}{END});
1077 $worksheet->write("F$index", $SSR_STATS{$ssr_id}{FORWARD});
1078 $worksheet->write("G$index", $SSR_STATS{$ssr_id}{REVERSE});
1079 $worksheet->write("H$index", $SSR_STATS{$ssr_id}{LEFT_TM});
1080 $worksheet->write("I$index", $SSR_STATS{$ssr_id}{RIGHT_TM});
1081 $worksheet->write("J$index", $SSR_STATS{$ssr_id}{PRODUCT_SIZE});
1082
1083 }
1084
1085 ###############################################################
1086 sub _printUsage {
1087 print "Usage: $0.pl <arguments>";
1088 print qq(
1089 The list of arguments includes:
1090
1091 -f|--fasta_file <fasta_file>
1092 Required. The file of the sequences to be searched.
1093
1094 -m|--masked_file <masked_fasta_file>
1095 Required. A soft-masked version of the fasta file (soft masked means low
1096 complexity sequences are in lower case bases.)
1097
1098 -p|--project "project name"
1099 Optional. A project name for use in the Excel output.
1100
1101 );
1102 print "\n";
1103 return;
1104 }
1105
1106
1107 1;