Permalink
Newer
Older
100755 516 lines (430 sloc) 16.5 KB
Jan 23, 2014 @mestato hwg gssr scripts
1 #!/usr/bin/env perl
2 ################################################################################
3 # Author: Meg Staton & Stephen Ficklin
4 # Date: Jan 14th, 2014
5 # Version: 1
6 #
7 # DESCRIPTION
8 # -----------
9 # This script identifies simple sequence repeats (SSRs) from
10 # the sequences in a fastq formatted file.
11 #
12 # Dependencies:
13 # ------------
14 # Perl must have access to the package Getopt::Long, available from
15 # CPAN.
16 #
17 # Usage:
18 # -----
19 # Usage: findSSRs.pl <arguments>
20 #
21 # The list of arguments includes:
22 #
23 # -f|--fastq_file <fastq_file>
24 # Required. The file of the sequences to be searched.
25 #
26 # -a|--fasta_format
27 # Optional. The two output sequence files will be in fasta format
28 # instead of fastq format.
29 #
30 # Output:
31 # ------
32 # Four output files are produced:
33 #
34 # <input-file-name>.ssr.fastq (or fasta)
35 # A fastq (or fasta file if -a is specified) with sequences with a
36 # single identified SSR.
37 #
38 # <input-file-name>.multissr.fastq (or fasta)
39 # A fastq (or fasta file if -a is specified) with sequences with more
40 # than one identified SSR.
41 #
42 # <input-file-name>.ssr_stats.txt
43 # A text file of statistics about the SSRs discovered.
44 #
45 # <input-file-name>.ssr_report.txt
46 # A tab-delimited file with each SSR. The columns are sequence name,
47 # motif, number of repeats, start position and end position.
48 #
49 #
50 # Details:
51 # -------
52 # By default the script finds:
53 # 2 bp motifs repeated from 8 to 40 times,
54 # 3 bp motifs repeated from 7 to 30 times,
55 # 4 bp motifs repeated from 6 to 20 times,
56 #
57 # The script only reports SSRs that are not within 15 bases of either
58 # end of the sequence, in order to allow for primer design.
59 #
60 # These parameters may be changed in the "GLOBAL PARAMETERS" part of
61 # the script.
62 #
63
64 use strict;
65
66 #-------------------------------------------------------------------------------
67 # DEPENDENCIES
68
69 use Getopt::Long;
70
71 #-------------------------------------------------------------------------------
72 # GLOBAL PARAMETERS
73 #-------------------------------------------------------------------------------
74
75 # REPEAT IDENTIFICATION PARAMETERS
76 # Specify Motif Frequency
77
78 # Motifs that occur less frequently than indicated below will be ignored.
79 our $MIN_REPS_2bp = 8;
80 our $MIN_REPS_3bp = 7;
81 our $MIN_REPS_4bp = 6;
82
83 # Motifs that occur more frequently than indicated below will be ignored.
84 our $MAX_REPS_2bp = 40;
85 our $MAX_REPS_3bp = 30;
86 our $MAX_REPS_4bp = 20;
87
88 # SSRs at the beginning or end of a sequence prevent flanking primer design.
89 # This is how close we will allow an SSR to be to the ends of the sequence.
90 our $LENGTH_FROM_END = 15;
91
92 #-------------------------------------------------------------------------------
93 # GLOBAL HASHES
94 #-------------------------------------------------------------------------------
95
96 # For counting the number of times each motif is found
97 my %MOTIFS = ('|AT|TA|' => 0,
98 '|AG|GA|CT|TC|' => 0,
99 '|AC|CA|TG|GT|' => 0,
100 '|GC|CG|' => 0,
101
102 '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0,
103 '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0,
104 '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0,
105
106 '|CCA|CAC|CCA|TGG|GTG|TGG|' => 0,
107 '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0,
108 '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0,
109
110 '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0,
111 '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0,
112 '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0,
113 '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0);
114
115 # For counting the number of times each motif length is found
116 my %MOTIFLEN = ('2' => 0,
117 '3' => 0,
118 '4' => 0);
119
120
121 # Set up the Motif specifications, based on the chosen motif types:.
122 my @MOTIF_SPECS;
123 push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']);
124 push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']);
125 push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']);
126
127 my $SSR_COUNT = 0; # total number of SSRs found in sequences
128 my %STATS = (); # stores information about each SSR (motif, motif length, etc)
129 my %SEQ_SSR_LOC = (); # stores the start positions of SSRs found in each sequence
130 my $SEQ_COUNT = 0; # counts the number of sequences seen
131
132
133 #-------------------------------------------------------------------------------
134 # EXECUTE
135 #-------------------------------------------------------------------------------
136 main();
137 #-------------------------------------------------------------------------------
138 # PUBLIC SUBROUTINES
139 #-------------------------------------------------------------------------------
140 # Function Name: main()
141
142 sub main{
143 my $fastq_file; # holds the command line parameter for fastq file name
144 my $fasta_flag = 0; # whether to print fastq or fasta files, set on command line
145
146 Getopt::Long::Configure ('bundling');
147 GetOptions('f|fastq_file=s' => \$fastq_file,
148 'a|fasta_format' => \$fasta_flag);
149
150 ## Check that all required parameters have been included
151 if(!$fastq_file){ print "A fastq file is required.\n"; _printUsage(); exit;}
152
153 ## Check that fastq file exists
154 if(! -e $fastq_file) { print "Fasta file $fastq_file does not exist\n"; exit; }
155
156
157 print "finding SSRs...\n";
158 my $ssr_out = "$fastq_file.ssr_report.txt"; # the tab-delimited output file
159 getContigHash($fastq_file, $ssr_out);
160
161 print "stats...\n";
162 my $stats_out = "$fastq_file.ssr_stats.txt"; # file of statistics
163 printStats($stats_out);
164
165 if($fasta_flag){
166 my $fasta_out = "$fastq_file.ssr.fasta"; # fasta file of sequences with 1 SSR
167 my $fasta_out_multi = "$fastq_file.multissr.fasta"; # fasta file of sequences with >1 SSR
168 print "printing fasta file...";
169 printFasta($fastq_file, $fasta_out, $fasta_out_multi);
170 print "done.\n";
171 }
172 else{
173 my $fastq_out = "$fastq_file.ssr.fastq"; # fastq file of sequences with 1 SSR
174 my $fastq_out_multi = "$fastq_file.multissr.fastq"; # fastq file of sequences with >1 SSR
175 print "printing fastq file...";
176 printFastq($fastq_file, $fastq_out, $fastq_out_multi);
177 print "done.\n";
178 }
179
180 }
181
182 ###############################################################
183
184 sub getContigHash{
185 my $fastq_file = $_[0]; # file name
186 my $ssr_out = $_[1]; # file name
187
188 open (OUT, ">".$ssr_out) || die "ERROR cannot open $_\n";
189 my $out_fh = *OUT;
190
191 open (IN, $fastq_file) || die "Can't open $fastq_file\n";
192 my @aux = undef;
193 my ($seqname, $seqstr, $qual);
194 while (($seqname, $seqstr, $qual) = readfq(\*IN, \@aux)) {
195 $SEQ_COUNT++;
196 process_seq($seqname, $seqstr, $out_fh);
197 }
198
199 }
200
201 ###############################################################
202
203 sub process_seq{
204 my $contig_name = shift;
205 my $seq = shift;
206 my $out_fh = shift;
207
208
209 my %seen; # used to keep track of start positions we've already seen
210 my $index; # used to iterate through the 2D motif specs array
211
212 ## LOOPB
213 # iterate through the motif specifications
214 for $index (0 .. (scalar @MOTIF_SPECS - 1)) {
215
216 # holds the motif size (1,2,3,4,5 or 6)
217 my $motifLength = $MOTIF_SPECS[$index][0];
218 # holds the minimum number of repeats
219 my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1;
220 # holds the maximum number of repeats
221 my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1;
222 # the regular expression for looking for SSRs
223 my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,})";
224
225 ## LOOPC
226 # run through the sequence and check for this motif spec
227 while ($seq =~ /$regex/ig) {
228 # Get the ssr and motif that were found
229 my $ssr = $1;
230 my $motif = lc $2;
231
232 ## initialize the repeats varaible to (motifLength - 1)
233 ## or 1 if motif length - 1 equals 0;
234 my $repeats = $motifLength - 1;
235 $repeats = 1 if(!$repeats);
236
237 my $noRepeats = ((length $ssr)/$motifLength);
238 my $ssrEnd = pos $seq;
239 my $ssrStart = ($ssrEnd - (length $ssr) + 1);
240
241 ## LOOP D - check to make sure the motif isn't the same letter over and over again AND we don't have too many repeats
242
243 if (!($motif =~ /([gatc])\1{$repeats}/) && $noRepeats <= $max_number_of_repeats) {
244
245 ## LOOPE
246 # Only store the information if we have never
247 # seen this starting position before.
248 if (!exists $seen{$contig_name."_ssr".$ssrStart}) {
249
250 # LOOP F
251 # Check to make sure the SSR is not within $LENGTH_FROM_END bp of either end of the sequence
252 my $seqLen = length $seq;
253 if($ssrStart >= $LENGTH_FROM_END && $ssrEnd <= ($seqLen-$LENGTH_FROM_END)){
254
255 ## FOUND A SSR TO REPORT
256 my $ssr_id = $contig_name."_ssr".$ssrStart;
257 $seen{$ssr_id} = 1;
258
259 if(exists $SEQ_SSR_LOC{$contig_name}){
260 push @{ $SEQ_SSR_LOC{$contig_name} }, $ssrStart;
261 }
262 else{
263 $SEQ_SSR_LOC{$contig_name} = [$ssrStart];
264 }
265
266 $SSR_COUNT++;
267 printf $out_fh ("$contig_name\t$motif\t$noRepeats\t$ssrStart\t$ssrEnd\n");
268 # Store STATS
269 $STATS{$ssr_id}{MOTIF} = $motif;
270 $STATS{$ssr_id}{START} = $ssrStart;
271 $STATS{$ssr_id}{END} = $ssrEnd;
272 $STATS{$ssr_id}{MOTIF_LENGTH} = $motifLength;
273 $STATS{$ssr_id}{NO_REPEATS} = $noRepeats;
274
275 my $motiflen = length $motif;
276 my $tmp = $MOTIFLEN{$motiflen};
277 $tmp++;
278 $MOTIFLEN{$motiflen} = $tmp;
279
280 # Increment motif count
281 foreach my $group (keys %MOTIFS) {
282 my $motifUC = uc($motif);
283 if($group =~ /\|$motifUC\|/){
284 my $tmp = $MOTIFS{$group}++;
285 $tmp++;
286 $MOTIFS{$group} = $tmp;
287 }
288 }# end foreach $group
289 } ## LOOP F
290
291 ## LOOPE
292 }
293
294 # LOOPD
295 }
296
297 ## LOOPC
298 } # end while (sequence =~ /$regex/ig);
299
300 ## LOOPB
301 } # end for $index (0 .. (scalar @{$MOTIF_SPECS} - 1))
302
303 }
304
305 ################################################################
306 sub readfq {
307 my ($fh, $aux) = @_;
308 @$aux = [undef, 0] if (!(@$aux));
309 return if ($aux->[1]);
310 if (!($aux->[0])) {
311 while (<$fh>) {
312 chomp;
313 if (substr($_, 0, 1) eq '>' || substr($_, 0, 1) eq '@') {
314 $aux->[0] = $_;
315 last;
316 }
317 }
318 if (!($aux->[0])) {
319 $aux->[1] = 1;
320 return;
321 }
322 }
323 my $name = /^.(\S+)/? $1 : '';
324 my $seq = '';
325 my $c;
326 $aux->[0] = undef;
327 while (<$fh>) {
328 chomp;
329 $c = substr($_, 0, 1);
330 last if ($c eq '>' || $c eq '@' || $c eq '+');
331 $seq .= $_;
332 }
333 $aux->[0] = $_;
334 $aux->[1] = 1 if (!($aux->[0]));
335 return ($name, $seq) if ($c ne '+');
336 my $qual = '';
337 while (<$fh>) {
338 chomp;
339 $qual .= $_;
340 if (length($qual) >= length($seq)) {
341 $aux->[0] = undef;
342 return ($name, $seq, $qual);
343 }
344 }
345 $aux->[1] = 1;
346 return ($name, $seq);
347 }
348
349
350 ################################################################
351 sub printStats{
352 my $stats_out = $_[0]; # file name
353
354 open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n";
355
356 my $time = scalar localtime; # Get the current time
357
358 ## count number of seqs with single or multiple SSRs
359 my $SINGLE_SSR_COUNT = 0;
360 my $MULTI_SSR_COUNT = 0;
361
362 foreach my $contig_name (keys %SEQ_SSR_LOC){
363 my $starts = scalar @{ $SEQ_SSR_LOC{$contig_name} };
364 if($starts == 1){ $SINGLE_SSR_COUNT++;}
365 elsif($starts > 1){ $MULTI_SSR_COUNT++; }
366 }
367
368 print OUTS 'SSR Summary Report\n';
369 print OUTS "Analsis of $SEQ_COUNT sequences\n";
370 print OUTS "$time\n";
371 print OUTS "Number of SSRs identified\t$SSR_COUNT\n";
372 print OUTS "Number of sequences with 1 SSR: $SINGLE_SSR_COUNT\n";
373 print OUTS "Number of sequences with more than one SSR: $MULTI_SSR_COUNT\n";
374 print OUTS "\n";
375 print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n";
376 print OUTS "--------------------------------------\n";
377 print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n";
378 print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n";
379 print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n";
380 print OUTS "\n";
381 print OUTS "Motif Patterns\tNumber of SSRs Found\n";
382 print OUTS "--------------------------------------\n";
383 my $group;
384 foreach $group (sort {length $a <=> length $b} keys %MOTIFS){
385 $group =~ s/^|//;
386 $group =~ s/|$//;
387 print OUTS "$group\t$MOTIFS{$group}\n";
388 }
389 print OUTS "\n";
390 print OUTS "Motif Pattern Length\tNumber of SSRs Found\n";
391 print OUTS "--------------------------------------\n";
392
393 foreach $group (sort keys %MOTIFLEN){
394 print OUTS "$group\t$MOTIFLEN{$group}\n";
395 }
396
397 close OUTS;
398
399 }
400
401 sub printFasta{
402 my $fastq_file = $_[0]; # file name
403 my $fasta_out = $_[1]; # file name
404 my $fasta_out_multi = $_[2]; # file name
405
406 my $line;
407 my $name;
408
409 open (IN, $fastq_file) || die "ERROR cannot open $_\n";
410 open (OUTF, ">".$fasta_out) || die "ERROR cannot open $_\n";
411 open (OUTFM, ">".$fasta_out_multi) || die "ERROR cannot open $_\n";
412
413 while($line = <IN>){
414 if($line =~ /^@(\S+)/){
415 $name = $1;
416 if(exists $SEQ_SSR_LOC{$name}){
417 my $ssr_count = scalar @{ $SEQ_SSR_LOC{$name} };
418 if($ssr_count == 1){
419 print OUTF ">$name\n";
420 my $seq = <IN>;
421 print OUTF "$seq\n";
422 $line = <IN>;
423 $line = <IN>;
424 }
425 else{
426 print OUTFM ">$name\n";
427 my $seq = <IN>;
428 print OUTFM "$seq\n";
429 $line = <IN>;
430 $line = <IN>;
431 }
432 }
433 else{
434 $line = <IN>;
435 $line = <IN>;
436 $line = <IN>;
437 }
438 }
439 }
440
441 close IN;
442 close OUTF;
443 close OUTFM;
444
445 }
446
447 sub printFastq{
448 my $fastq_file = $_[0]; # file name
449 my $fastq_out = $_[1]; # file name
450 my $fastq_out_multi = $_[2]; # file name
451
452 my $line;
453 my $name;
454
455 open (IN, $fastq_file) || die "ERROR cannot open $_\n";
456 open (OUTF, ">".$fastq_out) || die "ERROR cannot open $_\n";
457 open (OUTFM, ">".$fastq_out_multi) || die "ERROR cannot open $_\n";
458
459 while($line = <IN>){
460 if($line =~ /^@(\S+)/){
461 $name = $1;
462 if(exists $SEQ_SSR_LOC{$name}){
463 my $ssr_count = scalar @{ $SEQ_SSR_LOC{$name} };
464 if($ssr_count == 1){
465 print OUTF $line;
466 $line = <IN>;
467 print OUTF $line;
468 $line = <IN>;
469 print OUTF $line;
470 $line = <IN>;
471 print OUTF $line;
472 }
473 else{
474 print OUTFM $line;
475 $line = <IN>;
476 print OUTFM $line;
477 $line = <IN>;
478 print OUTFM $line;
479 $line = <IN>;
480 print OUTFM $line;
481 }
482 }
483 else{
484 $line = <IN>;
485 $line = <IN>;
486 $line = <IN>;
487 }
488 }
489 }
490
491 close IN;
492 close OUTF;
493 close OUTFM;
494
495 }
496
497 sub _printUsage {
498 print "Usage: $0 <arguments>";
499 print qq(
500 The list of arguments includes:
501
502 -f|--fastq_file <fastq_file>
503 Required. The file of the sequences to be searched.
504
505 -a|--fasta_format
506 Optional. The two output sequence files will be in fasta format
507 instead of fastq format.
508
509 );
510 print "\n";
511 return;
512 }
513
514
515 1;