diff --git a/hwg_gssr_scripts/findSSRs.pl b/hwg_gssr_scripts/findSSRs.pl new file mode 100755 index 0000000..f4cebbc --- /dev/null +++ b/hwg_gssr_scripts/findSSRs.pl @@ -0,0 +1,515 @@ +#!/usr/bin/env perl +################################################################################ +# Author: Meg Staton & Stephen Ficklin +# Date: Jan 14th, 2014 +# Version: 1 +# +# DESCRIPTION +# ----------- +# This script identifies simple sequence repeats (SSRs) from +# the sequences in a fastq formatted file. +# +# Dependencies: +# ------------ +# Perl must have access to the package Getopt::Long, available from +# CPAN. +# +# Usage: +# ----- +# Usage: findSSRs.pl +# +# The list of arguments includes: +# +# -f|--fastq_file +# Required. The file of the sequences to be searched. +# +# -a|--fasta_format +# Optional. The two output sequence files will be in fasta format +# instead of fastq format. +# +# Output: +# ------ +# Four output files are produced: +# +# .ssr.fastq (or fasta) +# A fastq (or fasta file if -a is specified) with sequences with a +# single identified SSR. +# +# .multissr.fastq (or fasta) +# A fastq (or fasta file if -a is specified) with sequences with more +# than one identified SSR. +# +# .ssr_stats.txt +# A text file of statistics about the SSRs discovered. +# +# .ssr_report.txt +# A tab-delimited file with each SSR. The columns are sequence name, +# motif, number of repeats, start position and end position. +# +# +# Details: +# ------- +# By default the script finds: +# 2 bp motifs repeated from 8 to 40 times, +# 3 bp motifs repeated from 7 to 30 times, +# 4 bp motifs repeated from 6 to 20 times, +# +# The script only reports SSRs that are not within 15 bases of either +# end of the sequence, in order to allow for primer design. +# +# These parameters may be changed in the "GLOBAL PARAMETERS" part of +# the script. +# + +use strict; + +#------------------------------------------------------------------------------- +# DEPENDENCIES + +use Getopt::Long; + +#------------------------------------------------------------------------------- +# GLOBAL PARAMETERS +#------------------------------------------------------------------------------- + +# REPEAT IDENTIFICATION PARAMETERS +# Specify Motif Frequency + +# Motifs that occur less frequently than indicated below will be ignored. +our $MIN_REPS_2bp = 8; +our $MIN_REPS_3bp = 7; +our $MIN_REPS_4bp = 6; + +# Motifs that occur more frequently than indicated below will be ignored. +our $MAX_REPS_2bp = 40; +our $MAX_REPS_3bp = 30; +our $MAX_REPS_4bp = 20; + +# SSRs at the beginning or end of a sequence prevent flanking primer design. +# This is how close we will allow an SSR to be to the ends of the sequence. +our $LENGTH_FROM_END = 15; + +#------------------------------------------------------------------------------- +# GLOBAL HASHES +#------------------------------------------------------------------------------- + +# For counting the number of times each motif is found +my %MOTIFS = ('|AT|TA|' => 0, + '|AG|GA|CT|TC|' => 0, + '|AC|CA|TG|GT|' => 0, + '|GC|CG|' => 0, + + '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0, + '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0, + '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0, + + '|CCA|CAC|CCA|TGG|GTG|TGG|' => 0, + '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0, + '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0, + + '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0, + '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0, + '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0, + '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0); + +# For counting the number of times each motif length is found +my %MOTIFLEN = ('2' => 0, + '3' => 0, + '4' => 0); + + +# Set up the Motif specifications, based on the chosen motif types:. +my @MOTIF_SPECS; +push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']); +push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']); +push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']); + +my $SSR_COUNT = 0; # total number of SSRs found in sequences +my %STATS = (); # stores information about each SSR (motif, motif length, etc) +my %SEQ_SSR_LOC = (); # stores the start positions of SSRs found in each sequence +my $SEQ_COUNT = 0; # counts the number of sequences seen + + +#------------------------------------------------------------------------------- +# EXECUTE +#------------------------------------------------------------------------------- +main(); +#------------------------------------------------------------------------------- +# PUBLIC SUBROUTINES +#------------------------------------------------------------------------------- +# Function Name: main() + +sub main{ + my $fastq_file; # holds the command line parameter for fastq file name + my $fasta_flag = 0; # whether to print fastq or fasta files, set on command line + + Getopt::Long::Configure ('bundling'); + GetOptions('f|fastq_file=s' => \$fastq_file, + 'a|fasta_format' => \$fasta_flag); + + ## Check that all required parameters have been included + if(!$fastq_file){ print "A fastq file is required.\n"; _printUsage(); exit;} + + ## Check that fastq file exists + if(! -e $fastq_file) { print "Fasta file $fastq_file does not exist\n"; exit; } + + + print "finding SSRs...\n"; + my $ssr_out = "$fastq_file.ssr_report.txt"; # the tab-delimited output file + getContigHash($fastq_file, $ssr_out); + + print "stats...\n"; + my $stats_out = "$fastq_file.ssr_stats.txt"; # file of statistics + printStats($stats_out); + + if($fasta_flag){ + my $fasta_out = "$fastq_file.ssr.fasta"; # fasta file of sequences with 1 SSR + my $fasta_out_multi = "$fastq_file.multissr.fasta"; # fasta file of sequences with >1 SSR + print "printing fasta file..."; + printFasta($fastq_file, $fasta_out, $fasta_out_multi); + print "done.\n"; + } + else{ + my $fastq_out = "$fastq_file.ssr.fastq"; # fastq file of sequences with 1 SSR + my $fastq_out_multi = "$fastq_file.multissr.fastq"; # fastq file of sequences with >1 SSR + print "printing fastq file..."; + printFastq($fastq_file, $fastq_out, $fastq_out_multi); + print "done.\n"; + } + +} + +############################################################### + +sub getContigHash{ + my $fastq_file = $_[0]; # file name + my $ssr_out = $_[1]; # file name + + open (OUT, ">".$ssr_out) || die "ERROR cannot open $_\n"; + my $out_fh = *OUT; + + open (IN, $fastq_file) || die "Can't open $fastq_file\n"; + my @aux = undef; + my ($seqname, $seqstr, $qual); + while (($seqname, $seqstr, $qual) = readfq(\*IN, \@aux)) { + $SEQ_COUNT++; + process_seq($seqname, $seqstr, $out_fh); + } + +} + +############################################################### + +sub process_seq{ + my $contig_name = shift; + my $seq = shift; + my $out_fh = shift; + + + my %seen; # used to keep track of start positions we've already seen + my $index; # used to iterate through the 2D motif specs array + + ## LOOPB + # iterate through the motif specifications + for $index (0 .. (scalar @MOTIF_SPECS - 1)) { + + # holds the motif size (1,2,3,4,5 or 6) + my $motifLength = $MOTIF_SPECS[$index][0]; + # holds the minimum number of repeats + my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1; + # holds the maximum number of repeats + my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1; + # the regular expression for looking for SSRs + my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,})"; + + ## LOOPC + # run through the sequence and check for this motif spec + while ($seq =~ /$regex/ig) { + # Get the ssr and motif that were found + my $ssr = $1; + my $motif = lc $2; + + ## initialize the repeats varaible to (motifLength - 1) + ## or 1 if motif length - 1 equals 0; + my $repeats = $motifLength - 1; + $repeats = 1 if(!$repeats); + + my $noRepeats = ((length $ssr)/$motifLength); + my $ssrEnd = pos $seq; + my $ssrStart = ($ssrEnd - (length $ssr) + 1); + + ## LOOP D - check to make sure the motif isn't the same letter over and over again AND we don't have too many repeats + + if (!($motif =~ /([gatc])\1{$repeats}/) && $noRepeats <= $max_number_of_repeats) { + + ## LOOPE + # Only store the information if we have never + # seen this starting position before. + if (!exists $seen{$contig_name."_ssr".$ssrStart}) { + + # LOOP F + # Check to make sure the SSR is not within $LENGTH_FROM_END bp of either end of the sequence + my $seqLen = length $seq; + if($ssrStart >= $LENGTH_FROM_END && $ssrEnd <= ($seqLen-$LENGTH_FROM_END)){ + + ## FOUND A SSR TO REPORT + my $ssr_id = $contig_name."_ssr".$ssrStart; + $seen{$ssr_id} = 1; + + if(exists $SEQ_SSR_LOC{$contig_name}){ + push @{ $SEQ_SSR_LOC{$contig_name} }, $ssrStart; + } + else{ + $SEQ_SSR_LOC{$contig_name} = [$ssrStart]; + } + + $SSR_COUNT++; + printf $out_fh ("$contig_name\t$motif\t$noRepeats\t$ssrStart\t$ssrEnd\n"); + # Store STATS + $STATS{$ssr_id}{MOTIF} = $motif; + $STATS{$ssr_id}{START} = $ssrStart; + $STATS{$ssr_id}{END} = $ssrEnd; + $STATS{$ssr_id}{MOTIF_LENGTH} = $motifLength; + $STATS{$ssr_id}{NO_REPEATS} = $noRepeats; + + my $motiflen = length $motif; + my $tmp = $MOTIFLEN{$motiflen}; + $tmp++; + $MOTIFLEN{$motiflen} = $tmp; + + # Increment motif count + foreach my $group (keys %MOTIFS) { + my $motifUC = uc($motif); + if($group =~ /\|$motifUC\|/){ + my $tmp = $MOTIFS{$group}++; + $tmp++; + $MOTIFS{$group} = $tmp; + } + }# end foreach $group + } ## LOOP F + + ## LOOPE + } + + # LOOPD + } + + ## LOOPC + } # end while (sequence =~ /$regex/ig); + + ## LOOPB + } # end for $index (0 .. (scalar @{$MOTIF_SPECS} - 1)) + +} + +################################################################ +sub readfq { + my ($fh, $aux) = @_; + @$aux = [undef, 0] if (!(@$aux)); + return if ($aux->[1]); + if (!($aux->[0])) { + while (<$fh>) { + chomp; + if (substr($_, 0, 1) eq '>' || substr($_, 0, 1) eq '@') { + $aux->[0] = $_; + last; + } + } + if (!($aux->[0])) { + $aux->[1] = 1; + return; + } + } + my $name = /^.(\S+)/? $1 : ''; + my $seq = ''; + my $c; + $aux->[0] = undef; + while (<$fh>) { + chomp; + $c = substr($_, 0, 1); + last if ($c eq '>' || $c eq '@' || $c eq '+'); + $seq .= $_; + } + $aux->[0] = $_; + $aux->[1] = 1 if (!($aux->[0])); + return ($name, $seq) if ($c ne '+'); + my $qual = ''; + while (<$fh>) { + chomp; + $qual .= $_; + if (length($qual) >= length($seq)) { + $aux->[0] = undef; + return ($name, $seq, $qual); + } + } + $aux->[1] = 1; + return ($name, $seq); +} + + +################################################################ +sub printStats{ + my $stats_out = $_[0]; # file name + + open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n"; + + my $time = scalar localtime; # Get the current time + + ## count number of seqs with single or multiple SSRs + my $SINGLE_SSR_COUNT = 0; + my $MULTI_SSR_COUNT = 0; + + foreach my $contig_name (keys %SEQ_SSR_LOC){ + my $starts = scalar @{ $SEQ_SSR_LOC{$contig_name} }; + if($starts == 1){ $SINGLE_SSR_COUNT++;} + elsif($starts > 1){ $MULTI_SSR_COUNT++; } + } + + print OUTS 'SSR Summary Report\n'; + print OUTS "Analsis of $SEQ_COUNT sequences\n"; + print OUTS "$time\n"; + print OUTS "Number of SSRs identified\t$SSR_COUNT\n"; + print OUTS "Number of sequences with 1 SSR: $SINGLE_SSR_COUNT\n"; + print OUTS "Number of sequences with more than one SSR: $MULTI_SSR_COUNT\n"; + print OUTS "\n"; + print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n"; + print OUTS "--------------------------------------\n"; + print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n"; + print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n"; + print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n"; + print OUTS "\n"; + print OUTS "Motif Patterns\tNumber of SSRs Found\n"; + print OUTS "--------------------------------------\n"; + my $group; + foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ + $group =~ s/^|//; + $group =~ s/|$//; + print OUTS "$group\t$MOTIFS{$group}\n"; + } + print OUTS "\n"; + print OUTS "Motif Pattern Length\tNumber of SSRs Found\n"; + print OUTS "--------------------------------------\n"; + + foreach $group (sort keys %MOTIFLEN){ + print OUTS "$group\t$MOTIFLEN{$group}\n"; + } + + close OUTS; + +} + +sub printFasta{ + my $fastq_file = $_[0]; # file name + my $fasta_out = $_[1]; # file name + my $fasta_out_multi = $_[2]; # file name + + my $line; + my $name; + + open (IN, $fastq_file) || die "ERROR cannot open $_\n"; + open (OUTF, ">".$fasta_out) || die "ERROR cannot open $_\n"; + open (OUTFM, ">".$fasta_out_multi) || die "ERROR cannot open $_\n"; + + while($line = ){ + if($line =~ /^@(\S+)/){ + $name = $1; + if(exists $SEQ_SSR_LOC{$name}){ + my $ssr_count = scalar @{ $SEQ_SSR_LOC{$name} }; + if($ssr_count == 1){ + print OUTF ">$name\n"; + my $seq = ; + print OUTF "$seq\n"; + $line = ; + $line = ; + } + else{ + print OUTFM ">$name\n"; + my $seq = ; + print OUTFM "$seq\n"; + $line = ; + $line = ; + } + } + else{ + $line = ; + $line = ; + $line = ; + } + } + } + + close IN; + close OUTF; + close OUTFM; + +} + +sub printFastq{ + my $fastq_file = $_[0]; # file name + my $fastq_out = $_[1]; # file name + my $fastq_out_multi = $_[2]; # file name + + my $line; + my $name; + + open (IN, $fastq_file) || die "ERROR cannot open $_\n"; + open (OUTF, ">".$fastq_out) || die "ERROR cannot open $_\n"; + open (OUTFM, ">".$fastq_out_multi) || die "ERROR cannot open $_\n"; + + while($line = ){ + if($line =~ /^@(\S+)/){ + $name = $1; + if(exists $SEQ_SSR_LOC{$name}){ + my $ssr_count = scalar @{ $SEQ_SSR_LOC{$name} }; + if($ssr_count == 1){ + print OUTF $line; + $line = ; + print OUTF $line; + $line = ; + print OUTF $line; + $line = ; + print OUTF $line; + } + else{ + print OUTFM $line; + $line = ; + print OUTFM $line; + $line = ; + print OUTFM $line; + $line = ; + print OUTFM $line; + } + } + else{ + $line = ; + $line = ; + $line = ; + } + } + } + + close IN; + close OUTF; + close OUTFM; + +} + +sub _printUsage { + print "Usage: $0 "; + print qq( + The list of arguments includes: + + -f|--fastq_file + Required. The file of the sequences to be searched. + + -a|--fasta_format + Optional. The two output sequence files will be in fasta format + instead of fastq format. + + ); + print "\n"; + return; +} + + +1; diff --git a/hwg_gssr_scripts/find_dup_begins.pl b/hwg_gssr_scripts/find_dup_begins.pl new file mode 100755 index 0000000..5120bbe --- /dev/null +++ b/hwg_gssr_scripts/find_dup_begins.pl @@ -0,0 +1,214 @@ +#!/usr/bin/env perl +################################################################################ +# Author: Meg Staton +# Date: Jan 23rd, 2014 +# Version: 1 +# +# DESCRIPTION +# ----------- +# This script identifies forward and reverse reads with the same beginning bases. +# +# Dependencies: +# ------------ +# Perl must have access to the package Getopt::Long, available from +# CPAN. +# +# Usage: +# ----- +# Usage: find_dup_reads.pl -f -r +# +# The list of arguments includes: +# +# -f|--forward_fastq_file +# Required. The file of the forward direction sequences to be searched. +# +# -r|--reverse_fastq_file +# Required. The file of the reverse direction sequences to be searched. +# +# Output: +# ------ +# Four output files are produced: +# +# .uniq +# A fastq file with forward sequences that do not share duplicated beginning +# with the corresponding reverse read. +# +# .dup +# A fastq file with forward sequences that do share duplicated beginning +# with the corresponding reverse read. +# +# .uniq +# A fastq file with reverse sequences that do not share duplicated beginning +# with the corresponding forward read. +# +# .dup +# A fastq file with reverse sequences that do share duplicated beginning +# with the corresponding forward read. +# +# Details: +# ------- +# This script compares from base 0 to base 20 of the forward read against +# base 0 to base 20 of the reverse read. In order to detect duplications +# where a base has been miscalled, it also compares subsequences from base +# 10 to base 30 and from base 20 to base 40. If any of these subsequnces +# from both the forward and the reverse read is identical it kicks the reads +# out into the duplicate file output. All other reads go into the unique +# file output. +# +# The length of the subsequence samples, currently set at 20, can be modified +# with the $LEN variable below. +# +# Main caveat: if the duplicated region is shifted by even one base in either +# direction, this script will not detect it. +# +# +#--------------------------------------------------------------------------------- +use strict; + +#------------------------------------------------------------------------------- +# DEPENDENCIES + +use Getopt::Long; + +#------------------------------------------------------------------------------- +# GLOBAL PARAMETERS +#------------------------------------------------------------------------------- +my $LEN = 20; + +my $for_file; +my $rev_file; + +Getopt::Long::Configure ('bundling'); +GetOptions('f|forward=s' => \$for_file, + 'r|reverse=s' => \$rev_file); + +## Check that all required parameters have been included +if(!$for_file){ print "Forward fastq file is required.\n"; _printUsage(); exit;} +if(!$rev_file){ print "Reverse fastq file is required.\n"; _printUsage(); exit;} + +## Check that fastq file exists +if(! -e $for_file) { print "Forward fastq file $for_file does not exist\n"; exit; } +if(! -e $rev_file) { print "Reverse fastq file $rev_file does not exist\n"; exit; } + + +my %seq; +my $line; +open IN, $for_file; +while($line = ){ + $line =~ /\@(\S+)/; + my $name = $1; + # store the first of the four fastq lines + $seq{$name}{FIRST} = $line; + my $seq = ; + + # store the second of the four fastq lines + $seq{$name}{SEQ} = $seq; + + # store the third of the four fastq lines + $seq{$name}{THIRD} = ; + + # store the fourth of the four fastq lines + $seq{$name}{FOURTH} = ; +} +close IN; + + +my $count = 0; +my $dups = 0; + +open IN, $rev_file; +open OUT_1_OK, ">".$for_file.".uniq"; +open OUT_2_OK, ">".$rev_file.".uniq"; +open OUT_1_DUP, ">".$for_file.".dup"; +open OUT_2_DUP, ">".$rev_file.".dup"; + +while($line = ){ + $count++; + $line =~ /\@(\S+)/; + my $name = $1; + + # get the rest of the lines associated with this fastq record + my $seq = ; + my $third = ; + my $fourth = ; + + # get a substring of the sequence characters starting from base 0 + # length determined by global var $LEN + my $rev_begin = substr($seq, 0, $LEN); + + # get a substring of the sequence characters starting from base 10 + # length determined by global var $LEN + my $rev_begin2 = substr($seq, 10, $LEN); + + # get a substring of the sequence characters starting from base 20 + # length determined by global var $LEN + my $rev_begin3 = substr($seq, 20, $LEN); + + if(!exists $seq{$name}){ + print "Can't find $name forward seq\n"; + } + else{ + # get the same three substrings from the forward read + my $for_begin = substr($seq{$name}{SEQ}, 0, $LEN); + my $for_begin2 = substr($seq{$name}{SEQ}, 10, $LEN); + my $for_begin3 = substr($seq{$name}{SEQ}, 20, $LEN); + + ## if any of the substrings between the forward and reverse sequences match, + ## its a duplicate that needs to be filtered. + if(($for_begin eq $rev_begin) || ($for_begin2 eq $rev_begin2) || ($for_begin3 eq $rev_begin3)){ + ## found duplicated bases at begining of r1 and r2 + $dups++; + print OUT_1_DUP $seq{$name}{FIRST}; + print OUT_1_DUP $seq{$name}{SEQ}; + print OUT_1_DUP $seq{$name}{THIRD}; + print OUT_1_DUP $seq{$name}{FOURTH}; + + print OUT_2_DUP $line; + print OUT_2_DUP $seq; + print OUT_2_DUP $third; + print OUT_2_DUP $fourth; + + } + else{ + ## r1 and r2 look uniq + print OUT_1_OK $seq{$name}{FIRST}; + print OUT_1_OK $seq{$name}{SEQ}; + print OUT_1_OK $seq{$name}{THIRD}; + print OUT_1_OK $seq{$name}{FOURTH}; + + print OUT_2_OK $line; + print OUT_2_OK $seq; + print OUT_2_OK $third; + print OUT_2_OK $fourth; + + } + } +} +close IN; + + +my $pct = ($dups/$count)*100; +my $uniq2 = $count - $dups; +print "total pairs examined: $count\n"; +print "pairs with dup beginning: $dups\n"; +print "pairs with uniq beginning: $uniq2\n"; +print "pct: $pct\n"; + + +################################################################################ +sub _printUsage { + print qq( +Usage: $0 -f -r + +The list of arguments includes: + +-f|--forward +Required. The file of the forward direction sequences to be searched. + +-r|--reverse +Required. The file of the reverse direction sequences to be searched. + + ); + print "\n"; + return; +}