Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| #!/usr/bin/env perl | |
| ################################################################################ | |
| # Author: Meg Staton & Stephen Ficklin | |
| # | |
| # DESCRIPTION | |
| # ----------- | |
| # This script identifies simple sequence repeats (SSRs) and calls primers from | |
| # the sequences in a fastq formatted file. | |
| # | |
| # Dependencies: | |
| # ------------ | |
| # Perl must have access to the packages: | |
| # Getopt::Long | |
| # Bio::SeqIO | |
| # Excel::Writer::XLSX | |
| # All are available from CPAN. | |
| # | |
| # Also path to the primer3 executable and primer3 config files must be specified | |
| # in the global variables section of the script. | |
| # | |
| # Usage: | |
| # ----- | |
| # Usage: findSSRs.pl <arguments> | |
| # | |
| # The list of arguments includes: | |
| # | |
| # -f|--fasta_file <fasta_file> | |
| # Required. The file of the sequences to be searched. | |
| # | |
| # -m|--masked_file <masked_fasta_file> | |
| # Required. A soft-masked version of the fasta file (soft masked means low | |
| # complexity sequences are in lower case bases.) | |
| # | |
| # Output: | |
| # ------ | |
| # <input-file-name>.ssr.fasta | |
| # A fasta file with sequences with a SSR. (Sequences with compound SSRs are included) | |
| # | |
| # <input-file-name>.ssr_stats.txt | |
| # A text file of statistics about the SSRs discovered. | |
| # | |
| # <input-file-name>.ssr_report.txt | |
| # A tab-delimited file with each SSR. The columns are SSR ID, | |
| # motif, number of repeats, start position, end position. | |
| # | |
| # <input-file-name>.ssr_report.xlsx | |
| # A excel file with SSR results and stats | |
| # | |
| # <input-file-name>.di_primer_report.txt | |
| # <input-file-name>.tri_primer_report.txt | |
| # <input-file-name>.tetra_primer_report.txt | |
| # Tab-delimited files with sequences with a specified SSR motif length. Columns are | |
| # SSR ID, motif, number of repeats, start position, end position, left primer, | |
| # right primer, left primer Tm, right primer Tm, amplicon size | |
| # | |
| # Details: | |
| # ------- | |
| # By default the script finds: | |
| # 2 bp motifs repeated from 8 to 200 times, | |
| # 3 bp motifs repeated from 7 to 133 times, | |
| # 4 bp motifs repeated from 6 to 100 times, | |
| # | |
| # These parameters may be changed in the "GLOBAL PARAMETERS" part of | |
| # the script. | |
| # | |
| # Compound SSRs are defined as any SSRs that abut or are less than 15 bases | |
| # apart. These are essentially compound SSRs for the purposes of mapping | |
| # because it is unlikely that primers can be designed between the repeat | |
| # segments. | |
| # | |
| use strict; | |
| #------------------------------------------------------------------------------- | |
| # DEPENDENCIES | |
| #------------------------------------------------------------------------------- | |
| use Getopt::Long; | |
| use Bio::SeqIO; | |
| use Excel::Writer::XLSX; | |
| #------------------------------------------------------------------------------- | |
| # GLOBAL PARAMETERS | |
| #------------------------------------------------------------------------------- | |
| #-------------- | |
| # REPEAT IDENTIFICATION PARAMETERS | |
| # Specify Motif Frequency | |
| # Motifs that occur less frequently than indicated below will be ignored. | |
| # A 0 indicates that this motif length will be ignored. | |
| our $MIN_REPS_2bp = 8; | |
| our $MIN_REPS_3bp = 7; | |
| our $MIN_REPS_4bp = 6; | |
| our $MAX_REPS_2bp = 200; | |
| our $MAX_REPS_3bp = 133; | |
| our $MAX_REPS_4bp = 100; | |
| #------------ | |
| # PRIMER PARAMETERS | |
| my $PRIMER3 = "/lustre/projects/staton/software/primer3-2.3.6/src/primer3_core"; | |
| my $PRIMER3_CONFIG = "/lustre/projects/staton/software/primer3-2.3.6/src/primer3_config/"; | |
| my $PRIMER_OPT_SIZE="20"; # default 20 | |
| my $PRIMER_MIN_SIZE="18"; # default 18 | |
| my $PRIMER_MAX_SIZE="27"; # default 27 | |
| my $PRIMER_NUM_NS_ACCEPTED = "0"; # default 0 | |
| my $PRIMER_PRODUCT_SIZE_RANGE = "100-450"; | |
| my $PRIMER_OPT_TM = "60.0"; | |
| my $PRIMER_MIN_TM = "55.0"; | |
| my $PRIMER_MAX_TM = "65.0"; | |
| my $PRIMER_MIN_GC = "40"; | |
| my $PRIMER_MAX_GC = "60"; | |
| my $PRIMER_MAX_POLY_X = "3"; | |
| my $PRIMER_GC_CLAMP = "2"; | |
| my $PRIMER_LOWERCASE_MASKING = 1; | |
| #------------------------------------------------------------------------------- | |
| # GLOBAL HASHES | |
| #------------------------------------------------------------------------------- | |
| # This makes life much easier than passing a bunch of hash refs all over the place. | |
| # | |
| #------------------------------------------------------------------------------- | |
| # Data structures: | |
| ## CONTIG_SSR_STARTS structure: | |
| ## key: contig_name | |
| ## value: array of starts of SSRs in that contig | |
| my %CONTIG_SSR_STARTS = (); | |
| ## SSR_STATS structure: | |
| ## key: ssr_id | |
| ## value -> keys: MOTIF START END MOTIF_LENGTH NO_REPEATS | |
| my %SSR_STATS = (); | |
| my $SEQ_COUNT = 0; | |
| # Set up the Motif specifications, based on the chosen motif types:. | |
| my @MOTIF_SPECS; | |
| push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']); | |
| push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']); | |
| push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']); | |
| #------------------------------------------------------------------------------- | |
| # Generating statistics | |
| my $TIME = scalar localtime; # Get the current time | |
| my $SEQ_w_SSRS = 0; | |
| my $SSR_COUNT = 0; | |
| my $SSR_COUNT_COMPOUND = 0; | |
| my $SSR_COUNT_PRIMER = 0; | |
| my %MOTIFLEN = (2 => 0, | |
| 3 => 0, | |
| 4 => 0); | |
| my %MOTIFLEN_w_PRIMERS = (2 => 0, | |
| 3 => 0, | |
| 4 => 0); | |
| my %MOTIFS = ('|AT|TA|' => 0, | |
| '|AG|GA|CT|TC|' => 0, | |
| '|AC|CA|TG|GT|' => 0, | |
| '|GC|CG|' => 0, | |
| '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0, | |
| '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0, | |
| '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0, | |
| '|CCA|CAC|ACC|TGG|GTG|GGT|' => 0, | |
| '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0, | |
| '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0, | |
| '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0, | |
| '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0, | |
| '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0, | |
| '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0); | |
| #------------------------------------------------------------------------------- | |
| # EXECUTE | |
| #------------------------------------------------------------------------------- | |
| main(); | |
| #------------------------------------------------------------------------------- | |
| # PUBLIC SUBROUTINES | |
| #------------------------------------------------------------------------------- | |
| # Function Name: main() | |
| sub main{ | |
| ##--------------------------------------------------------------- | |
| ## Get input parameters | |
| my $fasta_file; | |
| my $masked_file; | |
| my $project; | |
| Getopt::Long::Configure ('bundling'); | |
| GetOptions('f|fasta_file=s' => \$fasta_file, | |
| 'm|masked_file=s' => \$masked_file, | |
| 'p|project=s' => \$project); | |
| ## Check that all required parameters have been included | |
| if(!$fasta_file){ print "A fasta file is required.\n"; _printUsage(); exit;} | |
| if(!$masked_file){ print "A masked file is required.\n"; _printUsage(); exit;} | |
| ## Check that fasta file exists | |
| if(! -e $fasta_file) { print "Fasta file $fasta_file does not exist\n"; exit; } | |
| if(! -e $masked_file) { print "Masked file $masked_file does not exist\n"; exit; } | |
| ##--------------------------------------------------------------- | |
| ## Set up output files | |
| my $p3_input = "$fasta_file.p3in.txt"; | |
| my $p3_output = "$fasta_file.p3out.txt"; | |
| my $ssr_out = "$fasta_file.ssr_report.txt"; | |
| my $fasta_out = "$fasta_file.ssr.fasta"; | |
| my $stats_out = "$fasta_file.ssr_stats.txt"; | |
| my $di_primer_out = "$fasta_file.di_primer_report.txt"; | |
| my $tri_primer_out = "$fasta_file.tri_primer_report.txt"; | |
| my $tetra_primer_out = "$fasta_file.tetra_primer_report.txt"; | |
| my $ssr_xlsx = "$fasta_file.ssr_report.xlsx"; | |
| ##--------------------------------------------------------------- | |
| print "finding SSRs...\n"; | |
| process_file($fasta_file, $masked_file); | |
| collapse_compound_ssrs(); | |
| flag_multiSSRs(); | |
| print "done.\n"; | |
| ##--------------------------------------------------------------- | |
| print "running primer3...\n"; | |
| addToPrimer3InputFile ($p3_input); | |
| print "$PRIMER3 < $p3_input > $p3_output\n"; | |
| my $status = system("$PRIMER3 < $p3_input > $p3_output"); | |
| print "$status\n"; | |
| parseP3_output($p3_output); | |
| print "done.\n"; | |
| ##--------------------------------------------------------------- | |
| ## Producing output - Fasta files and flat files | |
| print "printing output files..."; | |
| create_flat_files($ssr_out, $di_primer_out, $tri_primer_out, $tetra_primer_out); | |
| create_fasta_file($fasta_out); | |
| ##--------------------------------------------------------------- | |
| ## Producing output - statistics | |
| calculate_stats(); | |
| print_stats($stats_out); | |
| ##--------------------------------------------------------------- | |
| ## Producing output - Excel | |
| create_excel_file($ssr_xlsx, $project); | |
| print "done.\n"; | |
| } | |
| ############################################################### | |
| sub process_file{ | |
| my $fasta_file = $_[0]; # file name | |
| my $masked_file = $_[1]; # file name | |
| my $seqio = Bio::SeqIO->new('-format' => 'fasta', -file => $fasta_file); | |
| my $seqioM = Bio::SeqIO->new('-format' => 'fasta', -file => $masked_file); | |
| # Get seq obj from io stream | |
| while(my $seqobj = $seqio->next_seq){ | |
| my $seqobjM = $seqioM->next_seq; | |
| $SEQ_COUNT++; | |
| my $seqname = $seqobj->id; # get actual sequence as a string | |
| my $seqnameM = $seqobjM->id; # get actual sequence as a string | |
| my $seqstr = $seqobj->seq(); # get actual sequence as a string | |
| my $seqstrM = $seqobjM->seq(); # get actual sequence as a string | |
| if($seqname ne $seqnameM){ | |
| die "masked sequence $seqnameM not in same order as regular sequence $seqname\n"; | |
| } | |
| process_seq($seqname, $seqstr, $seqstrM); | |
| } | |
| } | |
| ############################################################### | |
| sub process_seq{ | |
| my $contig_name = shift; | |
| my $seq = shift; | |
| my $seq_masked = shift; | |
| my %seen; # used to keep track of start positions we've already seen | |
| my $index; # used to iterate through the 2D motif specs array | |
| ## LOOPB | |
| # iterate through the motif specifications | |
| for $index (0 .. (scalar @MOTIF_SPECS - 1)) { | |
| # holds the motif size (1,2,3,4,5 or 6) | |
| my $motifLength = $MOTIF_SPECS[$index][0]; | |
| # holds the minimum number of repeats | |
| my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1; | |
| # holds the maximum number of repeats | |
| my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1; | |
| # the regular expression for looking for SSRs | |
| my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,$max_number_of_repeats})"; | |
| # run through the sequence and check for this motif spec | |
| while ($seq =~ /$regex/ig) { | |
| # Get the ssr and motif that were found | |
| my $ssr = $1; | |
| my $motif = lc $2; | |
| my $start_index = $-[0]; | |
| my $end_index = $+[0]; | |
| if(quality_check_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq)){ | |
| process_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq, $seq_masked); | |
| } | |
| } | |
| } | |
| } | |
| ############################################################### | |
| sub quality_check_ssr{ | |
| my $contig_name = shift; | |
| my $ssr = shift; | |
| my $motif = shift; | |
| my $start_index = shift; | |
| my $end_index = shift; | |
| my $seq = shift; | |
| ##------------------------------------- | |
| ## CHECKS to see if this is a good ssr | |
| my $flag_same_base = 0; | |
| my $flag_already_seen = 0; | |
| ## Check #1 | |
| ## ignore SSRs that are the same base repeated | |
| if ($ssr !~ /^g+$/i && | |
| $ssr !~ /^a+$/i && | |
| $ssr !~ /^c+$/i && | |
| $ssr !~ /^t+$/i ) { | |
| $flag_same_base = 1; | |
| } | |
| # Check #2 | |
| # Make sure this isn't an already called SSR in disguise | |
| # (a dinucleotide repeat posing as a tetranucleotide repeat, for instance) | |
| if (!exists $SSR_STATS{$contig_name."_ssr".$start_index} && | |
| !exists $SSR_STATS{$contig_name."_ssr".($start_index-1)} && | |
| !exists $SSR_STATS{$contig_name."_ssr".($start_index-2)} && | |
| !exists $SSR_STATS{$contig_name."_ssr".($start_index+1)} && | |
| !exists $SSR_STATS{$contig_name."_ssr".($start_index+2)} | |
| ) { | |
| $flag_already_seen = 1; | |
| } | |
| if($flag_same_base && $flag_already_seen){ | |
| return 1; | |
| } | |
| else{ | |
| return 0; | |
| } | |
| } | |
| ###################################################### | |
| sub process_ssr{ | |
| my $contig_name = shift; | |
| my $ssr = shift; | |
| my $motif = shift; | |
| my $start_index = shift; | |
| my $end_index = shift; | |
| my $seq = shift; | |
| my $seq_masked = shift; | |
| ##------------------------------------- | |
| ## generate a few more stats and variables | |
| my $motif_len = length $motif; | |
| my $num_of_repeats = ($end_index-$start_index)/$motif_len; | |
| my $ssr_id = $contig_name."_ssr".$start_index; | |
| ##------------------------------------- | |
| ## store in data structures | |
| if(exists $CONTIG_SSR_STARTS{$contig_name}){ | |
| push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index; | |
| } | |
| else{ | |
| $CONTIG_SSR_STARTS{$contig_name} = [$start_index]; | |
| } | |
| $SSR_STATS{$ssr_id}{MOTIF} = $motif; | |
| $SSR_STATS{$ssr_id}{START} = $start_index; | |
| $SSR_STATS{$ssr_id}{END} = $end_index; | |
| $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motif_len; | |
| $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats; | |
| $SSR_STATS{$ssr_id}{SEQ} = $seq; | |
| $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; | |
| $SSR_STATS{$ssr_id}{COMPOUND} = 0; #assume its not until proven otherwise | |
| } | |
| ####################################################### | |
| sub collapse_compound_ssrs{ | |
| foreach my $contig (keys %CONTIG_SSR_STARTS){ | |
| my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; | |
| if(@starts > 1){ | |
| ## this contig has multiple ssrs | |
| my $previous_start = -1; | |
| my $previous_end = -1; | |
| my $previous_ssr_id = ""; | |
| foreach my $current_start (sort {$a <=> $b} @starts){ | |
| my $current_ssr_id = $contig."_ssr".$current_start; | |
| my $current_end = $SSR_STATS{$current_ssr_id}{END}; | |
| if(too_close($previous_start, $previous_end, $current_start, $current_end)){ | |
| collapse($contig, $previous_ssr_id, $current_ssr_id); | |
| } | |
| $previous_start = $current_start; | |
| $previous_end = $current_end; | |
| $previous_ssr_id = $current_ssr_id; | |
| } | |
| } | |
| } | |
| } | |
| ################################################################ | |
| sub too_close{ | |
| my $previous_start = shift; | |
| my $previous_end = shift; | |
| my $current_start = shift; | |
| my $current_end = shift; | |
| # if start is a -1, then go ahead and return ok | |
| if($previous_start == -1){ | |
| return 0; | |
| } | |
| # we want to know if they overlap, abut or are less than 15 bases apart | |
| elsif($previous_end >= $current_start || | |
| ($current_start - $previous_end) < 15){ | |
| #print "$previous_start - $previous_end, $current_start - $current_end\n"; | |
| return 1; | |
| } | |
| else{ | |
| return 0; | |
| } | |
| } | |
| ################################################################ | |
| sub collapse{ | |
| my $contig = shift; | |
| my $first_ssr_id = shift; | |
| my $second_ssr_id = shift; | |
| my $second_ssr_start = $SSR_STATS{$second_ssr_id}{START}; | |
| ##fix SSR_STATS | |
| $SSR_STATS{$first_ssr_id}{MOTIF} = "COMPOUND"; | |
| $SSR_STATS{$first_ssr_id}{END} = $SSR_STATS{$second_ssr_id}{END}; | |
| $SSR_STATS{$first_ssr_id}{MOTIF_LENGTH} = "COMPOUND"; | |
| $SSR_STATS{$first_ssr_id}{NO_REPEATS} = "COMPOUND"; | |
| $SSR_STATS{$first_ssr_id}{COMPOUND} = 1; #assume its not until proven otherwise | |
| delete $SSR_STATS{$second_ssr_id}{MOTIF}; | |
| delete $SSR_STATS{$second_ssr_id}{START}; | |
| delete $SSR_STATS{$second_ssr_id}{END}; | |
| delete $SSR_STATS{$second_ssr_id}{MOTIF_LENGTH}; | |
| delete $SSR_STATS{$second_ssr_id}{NO_REPEATS}; | |
| delete $SSR_STATS{$second_ssr_id}{SEQ}; | |
| delete $SSR_STATS{$second_ssr_id}{SEQM}; | |
| delete $SSR_STATS{$second_ssr_id}{COMPOUND}; | |
| undef %{$SSR_STATS{$second_ssr_id}}; | |
| delete $SSR_STATS{$second_ssr_id}; | |
| print "\tdeleting $second_ssr_id, part of compound ssr\n"; | |
| if(exists $SSR_STATS{$second_ssr_id}){ print "\t still exists\n";} | |
| ##fix CONTIG_SSR_STARTS | |
| # get rid of the start index for the second ssr (it is now part of the | |
| # first ssr) | |
| my $index = 0; | |
| $index++ until $CONTIG_SSR_STARTS{$contig}[$index] == $second_ssr_start; | |
| splice(@{$CONTIG_SSR_STARTS{$contig}}, $index, 1); | |
| } | |
| ################################################################ | |
| sub flag_multiSSRs{ | |
| # adds a MULTI flag to the data hash indicating if the | |
| # ssr is the only one in the sequence or one of many | |
| foreach my $contig (keys %CONTIG_SSR_STARTS){ | |
| my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; | |
| if(@starts == 1){ | |
| ## this contig has only one ssr | |
| my $start_index = $starts[0]; | |
| my $ssr_id = $contig."_ssr".$start_index; | |
| $SSR_STATS{$ssr_id}{MULTI} = 0; | |
| } | |
| else{ | |
| ## this contig has multiple ssrs | |
| foreach my $start_index (@starts){ | |
| my $ssr_id = $contig."_ssr".$start_index; | |
| $SSR_STATS{$ssr_id}{MULTI} = 1; | |
| } | |
| } | |
| } | |
| close FASTA; | |
| } | |
| ################################################################ | |
| sub addToPrimer3InputFile{ | |
| my $p3_file = shift; | |
| open OUT, ">$p3_file"; | |
| foreach my $ssr_id (keys %SSR_STATS){ | |
| #skip compound SSRS | |
| if($SSR_STATS{$ssr_id}{COMPOUND} == 0){ | |
| my $ssrStart = $SSR_STATS{$ssr_id}{START}; | |
| my $ssrEnd = $SSR_STATS{$ssr_id}{END}; | |
| my $seq = $SSR_STATS{$ssr_id}{SEQM}; | |
| # change from soft mask to hard mask | |
| $seq =~ s/[actg]/N/g; | |
| my $len = $ssrEnd-$ssrStart; | |
| printf OUT ("SEQUENCE_ID=$ssr_id\n"); | |
| printf OUT ("SEQUENCE_TEMPLATE=$seq\n"); | |
| printf OUT ("SEQUENCE_TARGET=$ssrStart,$len\n"); | |
| printf OUT ("PRIMER_TASK=generic\n"); | |
| printf OUT ("PRIMER_PICK_LEFT_PRIMER=1\n"); | |
| printf OUT ("PRIMER_PICK_INTERNAL_OLIGO=0\n"); | |
| printf OUT ("PRIMER_PICK_RIGHT_PRIMER=1\n"); | |
| printf OUT ("PRIMER_OPT_SIZE=$PRIMER_OPT_SIZE\n"); | |
| printf OUT ("PRIMER_MIN_SIZE=$PRIMER_MIN_SIZE\n"); | |
| printf OUT ("PRIMER_MAX_SIZE=$PRIMER_MAX_SIZE\n"); | |
| printf OUT ("PRIMER_NUM_NS_ACCEPTED=$PRIMER_NUM_NS_ACCEPTED\n"); | |
| printf OUT ("PRIMER_PRODUCT_SIZE_RANGE=$PRIMER_PRODUCT_SIZE_RANGE\n"); | |
| printf OUT ("PRIMER_OPT_TM=$PRIMER_OPT_TM\n"); | |
| printf OUT ("PRIMER_MIN_TM=$PRIMER_MIN_TM\n"); | |
| printf OUT ("PRIMER_MAX_TM=$PRIMER_MAX_TM\n"); | |
| printf OUT ("PRIMER_MIN_GC=$PRIMER_MIN_GC\n"); | |
| printf OUT ("PRIMER_MAX_GC=$PRIMER_MAX_GC\n"); | |
| printf OUT ("PRIMER_MAX_POLY_X=$PRIMER_MAX_POLY_X\n"); | |
| printf OUT ("PRIMER_GC_CLAMP=$PRIMER_GC_CLAMP\n"); | |
| printf OUT ("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=$PRIMER3_CONFIG\n"); | |
| printf OUT ("PRIMER_LOWERCASE_MASKING=$PRIMER_LOWERCASE_MASKING\n"); | |
| printf OUT ("=\n"); | |
| } | |
| } | |
| close OUT; | |
| } | |
| ################################################################ | |
| sub parseP3_output{ | |
| my $p3_output = $_[0]; # file name | |
| # We are going to keep track of a weird phenomenon only seen in one | |
| # project - the generation of identical forward and reverse primers. The | |
| # sequences from this project were overlapping paired ends that were joined. | |
| # Apparently something went wrong and weird sequences were obtained, all of | |
| # which yield the identical primers. | |
| # This is not reported in the final stats, just as part of the standard output. | |
| my $identical_primer_cnt = 0; | |
| # The primers output file separates information about different sequences | |
| # with an equal sign on a single line. So, we want to set the file line | |
| # delimiter (for looping on the input file below) to a single equal sign | |
| # followed by a line feed. This way were guranteed to have all the primer | |
| # information together per line | |
| local $/ = "=\n"; | |
| open (P3O, $p3_output) || die "could not open $_\n"; | |
| # Read in all of the lines of the input file | |
| my $primer_record; | |
| while ($primer_record = <P3O>) { | |
| my $start = ""; | |
| my $seq_id = ""; | |
| my $ssr_id = ""; | |
| my $forward = ""; | |
| my $reverse = ""; | |
| my $product_size = ""; | |
| my $left_tm = ""; | |
| my $right_tm = ""; | |
| if ($primer_record =~ /SEQUENCE_ID=(\S+)/) { | |
| $ssr_id = $1; | |
| } | |
| # get the primary primers only | |
| if ($primer_record =~ /PRIMER_LEFT_0_SEQUENCE=(\S+)/) { | |
| $forward = $1; | |
| } | |
| if ($primer_record =~ /PRIMER_RIGHT_0_SEQUENCE=(\S+)/) { | |
| $reverse = $1; | |
| } | |
| if ($primer_record =~ /PRIMER_LEFT_0_TM=(\S+)/) { | |
| $left_tm = $1; | |
| } | |
| if ($primer_record =~ /PRIMER_RIGHT_0_TM=(\S+)/) { | |
| $right_tm = $1; | |
| } | |
| if ($primer_record =~ /PRIMER_PAIR_0_PRODUCT_SIZE=(\S+)/) { | |
| $product_size = $1; | |
| } | |
| if(length $forward > 1){ | |
| if($forward eq $reverse){ | |
| print "\tFLAG: identical primer problem with $ssr_id\n"; | |
| $identical_primer_cnt++; | |
| } | |
| else{ | |
| $SSR_STATS{$ssr_id}{FORWARD} = $forward; | |
| $SSR_STATS{$ssr_id}{REVERSE} = $reverse; | |
| $SSR_STATS{$ssr_id}{PRODUCT_SIZE} = $product_size; | |
| $SSR_STATS{$ssr_id}{LEFT_TM} = $left_tm; | |
| $SSR_STATS{$ssr_id}{RIGHT_TM} = $right_tm; | |
| } | |
| } | |
| } | |
| print "\ttotal identical primers: $identical_primer_cnt\n"; | |
| } | |
| ################################################### | |
| sub create_flat_files{ | |
| my $ssr_out = shift; | |
| my $di_primer_out = shift; | |
| my $tri_primer_out = shift; | |
| my $tetra_primer_out = shift; | |
| open OUTS, ">$ssr_out"; | |
| open OUT2, ">$di_primer_out"; | |
| open OUT3, ">$tri_primer_out"; | |
| open OUT4, ">$tetra_primer_out"; | |
| my $di_fh = *OUT2; | |
| my $tri_fh = *OUT3; | |
| my $tetra_fh = *OUT4; | |
| ##printer headers | |
| print OUTS join("\t", "SSR ID", | |
| "motif", "number of repeats", "start position", "end position"); | |
| print OUTS "\n"; | |
| print OUT2 join("\t", "SSR ID", | |
| "motif", "number of repeats", "start position", | |
| "end position", "forward primer", "reverse primer", | |
| "forward Tm", "reverse Tm","product size" ); | |
| print OUT2 "\n"; | |
| print OUT3 join("\t", "SSR ID", | |
| "motif", "number of repeats", "start position", | |
| "end position", "forward primer", "reverse primer", | |
| "forward Tm", "reverse Tm","product size" ); | |
| print OUT3 "\n"; | |
| print OUT4 join("\t", "SSR ID", | |
| "motif", "number of repeats", "start position", | |
| "end position", "forward primer", "reverse primer", | |
| "forward Tm", "reverse Tm","product size" ); | |
| print OUT4 "\n"; | |
| foreach my $ssr_id (keys %SSR_STATS){ | |
| ## all ssrs including compound go in main ssr file | |
| print OUTS join("\t", | |
| $ssr_id, | |
| $SSR_STATS{$ssr_id}{MOTIF}, | |
| $SSR_STATS{$ssr_id}{NO_REPEATS}, | |
| $SSR_STATS{$ssr_id}{START}, | |
| $SSR_STATS{$ssr_id}{END}, | |
| ); | |
| print OUTS "\n"; | |
| # for primer flat files, only print SSRs with | |
| # that have primers | |
| if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && | |
| $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ | |
| ){ | |
| if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ | |
| _print_primer_flat_file_line($di_fh, $ssr_id); | |
| } | |
| elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ | |
| _print_primer_flat_file_line($tri_fh, $ssr_id); | |
| } | |
| elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ | |
| _print_primer_flat_file_line($tetra_fh, $ssr_id); | |
| } | |
| } | |
| } | |
| close OUTS; | |
| close OUT2; | |
| close OUT3; | |
| close OUT4; | |
| } | |
| ################################################### | |
| sub _print_primer_flat_file_line{ | |
| my $fh = shift; | |
| my $ssr_id = shift; | |
| print $fh join("\t", $ssr_id, | |
| $SSR_STATS{$ssr_id}{MOTIF}, | |
| $SSR_STATS{$ssr_id}{NO_REPEATS}, | |
| $SSR_STATS{$ssr_id}{START}, | |
| $SSR_STATS{$ssr_id}{END}, | |
| $SSR_STATS{$ssr_id}{FORWARD}, | |
| $SSR_STATS{$ssr_id}{REVERSE}, | |
| $SSR_STATS{$ssr_id}{LEFT_TM}, | |
| $SSR_STATS{$ssr_id}{RIGHT_TM}, | |
| $SSR_STATS{$ssr_id}{PRODUCT_SIZE}, | |
| ); | |
| print $fh "\n"; | |
| } | |
| ################################################### | |
| sub create_fasta_file{ | |
| my $fasta_out = shift; | |
| open FASTA, ">$fasta_out"; | |
| foreach my $contig (keys %CONTIG_SSR_STARTS){ | |
| my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; | |
| my $seq = ""; | |
| print FASTA ">$contig ("; | |
| foreach my $start_index (sort {$a <=> $b} @starts){ | |
| my $ssr_id = $contig."_ssr".$start_index; | |
| #if($SSR_STATS{$ssr_id}{START} >= 0){ | |
| $seq = $SSR_STATS{$ssr_id}{SEQ}; | |
| print FASTA "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} "; | |
| if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ | |
| print FASTA "*Compound "; | |
| } | |
| } | |
| #get the first ssr index just so we can get the sequence | |
| print FASTA ")\n"; | |
| print FASTA "$seq\n"; | |
| } | |
| close FASTA; | |
| } | |
| ################################################################ | |
| sub calculate_stats{ | |
| $SEQ_w_SSRS = keys %CONTIG_SSR_STARTS; | |
| foreach my $ssr_id (keys %SSR_STATS){ | |
| $SSR_COUNT++; | |
| if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ | |
| $SSR_COUNT_COMPOUND++; | |
| } | |
| else{ | |
| my $motif_len = $SSR_STATS{$ssr_id}{MOTIF_LENGTH} ; | |
| #print "motif length is $motif_len\n"; | |
| $MOTIFLEN{$motif_len}++; | |
| my $motifUC = uc($SSR_STATS{$ssr_id}{MOTIF}); | |
| foreach my $group (keys %MOTIFS) { | |
| if($group =~ /\|$motifUC\|/){ | |
| #print "Incrementing $group for $motifUC\n"; | |
| $MOTIFS{$group}++; | |
| } | |
| } | |
| if($SSR_STATS{$ssr_id}{FORWARD} =~ /\S/){ | |
| $SSR_COUNT_PRIMER++; | |
| $MOTIFLEN_w_PRIMERS{$motif_len}++; | |
| } | |
| } | |
| } | |
| } | |
| sub print_stats{ | |
| my $stats_out = $_[0]; # file name | |
| open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n"; | |
| print OUTS 'SSR Summary Report\n'; | |
| print OUTS "Analysis of $SEQ_COUNT sequences\n"; | |
| print OUTS "$TIME\n"; | |
| print OUTS "\n"; | |
| print OUTS "Number of sequences with at least one SSR\t$SEQ_w_SSRS\n"; | |
| print OUTS "Number of SSRs identified\t$SSR_COUNT\n"; | |
| print OUTS "\n"; | |
| print OUTS "Number of compound SSRs*: $SSR_COUNT_COMPOUND\n"; | |
| print OUTS "Number of SSRs with primers**: $SSR_COUNT_PRIMER\n"; | |
| print OUTS "\n"; | |
| print OUTS "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"; | |
| print OUTS "**No primers are designed for compound SSRs\n"; | |
| print OUTS "\n"; | |
| print OUTS "Parameters used for identifying SSRS:\n"; | |
| print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n"; | |
| print OUTS "--------------------------------------\n"; | |
| print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n"; | |
| print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n"; | |
| print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n"; | |
| print OUTS "\n"; | |
| print OUTS "Chart of motif pattern frequence (compound SSRs excluded)\n"; | |
| print OUTS "Motif Patterns\tNumber of SSRs Found\n"; | |
| print OUTS "--------------------------------------\n"; | |
| my $group; | |
| foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ | |
| $group =~ s/^|//; | |
| $group =~ s/|$//; | |
| print OUTS "$group\t$MOTIFS{$group}\n"; | |
| } | |
| print OUTS "\n"; | |
| print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; | |
| print OUTS "Motif Pattern Length\tNumber of SSRs\n"; | |
| print OUTS "--------------------------------------\n"; | |
| foreach $group (sort keys %MOTIFLEN){ | |
| print OUTS "$group\t$MOTIFLEN{$group}\n"; | |
| } | |
| print OUTS "\n"; | |
| print OUTS "SSRS with Primers \n"; | |
| print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; | |
| print OUTS "Motif Pattern Length\tNumber of SSRs\n"; | |
| print OUTS "--------------------------------------\n"; | |
| foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ | |
| print OUTS "$group\t$MOTIFLEN_w_PRIMERS{$group}\n"; | |
| } | |
| close OUTS; | |
| } | |
| ################################################################ | |
| sub create_excel_file{ | |
| my $ssr_xlsx = shift; | |
| my $project = shift; | |
| # Create an excel workbook | |
| my $workbook = Excel::Writer::XLSX->new("$ssr_xlsx"); | |
| # Setup the formats that will be necessary for the excel spreadsheet | |
| my %header = (font => 'Calibri', | |
| size => 12, | |
| bold => 1, | |
| color => 'black', | |
| align => 'left', | |
| text_wrap => 1); | |
| my %text = (font => 'Calibri', | |
| size => 12, | |
| color => 'black', | |
| align => 'left', | |
| text_wrap => 1); | |
| #add the formats to the workbook | |
| my $header_format = $workbook->add_format(%header); | |
| my $text_format = $workbook->add_format(%text); | |
| my $worksheet_stats = create_stats_worksheet($workbook, $header_format, $text_format, $project); | |
| build_data_worksheets($workbook, $header_format, $text_format); | |
| $worksheet_stats->activate(); | |
| $worksheet_stats->select(); | |
| $workbook->close(); | |
| } | |
| sub create_stats_worksheet{ | |
| my $workbook = shift; | |
| my $header_format = shift; | |
| my $text_format = shift; | |
| my $project = shift; | |
| my $worksheet = $workbook->add_worksheet("Summary"); | |
| ## set all cells to text format | |
| ## only cells that need the header format will need to specify the format during write | |
| ## set column widths | |
| $worksheet->set_column('A:A', 75, $text_format); | |
| $worksheet->set_column('B:B', 30, $text_format); | |
| $worksheet->set_column('C:C', 30, $text_format); | |
| $worksheet->write('A1', "SSR Summary Report for $project", $header_format); | |
| $worksheet->write('A2', "Analysis of $SEQ_COUNT sequences"); | |
| $worksheet->write('A3', "$TIME"); | |
| $worksheet->write('A5', "Number of sequences with at least one SSR"); | |
| $worksheet->write('B5', "$SEQ_w_SSRS"); | |
| $worksheet->write('A6', "Number of SSRs identified"); | |
| $worksheet->write('B6', "$SSR_COUNT"); | |
| $worksheet->write('A8', "Number of compound SSRs*:"); | |
| $worksheet->write('B8', "$SSR_COUNT_COMPOUND"); | |
| $worksheet->write('A9', "Number of SSRs with primers**"); | |
| $worksheet->write('B9', "$SSR_COUNT_PRIMER"); | |
| $worksheet->write('A11', "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"); | |
| $worksheet->write('A12', "**No primers are designed for compound SSRs\n"); | |
| $worksheet->write('A14', "Parameters used for identifying SSRS:\n", $header_format); | |
| $worksheet->write('A15','Base Pairs in Motif', $header_format); | |
| $worksheet->write('B15','Min # Reps', $header_format); | |
| $worksheet->write('C15','Max # Reps', $header_format); | |
| $worksheet->write('A16','2 (Dinucleotides)'); | |
| $worksheet->write('B16',"$MIN_REPS_2bp"); | |
| $worksheet->write('C16',"$MAX_REPS_2bp"); | |
| $worksheet->write('A17','3 (Trinucleotides)'); | |
| $worksheet->write('B17',"$MIN_REPS_3bp"); | |
| $worksheet->write('C17',"$MAX_REPS_3bp"); | |
| $worksheet->write('A18','4 (Tetranucleotides)'); | |
| $worksheet->write('B18',"$MIN_REPS_4bp"); | |
| $worksheet->write('C18',"$MAX_REPS_4bp"); | |
| ##---------------------------------------------------------- | |
| ##Chart of motif pattern frequence (compound SSRs excluded) | |
| $worksheet->write('A20','Chart of motif pattern frequence (compound SSRs excluded)', $header_format); | |
| $worksheet->write('A21','Motif Patterns', $header_format); | |
| $worksheet->write('B21','Number of SSRs Found', $header_format); | |
| my $group; | |
| my $i = 21; | |
| foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ | |
| $group =~ s/^|//; | |
| $group =~ s/|$//; | |
| $i++; | |
| $worksheet->write("A$i", $group); | |
| $worksheet->write("B$i", $MOTIFS{$group}); | |
| } | |
| ##---------------------------------------------------------- | |
| ## Chart of motif pattern length frequence (compound SSRs excluded) | |
| $i++; | |
| $i++; | |
| $worksheet->write("A$i", "Chart of motif pattern length frequence (compound SSRs excluded)", $header_format); | |
| $i++; | |
| $worksheet->write("A$i",'Motif Pattern Length', $header_format); | |
| $worksheet->write("B$i",'Number of SSRs Found', $header_format); | |
| foreach $group (sort keys %MOTIFLEN){ | |
| $i++; | |
| $worksheet->write("A$i", "$group bp"); | |
| $worksheet->write("B$i", $MOTIFLEN{$group}); | |
| } | |
| ##---------------------------------------------------------- | |
| ## SSRs w primers: | |
| ## Chart of motif pattern length frequence (compound SSRs excluded) | |
| $i++; | |
| $i++; | |
| $worksheet->write("A$i",'SSRs with Primers', $header_format); | |
| $i++; | |
| $worksheet->write("A$i",'Chart of motif pattern length frequence (compound SSRs excluded)', $header_format); | |
| $i++; | |
| $worksheet->write("A$i",'Motif Pattern Length', $header_format); | |
| $worksheet->write("B$i",'Number of SSRs Found', $header_format); | |
| foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ | |
| $i++; | |
| $worksheet->write("A$i", "$group bp"); | |
| $worksheet->write("B$i", $MOTIFLEN_w_PRIMERS{$group}); | |
| } | |
| return $worksheet; | |
| } | |
| ############################################################## | |
| sub build_data_worksheets{ | |
| my $workbook = shift; | |
| my $header_format = shift; | |
| my $text_format = shift; | |
| my $di_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Dinucleotides"); | |
| my $tri_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Trinucleotides"); | |
| my $tetra_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Tetranucleotides"); | |
| my $di_index = 3; | |
| my $tri_index = 3; | |
| my $tetra_index = 3; | |
| foreach my $ssr_id (keys %SSR_STATS){ | |
| # for excel data files, only print SSRs | |
| # that have primers | |
| if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && | |
| $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ | |
| ){ | |
| if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ | |
| _print_excel_file_line($di_worksheet, $di_index, $ssr_id); | |
| $di_index++; | |
| } | |
| elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ | |
| _print_excel_file_line($tri_worksheet, $tri_index, $ssr_id); | |
| $tri_index++; | |
| } | |
| elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ | |
| _print_excel_file_line($tetra_worksheet, $tetra_index, $ssr_id); | |
| $tetra_index++; | |
| } | |
| } | |
| } | |
| } | |
| ############################################################## | |
| sub _initiate_worksheet{ | |
| my $workbook = $_[0]; | |
| my $header_format = $_[1]; | |
| my $text_format = $_[2]; | |
| my $name = $_[3]; | |
| my $worksheet = $workbook->add_worksheet($name); | |
| $worksheet->set_column('A:A', 60, $text_format); | |
| $worksheet->set_column('B:E', 10, $text_format); | |
| $worksheet->set_column('F:G', 30, $text_format); | |
| $worksheet->set_column('H:J', 10, $text_format); | |
| $worksheet->write('A1', "$name with primers", $header_format); | |
| $worksheet->write('A2', 'SSR ID', $header_format); | |
| $worksheet->write('B2', 'Motif', $header_format); | |
| $worksheet->write('C2', '# Repeats', $header_format); | |
| $worksheet->write('D2', 'Start', $header_format); | |
| $worksheet->write('E2', 'End', $header_format); | |
| $worksheet->write('F2', 'Forward Primer', $header_format); | |
| $worksheet->write('G2', 'Reverse Primer', $header_format); | |
| $worksheet->write('H2', 'Forward Tm', $header_format); | |
| $worksheet->write('I2', 'Reverse Tm', $header_format); | |
| $worksheet->write('J2', 'Fragment Size', $header_format); | |
| return $worksheet; | |
| } | |
| ################################################################ | |
| sub _print_excel_file_line{ | |
| my $worksheet = shift; | |
| my $index = shift; | |
| my $ssr_id = shift; | |
| $worksheet->write("A$index", $ssr_id); | |
| $worksheet->write("B$index", $SSR_STATS{$ssr_id}{MOTIF}); | |
| $worksheet->write("C$index", $SSR_STATS{$ssr_id}{NO_REPEATS}); | |
| $worksheet->write("D$index", $SSR_STATS{$ssr_id}{START}); | |
| $worksheet->write("E$index", $SSR_STATS{$ssr_id}{END}); | |
| $worksheet->write("F$index", $SSR_STATS{$ssr_id}{FORWARD}); | |
| $worksheet->write("G$index", $SSR_STATS{$ssr_id}{REVERSE}); | |
| $worksheet->write("H$index", $SSR_STATS{$ssr_id}{LEFT_TM}); | |
| $worksheet->write("I$index", $SSR_STATS{$ssr_id}{RIGHT_TM}); | |
| $worksheet->write("J$index", $SSR_STATS{$ssr_id}{PRODUCT_SIZE}); | |
| } | |
| ############################################################### | |
| sub _printUsage { | |
| print "Usage: $0.pl <arguments>"; | |
| print qq( | |
| The list of arguments includes: | |
| -f|--fasta_file <fasta_file> | |
| Required. The file of the sequences to be searched. | |
| -m|--masked_file <masked_fasta_file> | |
| Required. A soft-masked version of the fasta file (soft masked means low | |
| complexity sequences are in lower case bases.) | |
| -p|--project "project name" | |
| Optional. A project name for use in the Excel output. | |
| ); | |
| print "\n"; | |
| return; | |
| } | |
| 1; |