Jul 7, 2015
primer3 input file being generated now
|
|
|
542 |
my $ssrStart = $SSR_STATS{$ssr_id}{START}; |
|
543 |
my $ssrEnd = $SSR_STATS{$ssr_id}{END}; |
|
544 |
my $seq = $SSR_STATS{$ssr_id}{SEQM}; |
|
545 |
|
|
546 |
# change from soft mask to hard mask |
|
547 |
$seq =~ s/[actg]/N/g; |
|
548 |
|
|
549 |
my $len = $ssrEnd-$ssrStart; |
|
550 |
|
|
551 |
printf OUT ("SEQUENCE_ID=$ssr_id\n"); |
|
552 |
printf OUT ("SEQUENCE_TEMPLATE=$seq\n"); |
|
553 |
printf OUT ("SEQUENCE_TARGET=$ssrStart,$len\n"); |
|
554 |
printf OUT ("PRIMER_TASK=generic\n"); |
|
555 |
printf OUT ("PRIMER_PICK_LEFT_PRIMER=1\n"); |
|
556 |
printf OUT ("PRIMER_PICK_INTERNAL_OLIGO=0\n"); |
|
557 |
printf OUT ("PRIMER_PICK_RIGHT_PRIMER=1\n"); |
|
558 |
printf OUT ("PRIMER_OPT_SIZE=$PRIMER_OPT_SIZE\n"); |
|
559 |
printf OUT ("PRIMER_MIN_SIZE=$PRIMER_MIN_SIZE\n"); |
|
560 |
printf OUT ("PRIMER_MAX_SIZE=$PRIMER_MAX_SIZE\n"); |
|
561 |
printf OUT ("PRIMER_NUM_NS_ACCEPTED=$PRIMER_NUM_NS_ACCEPTED\n"); |
|
562 |
printf OUT ("PRIMER_PRODUCT_SIZE_RANGE=$PRIMER_PRODUCT_SIZE_RANGE\n"); |
|
563 |
printf OUT ("PRIMER_OPT_TM=$PRIMER_OPT_TM\n"); |
|
564 |
printf OUT ("PRIMER_MIN_TM=$PRIMER_MIN_TM\n"); |
|
565 |
printf OUT ("PRIMER_MAX_TM=$PRIMER_MAX_TM\n"); |
|
566 |
printf OUT ("PRIMER_MIN_GC=$PRIMER_MIN_GC\n"); |
|
567 |
printf OUT ("PRIMER_MAX_GC=$PRIMER_MAX_GC\n"); |
|
568 |
printf OUT ("PRIMER_MAX_POLY_X=$PRIMER_MAX_POLY_X\n"); |
|
569 |
printf OUT ("PRIMER_GC_CLAMP=$PRIMER_GC_CLAMP\n"); |
|
570 |
printf OUT ("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=$PRIMER3_CONFIG\n"); |
Jul 7, 2015
parse primer3 output working again
|
|
|
578 |
sub parseP3_output{ |
|
579 |
my $p3_output = $_[0]; # file name |
|
580 |
|
|
581 |
# We are going to keep track of a weird phenomenon only seen in one |
|
582 |
# project - the generation of identical forward and reverse primers. The |
|
583 |
# sequences from this project were overlapping paired ends that were joined. |
|
584 |
# Apparently something went wrong and weird sequences were obtained, all of |
|
585 |
# which yield the identical primers. |
|
586 |
# This is not reported in the final stats, just as part of the standard output. |
|
587 |
my $identical_primer_cnt = 0; |
|
588 |
|
|
589 |
# The primers output file separates information about different sequences |
|
590 |
# with an equal sign on a single line. So, we want to set the file line |
|
591 |
# delimiter (for looping on the input file below) to a single equal sign |
|
592 |
# followed by a line feed. This way we're guaranteed to have all the primer |
|
593 |
# information together per line |
|
594 |
local $/ = "=\n"; |
|
595 |
|
|
596 |
open (P3O, $p3_output) || die "could not open $_\n"; |
|
597 |
|
|
598 |
# Read in all of the lines of the input file |
|
599 |
my $primer_record; |
|
600 |
while ($primer_record = <P3O>) { |
|
601 |
my $start = ""; |
|
602 |
my $seq_id = ""; |
|
603 |
my $ssr_id = ""; |
|
604 |
my $forward = ""; |
|
605 |
my $reverse = ""; |
|
606 |
my $product_size = ""; |
|
607 |
my $left_tm = ""; |
|
608 |
my $right_tm = ""; |
|
609 |
|
|
610 |
if ($primer_record =~ /SEQUENCE_ID=(\S+)/) { |
|
611 |
$ssr_id = $1; |
|
612 |
} |
|
613 |
# get the primary primers only |
|
614 |
if ($primer_record =~ /PRIMER_LEFT_0_SEQUENCE=(\S+)/) { |
|
615 |
$forward = $1; |
|
616 |
} |
|
617 |
if ($primer_record =~ /PRIMER_RIGHT_0_SEQUENCE=(\S+)/) { |
|
618 |
$reverse = $1; |
|
619 |
} |
|
620 |
if ($primer_record =~ /PRIMER_LEFT_0_TM=(\S+)/) { |
|
621 |
$left_tm = $1; |
|
622 |
} |
|
623 |
if ($primer_record =~ /PRIMER_RIGHT_0_TM=(\S+)/) { |
|
624 |
$right_tm = $1; |
|
625 |
} |
|
626 |
if ($primer_record =~ /PRIMER_PAIR_0_PRODUCT_SIZE=(\S+)/) { |
|
627 |
$product_size = $1; |
|
628 |
} |
|
629 |
|
|
630 |
if(length $forward > 1){ |
|
631 |
if($forward eq $reverse){ |
Jul 7, 2015
Printing fasta files and other flat files done. Collapsing compound s…
|
|
|
668 |
print OUTS "\n"; |
|
669 |
|
|
670 |
print OUT2 join("\t", "SSR ID", |
|
671 |
"motif", "number of repeats", "start position", |
|
672 |
"end position", "forward primer", "reverse primer", |
|
673 |
"forward Tm", "reverse Tm","product size" ); |
|
674 |
print OUT2 "\n"; |
|
675 |
|
|
676 |
print OUT3 join("\t", "SSR ID", |
|
677 |
"motif", "number of repeats", "start position", |
|
678 |
"end position", "forward primer", "reverse primer", |
|
679 |
"forward Tm", "reverse Tm","product size" ); |
|
680 |
print OUT3 "\n"; |
|
681 |
|
|
682 |
print OUT4 join("\t", "SSR ID", |
|
683 |
"motif", "number of repeats", "start position", |
|
684 |
"end position", "forward primer", "reverse primer", |
|
685 |
"forward Tm", "reverse Tm","product size" ); |
|
686 |
print OUT4 "\n"; |
|
687 |
|
|
688 |
foreach my $ssr_id (keys %SSR_STATS){ |
|
689 |
## all ssrs including compound go in main ssr file |
|
690 |
print OUTS join("\t", |
|
691 |
$ssr_id, |
|
692 |
$SSR_STATS{$ssr_id}{MOTIF}, |
Jul 7, 2015
Lots of fixes, excel summary sheet being produced.
|
|
|
892 |
|
|
893 |
|
|
894 |
} |
|
895 |
|
|
896 |
sub create_stats_worksheet{ |
|
897 |
my $workbook = shift; |
|
898 |
my $header_format = shift; |
|
899 |
my $text_format = shift; |
|
900 |
my $project = shift; |
|
901 |
|
|
902 |
my $worksheet = $workbook->add_worksheet("Summary"); |
|
903 |
|
|
904 |
## set all cells to text format |
|
905 |
## only cells that need the header format will need to specify the format during write |
|
906 |
## set column widths |
|
907 |
$worksheet->set_column('A:A', 75, $text_format); |
|
908 |
$worksheet->set_column('B:B', 30, $text_format); |
|
909 |
$worksheet->set_column('C:C', 30, $text_format); |
|
910 |
|
|
911 |
$worksheet->write('A1', "SSR Summary Report for $project", $header_format); |
|
912 |
$worksheet->write('A2', "Analysis of $SEQ_COUNT sequences"); |
|
913 |
$worksheet->write('A3', "$TIME"); |
|
914 |
|
|
915 |
$worksheet->write('A5', "Number of sequences with at least one SSR"); |
|
916 |
$worksheet->write('B5', "$SEQ_w_SSRS"); |
|
917 |
|
|
918 |
$worksheet->write('A6', "Number of SSRs identified"); |
|
919 |
$worksheet->write('B6', "$SSR_COUNT"); |
|
920 |
|
|
921 |
$worksheet->write('A8', "Number of compound SSRs*:"); |
|
922 |
$worksheet->write('B8', "$SSR_COUNT_COMPOUND"); |
|
923 |
|
|
924 |
$worksheet->write('A9', "Number of SSRs with primers**"); |
|
925 |
$worksheet->write('B9', "$SSR_COUNT_PRIMER"); |
|
926 |
|
|
927 |
$worksheet->write('A11', "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"); |
|
928 |
|
|
929 |
$worksheet->write('A12', "**No primers are designed for compound SSRs\n"); |
|
930 |
|
|
931 |
$worksheet->write('A14', "Parameters used for identifying SSRS:\n", $header_format); |
|
932 |
|
|
933 |
$worksheet->write('A15','Base Pairs in Motif', $header_format); |
|
934 |
$worksheet->write('B15','Min # Reps', $header_format); |
|
935 |
$worksheet->write('C15','Max # Reps', $header_format); |
|
936 |
|
|
937 |
$worksheet->write('A16','2 (Dinucleotides)'); |
|
938 |
$worksheet->write('B16',"$MIN_REPS_2bp"); |
|
939 |
$worksheet->write('C16',"$MAX_REPS_2bp"); |
|
940 |
|
|
941 |
$worksheet->write('A17','3 (Trinucleotides)'); |
|
942 |
$worksheet->write('B17',"$MIN_REPS_3bp"); |
|
943 |
$worksheet->write('C17',"$MAX_REPS_3bp"); |
|
944 |
|
|
945 |
$worksheet->write('A18','4 (Tetranucleotides)'); |
|
946 |
$worksheet->write('B18',"$MIN_REPS_4bp"); |
|
947 |
$worksheet->write('C18',"$MAX_REPS_4bp"); |
|
948 |
|
|
949 |
##---------------------------------------------------------- |
|
950 |
##Chart of motif pattern frequence (compound SSRs excluded) |
|
951 |
|
|
952 |
$worksheet->write('A20','Chart of motif pattern frequence (compound SSRs excluded)', $header_format); |
|
953 |
$worksheet->write('A21','Motif Patterns', $header_format); |
|
954 |
$worksheet->write('B21','Number of SSRs Found', $header_format); |
|
955 |
my $group; |
|
956 |
my $i = 21; |
|
957 |
foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ |
|
958 |
$group =~ s/^|//; |
|
959 |
$group =~ s/|$//; |
|
960 |
$i++; |
|
961 |
$worksheet->write("A$i", $group); |
|
962 |
$worksheet->write("B$i", $MOTIFS{$group}); |
|
963 |
} |
|
964 |
|
|
965 |
##---------------------------------------------------------- |
|
966 |
## Chart of motif pattern length frequence (compound SSRs excluded) |
|
967 |
$i++; |
|
968 |
$i++; |
|
969 |
$worksheet->write("A$i", "Chart of motif pattern length frequence (compound SSRs excluded)", $header_format); |
|
970 |
$i++; |
|
971 |
$worksheet->write("A$i",'Motif Pattern Length', $header_format); |
|
972 |
$worksheet->write("B$i",'Number of SSRs Found', $header_format); |
|
973 |
foreach $group (sort keys %MOTIFLEN){ |
|
974 |
$i++; |
|
975 |
$worksheet->write("A$i", "$group bp"); |
|
976 |
$worksheet->write("B$i", $MOTIFLEN{$group}); |
|
977 |
} |
|
978 |
|
|
979 |
##---------------------------------------------------------- |
|
980 |
## SSRs w primers: |
|
981 |
## Chart of motif pattern length frequence (compound SSRs excluded) |
|
982 |
$i++; |
|
983 |
$i++; |
|
984 |
$worksheet->write("A$i",'SSRs with Primers', $header_format); |
|
985 |
$i++; |
|
986 |
$worksheet->write("A$i",'Chart of motif pattern length frequence (compound SSRs excluded)', $header_format); |
|
987 |
$i++; |
|
988 |
$worksheet->write("A$i",'Motif Pattern Length', $header_format); |
|
989 |
$worksheet->write("B$i",'Number of SSRs Found', $header_format); |
|
990 |
foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ |
|
991 |
$i++; |
|
992 |
$worksheet->write("A$i", "$group bp"); |
|
993 |
$worksheet->write("B$i", $MOTIFLEN_w_PRIMERS{$group}); |
|
994 |
} |
|
995 |
|
Jul 7, 2015
Everything working including excel data sheets.
|
|
|
1005 |
my $di_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Dinucleotides"); |
|
1006 |
my $tri_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Trinucleotides"); |
|
1007 |
my $tetra_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Tetranucleotides"); |
|
1008 |
|
|
1009 |
my $di_index = 3; |
|
1010 |
my $tri_index = 3; |
|
1011 |
my $tetra_index = 3; |
|
1012 |
|
|
1013 |
foreach my $ssr_id (keys %SSR_STATS){ |
|
1014 |
# for excel data files, only print SSRs |
|
1015 |
# that have primers |
|
1016 |
if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && |
|
1017 |
$SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ |
|
1018 |
){ |
|
1019 |
if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ |
|
1020 |
_print_excel_file_line($di_worksheet, $di_index, $ssr_id); |
|
1021 |
$di_index++; |
|
1022 |
} |
|
1023 |
elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ |
|
1024 |
_print_excel_file_line($tri_worksheet, $tri_index, $ssr_id); |
|
1025 |
$tri_index++; |
|
1026 |
} |
|
1027 |
elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ |
|
1028 |
_print_excel_file_line($tetra_worksheet, $tetra_index, $ssr_id); |
|
1029 |
$tetra_index++; |
|
1030 |
} |
|
1031 |
} |
|
1032 |
} |
|
1033 |
|
|
1034 |
|
|
1035 |
} |
|
1036 |
|
|
1037 |
############################################################## |
|
1038 |
# _initiate_worksheet($workbook, $header_format, $text_format, $name)
#
# Adds a worksheet called $name to $workbook, applies $text_format and a
# width to columns A through J, writes a title into A1 and the ten column
# headings into row 2 (all with $header_format), and returns the new
# worksheet object.
sub _initiate_worksheet{
    my ($book, $head_fmt, $body_fmt, $sheet_name) = @_;

    my $sheet = $book->add_worksheet($sheet_name);

    # Column range => width; every range receives the text format.
    my @column_layout = (
        [ 'A:A', 60 ],
        [ 'B:E', 10 ],
        [ 'F:G', 30 ],
        [ 'H:J', 10 ],
    );
    for my $layout (@column_layout) {
        my ($range, $width) = @{$layout};
        $sheet->set_column($range, $width, $body_fmt);
    }

    # Title row.
    $sheet->write('A1', "$sheet_name with primers", $head_fmt);

    # Heading row: one cell per reported field, in column order.
    my @headings = (
        [ 'A2', 'SSR ID' ],
        [ 'B2', 'Motif' ],
        [ 'C2', '# Repeats' ],
        [ 'D2', 'Start' ],
        [ 'E2', 'End' ],
        [ 'F2', 'Forward Primer' ],
        [ 'G2', 'Reverse Primer' ],
        [ 'H2', 'Forward Tm' ],
        [ 'I2', 'Reverse Tm' ],
        [ 'J2', 'Fragment Size' ],
    );
    for my $heading (@headings) {
        my ($cell, $label) = @{$heading};
        $sheet->write($cell, $label, $head_fmt);
    }

    return $sheet;
}
Jul 7, 2015
Everything working including excel data sheets.
|
|
|
1067 |
# _print_excel_file_line($worksheet, $index, $ssr_id)
#
# Writes one SSR record into row $index of $worksheet: the SSR id goes in
# column A and the values stored under $SSR_STATS{$ssr_id} go in columns
# B through J.
sub _print_excel_file_line{
    my ($sheet, $row, $id) = @_;

    $sheet->write("A$row", $id);

    # Column letter => %SSR_STATS field for columns B..I.
    my @field_for_column = (
        [ 'B', 'MOTIF' ],
        [ 'C', 'NO_REPEATS' ],
        [ 'D', 'START' ],
        [ 'E', 'END' ],
        [ 'F', 'FORWARD' ],
        [ 'G', 'REVERSE' ],
        [ 'H', 'LEFT_TM' ],
        [ 'I', 'RIGHT_TM' ],
    );
    for my $mapping (@field_for_column) {
        my ($column, $field) = @{$mapping};
        $sheet->write("$column$row", $SSR_STATS{$id}{$field});
    }

    # Column J is written as the final statement so the sub still yields
    # the result of this last write() call, as it did before.
    $sheet->write("J$row", $SSR_STATS{$id}{PRODUCT_SIZE});
}
|
1084 |
|
|
1085 |
############################################################### |
|
1086 |
# _printUsage()
#
# Prints the command-line usage summary (script name followed by the list
# of supported arguments) to the currently selected output handle.
# Takes no arguments and returns nothing.
sub _printUsage {
    # $0 already holds the script name as it was invoked, including any
    # ".pl" extension, so nothing is appended to it here.
    print "Usage: $0 <arguments>";
    print qq(
The list of arguments includes:

-f|--fasta_file <fasta_file>
Required. The file of the sequences to be searched.

-m|--masked_file <masked_fasta_file>
Required. A soft-masked version of the fasta file (soft masked means low
complexity sequences are in lower case bases.)

-p|--project "project name"
Optional. A project name for use in the Excel output.

);
    print "\n";
    return;
}
|
1105 |
|
|
1106 |
|
|
1107 |
1; |