Permalink
Newer
Older
100755 227 lines (191 sloc) 6.79 KB
Jan 23, 2014 @mestato hwg gssr scripts
1 #!/usr/bin/env perl
2 ################################################################################
3 # Author: Meg Staton
4 # Date: Jan 23rd, 2014
5 # Version: 1
6 #
7 # DESCRIPTION
8 # -----------
9 # This script identifies forward and reverse reads with the same beginning bases.
10 #
11 # Dependencies:
12 # ------------
13 # Perl must have access to the package Getopt::Long, available from
14 # CPAN.
15 #
16 # Usage:
17 # -----
18 # Usage: find_dup_reads.pl -f <forward fastq file> -r <reverse fastq file>
19 #
20 # The list of arguments includes:
21 #
22 # -f|--forward_fastq_file <fastq_file>
23 # Required. The file of the forward direction sequences to be searched.
24 #
25 # -r|--reverse_fastq_file <fastq_file>
26 # Required. The file of the reverse direction sequences to be searched.
27 #
28 # Output:
29 # ------
Jan 23, 2014 @mestato adding comments
30 # Four output files are produced, in the same directory as the input files:
Jan 23, 2014 @mestato hwg gssr scripts
31 #
32 # <forward_fastq_file>.uniq
33 # A fastq file with forward sequences that do not share duplicated beginning
34 # with the corresponding reverse read.
35 #
36 # <forward_fastq_file>.dup
37 # A fastq file with forward sequences that do share duplicated beginning
38 # with the corresponding reverse read.
39 #
40 # <reverse_fastq_file>.uniq
41 # A fastq file with reverse sequences that do not share duplicated beginning
42 # with the corresponding forward read.
43 #
44 # <reverse_fastq_file>.dup
45 # A fastq file with reverse sequences that do share duplicated beginning
46 # with the corresponding forward read.
47 #
48 # Details:
49 # -------
50 # This script compares from base 0 to base 20 of the forward read against
51 # base 0 to base 20 of the reverse read. In order to detect duplications
52 # where a base has been miscalled, it also compares subsequences from base
53 # 10 to base 30 and from base 20 to base 40. If any of these subsequnces
54 # from both the forward and the reverse read is identical it kicks the reads
55 # out into the duplicate file output. All other reads go into the unique
56 # file output.
57 #
58 # The length of the subsequence samples, currently set at 20, can be modified
59 # with the $LEN variable below.
60 #
Jan 23, 2014 @mestato adding comments
61 # WARNING: This script puts the entire forward file into memory in a hash, so
62 # use with caution on large files.
63 #
64 # Needed improvements and caveat emptors:
65 # - if the duplicated region is shifted by even one base in either
66 # direction, this script will not detect it.
67 #
68 # - memory requirements are ridiculous, a better strategy is needed
69 #
70 # - the output files are put whereever the input files are because the path
71 # is not parsed off, could be fixed with File::Basename
Jan 23, 2014 @mestato hwg gssr scripts
72 #
73 #
74 #---------------------------------------------------------------------------------
75 use strict;
76
77 #-------------------------------------------------------------------------------
78 # DEPENDENCIES
79
80 use Getopt::Long;
81
82 #-------------------------------------------------------------------------------
83 # GLOBAL PARAMETERS
84 #-------------------------------------------------------------------------------
85 my $LEN = 20;
86
87 my $for_file;
88 my $rev_file;
89
90 Getopt::Long::Configure ('bundling');
91 GetOptions('f|forward=s' => \$for_file,
92 'r|reverse=s' => \$rev_file);
93
94 ## Check that all required parameters have been included
95 if(!$for_file){ print "Forward fastq file is required.\n"; _printUsage(); exit;}
96 if(!$rev_file){ print "Reverse fastq file is required.\n"; _printUsage(); exit;}
97
98 ## Check that fastq file exists
99 if(! -e $for_file) { print "Forward fastq file $for_file does not exist\n"; exit; }
100 if(! -e $rev_file) { print "Reverse fastq file $rev_file does not exist\n"; exit; }
101
102
103 my %seq;
104 my $line;
105 open IN, $for_file;
106 while($line = <IN>){
107 $line =~ /\@(\S+)/;
108 my $name = $1;
109 # store the first of the four fastq lines
110 $seq{$name}{FIRST} = $line;
111 my $seq = <IN>;
112
113 # store the second of the four fastq lines
114 $seq{$name}{SEQ} = $seq;
115
116 # store the third of the four fastq lines
117 $seq{$name}{THIRD} = <IN>;
118
119 # store the fourth of the four fastq lines
120 $seq{$name}{FOURTH} = <IN>;
121 }
122 close IN;
123
124
125 my $count = 0;
126 my $dups = 0;
127
128 open IN, $rev_file;
129 open OUT_1_OK, ">".$for_file.".uniq";
130 open OUT_2_OK, ">".$rev_file.".uniq";
131 open OUT_1_DUP, ">".$for_file.".dup";
132 open OUT_2_DUP, ">".$rev_file.".dup";
133
134 while($line = <IN>){
135 $count++;
136 $line =~ /\@(\S+)/;
137 my $name = $1;
138
139 # get the rest of the lines associated with this fastq record
140 my $seq = <IN>;
141 my $third = <IN>;
142 my $fourth = <IN>;
143
144 # get a substring of the sequence characters starting from base 0
145 # length determined by global var $LEN
146 my $rev_begin = substr($seq, 0, $LEN);
147
148 # get a substring of the sequence characters starting from base 10
149 # length determined by global var $LEN
150 my $rev_begin2 = substr($seq, 10, $LEN);
151
152 # get a substring of the sequence characters starting from base 20
153 # length determined by global var $LEN
154 my $rev_begin3 = substr($seq, 20, $LEN);
155
156 if(!exists $seq{$name}){
157 print "Can't find $name forward seq\n";
158 }
159 else{
160 # get the same three substrings from the forward read
161 my $for_begin = substr($seq{$name}{SEQ}, 0, $LEN);
162 my $for_begin2 = substr($seq{$name}{SEQ}, 10, $LEN);
163 my $for_begin3 = substr($seq{$name}{SEQ}, 20, $LEN);
164
165 ## if any of the substrings between the forward and reverse sequences match,
166 ## its a duplicate that needs to be filtered.
167 if(($for_begin eq $rev_begin) || ($for_begin2 eq $rev_begin2) || ($for_begin3 eq $rev_begin3)){
168 ## found duplicated bases at begining of r1 and r2
169 $dups++;
170 print OUT_1_DUP $seq{$name}{FIRST};
171 print OUT_1_DUP $seq{$name}{SEQ};
172 print OUT_1_DUP $seq{$name}{THIRD};
173 print OUT_1_DUP $seq{$name}{FOURTH};
174
175 print OUT_2_DUP $line;
176 print OUT_2_DUP $seq;
177 print OUT_2_DUP $third;
178 print OUT_2_DUP $fourth;
179
180 }
181 else{
182 ## r1 and r2 look uniq
183 print OUT_1_OK $seq{$name}{FIRST};
184 print OUT_1_OK $seq{$name}{SEQ};
185 print OUT_1_OK $seq{$name}{THIRD};
186 print OUT_1_OK $seq{$name}{FOURTH};
187
188 print OUT_2_OK $line;
189 print OUT_2_OK $seq;
190 print OUT_2_OK $third;
191 print OUT_2_OK $fourth;
192
193 }
194 }
195 }
196 close IN;
197
198
199 my $pct = ($dups/$count)*100;
200 my $uniq2 = $count - $dups;
201 print "total pairs examined: $count\n";
202 print "pairs with dup beginning: $dups\n";
203 print "pairs with uniq beginning: $uniq2\n";
Jan 23, 2014 @mestato adding comments
204 print "percent duplicates: $pct %\n";
Jan 23, 2014 @mestato hwg gssr scripts
205
206
207 ################################################################################
208 sub _printUsage {
209 print qq(
210 Usage: $0 -f <forward fastq file> -r <reverse fastq file>
211
212 The list of arguments includes:
213
214 -f|--forward
215 Required. The file of the forward direction sequences to be searched.
216
217 -r|--reverse
218 Required. The file of the reverse direction sequences to be searched.
219
Jan 23, 2014 @mestato Addition of warning to usage instructions.
220 WARNING: This script puts the entire forward fastq file into memory, so
221 use with extreme caution on large files.
222
Jan 23, 2014 @mestato hwg gssr scripts
223 );
224 print "\n";
225 return;
226 }