Jan 23, 2014
hwg gssr scripts
|
|
|
31 |
# |
|
32 |
# <forward_fastq_file>.uniq |
|
33 |
# A fastq file with forward sequences that do not share duplicated beginning |
|
34 |
# with the corresponding reverse read. |
|
35 |
# |
|
36 |
# <forward_fastq_file>.dup |
|
37 |
# A fastq file with forward sequences that do share duplicated beginning |
|
38 |
# with the corresponding reverse read. |
|
39 |
# |
|
40 |
# <reverse_fastq_file>.uniq |
|
41 |
# A fastq file with reverse sequences that do not share duplicated beginning |
|
42 |
# with the corresponding forward read. |
|
43 |
# |
|
44 |
# <reverse_fastq_file>.dup |
|
45 |
# A fastq file with reverse sequences that do share duplicated beginning |
|
46 |
# with the corresponding forward read. |
|
47 |
# |
|
48 |
# Details: |
|
49 |
# ------- |
|
50 |
# This script compares from base 0 to base 20 of the forward read against |
|
51 |
# base 0 to base 20 of the reverse read. In order to detect duplications |
|
52 |
# where a base has been miscalled, it also compares subsequences from base |
|
53 |
# 10 to base 30 and from base 20 to base 40. If any of these subsequnces |
|
54 |
# from both the forward and the reverse read is identical it kicks the reads |
|
55 |
# out into the duplicate file output. All other reads go into the unique |
|
56 |
# file output. |
|
57 |
# |
|
58 |
# The length of the subsequence samples, currently set at 20, can be modified |
|
59 |
# with the $LEN variable below. |
|
60 |
# |
Jan 23, 2014
hwg gssr scripts
|
|
|
72 |
# |
|
73 |
# |
|
74 |
#--------------------------------------------------------------------------------- |
|
75 |
use strict; |
|
76 |
|
|
77 |
#------------------------------------------------------------------------------- |
|
78 |
# DEPENDENCIES |
|
79 |
|
|
80 |
use Getopt::Long; |
|
81 |
|
|
82 |
#------------------------------------------------------------------------------- |
|
83 |
# GLOBAL PARAMETERS |
|
84 |
#------------------------------------------------------------------------------- |
|
85 |
my $LEN = 20; |
|
86 |
|
|
87 |
my $for_file; |
|
88 |
my $rev_file; |
|
89 |
|
|
90 |
Getopt::Long::Configure ('bundling'); |
|
91 |
GetOptions('f|forward=s' => \$for_file, |
|
92 |
'r|reverse=s' => \$rev_file); |
|
93 |
|
|
94 |
## Check that all required parameters have been included |
|
95 |
if(!$for_file){ print "Forward fastq file is required.\n"; _printUsage(); exit;} |
|
96 |
if(!$rev_file){ print "Reverse fastq file is required.\n"; _printUsage(); exit;} |
|
97 |
|
|
98 |
## Check that fastq file exists |
|
99 |
if(! -e $for_file) { print "Forward fastq file $for_file does not exist\n"; exit; } |
|
100 |
if(! -e $rev_file) { print "Reverse fastq file $rev_file does not exist\n"; exit; } |
|
101 |
|
|
102 |
|
|
103 |
my %seq; |
|
104 |
my $line; |
|
105 |
open IN, $for_file; |
|
106 |
while($line = <IN>){ |
|
107 |
$line =~ /\@(\S+)/; |
|
108 |
my $name = $1; |
|
109 |
# store the first of the four fastq lines |
|
110 |
$seq{$name}{FIRST} = $line; |
|
111 |
my $seq = <IN>; |
|
112 |
|
|
113 |
# store the second of the four fastq lines |
|
114 |
$seq{$name}{SEQ} = $seq; |
|
115 |
|
|
116 |
# store the third of the four fastq lines |
|
117 |
$seq{$name}{THIRD} = <IN>; |
|
118 |
|
|
119 |
# store the fourth of the four fastq lines |
|
120 |
$seq{$name}{FOURTH} = <IN>; |
|
121 |
} |
|
122 |
close IN; |
|
123 |
|
|
124 |
|
|
125 |
my $count = 0; |
|
126 |
my $dups = 0; |
|
127 |
|
|
128 |
open IN, $rev_file; |
|
129 |
open OUT_1_OK, ">".$for_file.".uniq"; |
|
130 |
open OUT_2_OK, ">".$rev_file.".uniq"; |
|
131 |
open OUT_1_DUP, ">".$for_file.".dup"; |
|
132 |
open OUT_2_DUP, ">".$rev_file.".dup"; |
|
133 |
|
|
134 |
while($line = <IN>){ |
|
135 |
$count++; |
|
136 |
$line =~ /\@(\S+)/; |
|
137 |
my $name = $1; |
|
138 |
|
|
139 |
# get the rest of the lines associated with this fastq record |
|
140 |
my $seq = <IN>; |
|
141 |
my $third = <IN>; |
|
142 |
my $fourth = <IN>; |
|
143 |
|
|
144 |
# get a substring of the sequence characters starting from base 0 |
|
145 |
# length determined by global var $LEN |
|
146 |
my $rev_begin = substr($seq, 0, $LEN); |
|
147 |
|
|
148 |
# get a substring of the sequence characters starting from base 10 |
|
149 |
# length determined by global var $LEN |
|
150 |
my $rev_begin2 = substr($seq, 10, $LEN); |
|
151 |
|
|
152 |
# get a substring of the sequence characters starting from base 20 |
|
153 |
# length determined by global var $LEN |
|
154 |
my $rev_begin3 = substr($seq, 20, $LEN); |
|
155 |
|
|
156 |
if(!exists $seq{$name}){ |
|
157 |
print "Can't find $name forward seq\n"; |
|
158 |
} |
|
159 |
else{ |
|
160 |
# get the same three substrings from the forward read |
|
161 |
my $for_begin = substr($seq{$name}{SEQ}, 0, $LEN); |
|
162 |
my $for_begin2 = substr($seq{$name}{SEQ}, 10, $LEN); |
|
163 |
my $for_begin3 = substr($seq{$name}{SEQ}, 20, $LEN); |
|
164 |
|
|
165 |
## if any of the substrings between the forward and reverse sequences match, |
|
166 |
## its a duplicate that needs to be filtered. |
|
167 |
if(($for_begin eq $rev_begin) || ($for_begin2 eq $rev_begin2) || ($for_begin3 eq $rev_begin3)){ |
|
168 |
## found duplicated bases at begining of r1 and r2 |
|
169 |
$dups++; |
|
170 |
print OUT_1_DUP $seq{$name}{FIRST}; |
|
171 |
print OUT_1_DUP $seq{$name}{SEQ}; |
|
172 |
print OUT_1_DUP $seq{$name}{THIRD}; |
|
173 |
print OUT_1_DUP $seq{$name}{FOURTH}; |
|
174 |
|
|
175 |
print OUT_2_DUP $line; |
|
176 |
print OUT_2_DUP $seq; |
|
177 |
print OUT_2_DUP $third; |
|
178 |
print OUT_2_DUP $fourth; |
|
179 |
|
|
180 |
} |
|
181 |
else{ |
|
182 |
## r1 and r2 look uniq |
|
183 |
print OUT_1_OK $seq{$name}{FIRST}; |
|
184 |
print OUT_1_OK $seq{$name}{SEQ}; |
|
185 |
print OUT_1_OK $seq{$name}{THIRD}; |
|
186 |
print OUT_1_OK $seq{$name}{FOURTH}; |
|
187 |
|
|
188 |
print OUT_2_OK $line; |
|
189 |
print OUT_2_OK $seq; |
|
190 |
print OUT_2_OK $third; |
|
191 |
print OUT_2_OK $fourth; |
|
192 |
|
|
193 |
} |
|
194 |
} |
|
195 |
} |
|
196 |
close IN; |
|
197 |
|
|
198 |
|
|
199 |
my $pct = ($dups/$count)*100; |
|
200 |
my $uniq2 = $count - $dups; |
|
201 |
print "total pairs examined: $count\n"; |
|
202 |
print "pairs with dup beginning: $dups\n"; |
|
203 |
print "pairs with uniq beginning: $uniq2\n"; |