Skip to content

Commit

Permalink
BUG Fix FastQ encoding detection heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
luispedro committed Feb 23, 2017
1 parent c343f8a commit a0aa888
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 26 deletions.
39 changes: 23 additions & 16 deletions NGLess/Interpretation/FastQ.hs
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,32 @@ import NGLess
-- | Guess the encoding of a file
encodingFor :: FilePath -> NGLessIO FastQEncoding
encodingFor fp = do
let countMin :: (Int, Word8) -> Word8 -> (Int, Word8)
countMin (!c,!m) m' = (c+1, min m m')
minLc :: (MonadError NGError m) => [ByteLine] -> m Word8
minLc [_,_,_,qs] = return . B.minimum . unwrapByteLine $ qs
minLc _ = throwDataError ("Malformed FASTQ file: '" ++ fp ++ "': number of lines is not a multiple of 4")
let update :: Word8 -> Word8 -> B.ByteString -> (Word8, Word8)
update minv maxv qs = (min minv $ B.minimum qs, max maxv $ B.maximum qs)
encodingC minv maxv =
C.await >>= \case
Nothing
| minv == 255 -> do
lift $ outputListLno' WarningOutput ["Input file ", fp, " is empty."]
return SangerEncoding -- It does not matter
| otherwise -> do
lift $ outputListLno' WarningOutput ["Heuristic for FastQ encoding determination for file ", show fp, " cannot be 100% confident. Guessing 33 offset (Sanger encoding, used by newer Illumina machines)."]
return SangerEncoding
Just [_, _, _, ByteLine qs] -> case update minv maxv qs of
(minv', maxv')
| minv' < 33 -> throwDataError ("No known encodings with chars < 33 (Yours was "++ show minv ++ ") for file '" ++ show fp ++ "'.")
| minv' < 58 -> return SangerEncoding
-- 33 + 45 should never happen with SangerEncoding, but it's quality 14 in SolexaEncoding, so should be common
| maxv' >= (33+45) -> return SolexaEncoding
| otherwise -> encodingC minv' maxv'
_ -> throwDataError ("Malformed file '" ++ fp ++ "': number of lines is not a multiple of 4.")

(c,m) <- conduitPossiblyCompressedFile fp

C.runConduit $
conduitPossiblyCompressedFile fp
=$= linesCBounded
=$= groupC 4
=$= CL.isolate 100
=$= CL.mapM minLc
$$ CL.fold countMin (0,maxBound :: Word8)
if
| c < 1 -> do
outputListLno' WarningOutput ["Input file ", fp, " is empty."]
return SangerEncoding -- It does not matter
| m < 33 -> throwDataError ("No known encodings with chars < 33 (Yours was "++ show c ++ ")")
| m < 58 -> return SangerEncoding
| otherwise -> return SolexaEncoding
=$= encodingC 255 0

-- | Checks if file has no content
checkNoContent fp =
Expand Down
19 changes: 17 additions & 2 deletions NGLess/Tests/FastQ.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import Interpretation.FastQ
import Interpretation.Substrim
import Tests.Utils
import Data.FastQ
import Utils.Here

tgroup_FastQ = $(testGroupGenerator)

Expand All @@ -37,14 +38,28 @@ simpleStats f = testNGLessIO $ do

-- negative tests quality on value 60 char ';'. Value will be 60 - 64 which is -4
case_calc_statistics_negative = do
s <- simpleStats "test_samples/sample_low_qual.fq"
lowQ <- testNGLessIO $ asTempFile low_quality "fq"
s <- simpleStats lowQ
head s @?= (-4,-4,-4,-4)

-- low positive tests quality on 65 char 'A'. Value will be 65-64 which is 1.
case_calc_statistics_low_positive = do
s <- simpleStats "test_samples/sample_low_qual.fq"
lowQ <- testNGLessIO $ asTempFile low_quality "fq"
s <- simpleStats lowQ
last s @?= (1,1,1,1)


low_quality = [here|
@IRIS:7:1:17:394#0/1
GTCAGGACAAT
+
<=>?@ABCAWA
@IRIS:7:1:17:800#0/1
GGAAACACTA
+
<=>?@ABCAA
|]

case_calc_statistics_normal = do
s <- simpleStats "test_samples/data_set_repeated.fq"
head s @?= (25,33,31,33)
Expand Down
8 changes: 0 additions & 8 deletions test_samples/sample_low_qual.fq

This file was deleted.

0 comments on commit a0aa888

Please sign in to comment.