Skip to content

Commit

Permalink
BUG Fix important sequence reinjection bug
Browse files Browse the repository at this point in the history
Before, it was possible that the wrong mate would be reinjected, leading
to length mismatches
  • Loading branch information
luispedro committed Apr 6, 2020
1 parent bbccc02 commit dad2f06
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 40 deletions.
1 change: 1 addition & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Version 1.1.0+
* Enable specifying *all* module resources by URL with download on first
use
* Fix CIGAR reinjection bug

Version 1.1.0 2020-01-25 by luispedro
* Reintroduce zstd compression (after fixes upstream)
Expand Down
21 changes: 1 addition & 20 deletions NGLess/Interpret.hs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ import Interpretation.Count (executeCount, executeCountCheck)
import Interpretation.CountFile (executeCountFile)
import Interpretation.FastQ
import Interpretation.Write
import Interpretation.Select (executeSelect, executeMappedReadMethod, splitSamlines3, fixCigar)
import Interpretation.Select (executeSelect, executeMappedReadMethod, reinjectSequences)
import Interpretation.Unique
import Interpretation.Substrim
import Utils.Utils
Expand Down Expand Up @@ -589,25 +589,6 @@ executeSelectWBlock input@NGOMappedReadSet{ nglSamFile = isam} args (Block [Vari
_ -> nglTypeError ("Expected variable "++show var++" to contain a mapped read.")

else return []
reinjectSequences :: [SamLine] -> [SamLine] -> NGLess [SamLine]
reinjectSequences original filtered = case (splitSamlines3 original, splitSamlines3 filtered) of
((o1, o2, os), (f1, f2, fs)) -> do
r1 <- reinjectSequences' o1 f1
r2 <- reinjectSequences' o2 f2
ss <- reinjectSequences' os fs
return (r1 ++ r2 ++ ss)
reinjectSequences' :: [SamLine] -> [SamLine] -> NGLess [SamLine]
reinjectSequences' original f@(s@SamLine{}:rs)
| not (any hasSequence f) = let
fixed = flip map (filter hasSequence original) $ \s' -> do
cigar <- fixCigar (samCigar s) (samLength s')
return (s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar }:rs)
asum' [] = return f
asum' (x:xs) = case x of
Right{} -> x
Left{} -> asum' xs
in asum' fixed
reinjectSequences' _ f = return f
executeSelectWBlock expr _ _ = unreachable ("Select with block, unexpected argument: " ++ show expr)


Expand Down
68 changes: 48 additions & 20 deletions NGLess/Interpretation/Select.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{- Copyright 2015-2019 NGLess Authors
{- Copyright 2015-2020 NGLess Authors
- License: MIT
-}

Expand All @@ -7,8 +7,8 @@
module Interpretation.Select
( executeSelect
, executeMappedReadMethod
, splitSamlines3
, fixCigar
, reinjectSequences
) where

import qualified Data.ByteString as B
Expand All @@ -25,7 +25,7 @@ import qualified Data.Text.Encoding as TE
import Data.Bits (Bits(..))
import Control.Monad.Except (throwError)
import Data.Either.Combinators (fromRight)
import Data.List (foldl', find)
import Data.List (foldl')
import Data.Either.Extra (eitherToMaybe)
import Data.Tuple.Extra (fst3)
import Data.Ratio (Ratio, (%))
Expand All @@ -34,7 +34,7 @@ import Data.Maybe

import Data.Sam
import FileManagement
import FileOrStream
import FileOrStream (asSamStream, FileOrStream(..))
import Language
import Output

Expand Down Expand Up @@ -81,24 +81,52 @@ _parseConditions args = do
-- 2) we want to keep the full sequence, so we want to use soft trimming (if
-- at all possible)
matchConditions :: Bool -> MatchCondition -> [(SamLine,B.ByteString)] -> NGLess [(SamLine, B.ByteString)]
matchConditions doReinject conds sg = reinjectSequences doReinject (matchConditions' conds sg)
matchConditions doReinject conds sg =
let sg' = matchConditions' conds sg
in if doReinject
then reinjectSequencesIfNeeded sg sg'
else pure sg'

toStrictBS :: BB.Builder -> B.ByteString
toStrictBS = BL.toStrict . BB.toLazyByteString


reinjectSequencesIfNeeded :: [(SamLine,B.ByteString)] -> [(SamLine,B.ByteString)] -> NGLess [(SamLine,B.ByteString)]
reinjectSequencesIfNeeded _ filtered@((SamHeader{},_):_) = return filtered
reinjectSequencesIfNeeded orig filtered =
if needsReinject (fmap fst filtered)
then do
filtered' <- reinjectSequences (fmap fst orig) (fmap fst filtered)
return $ fmap (\s -> (s, toStrictBS $ encodeSamLine s)) filtered'
else return filtered
where
reinjectSequences True f@((s@SamLine{}, _):rs)
| not (any (hasSequence . fst) f) && any (hasSequence . fst) sg
= do
s' <- addSequence s
return ((s', toStrictBS $ encodeSamLine s'):rs)
reinjectSequences _ f = return f

toStrictBS :: BB.Builder -> B.ByteString
toStrictBS = BL.toStrict . BB.toLazyByteString

addSequence s = case find hasSequence (fst <$> sg) of
Just s'@SamLine{} -> do
cigar' <- fixCigar (samCigar s) (B.length $ samSeq s')
return s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar' }
_ -> return s
needsReinject :: [SamLine] -> Bool
needsReinject fs = let (fs1, fs2, fs0) = splitSamlines3 fs
in needsReinject' fs1 || needsReinject' fs2 || needsReinject' fs0
needsReinject' :: [SamLine] -> Bool
needsReinject' [] = False
needsReinject' xs = not (any hasSequence xs)

-- See note above on "Sequence reinjection" about why this function is necessary
reinjectSequences :: [SamLine] -> [SamLine] -> NGLess [SamLine]
reinjectSequences original filtered = case (splitSamlines3 original, splitSamlines3 filtered) of
((o1, o2, os), (f1, f2, fs)) -> do
r1 <- reinjectSequences' o1 f1
r2 <- reinjectSequences' o2 f2
ss <- reinjectSequences' os fs
return (r1 ++ r2 ++ ss)
reinjectSequences' :: [SamLine] -> [SamLine] -> NGLess [SamLine]
reinjectSequences' original f@(s@SamLine{}:rs)
| not (any hasSequence f) = let
fixed = flip map (filter hasSequence original) $ \s' -> do
cigar <- fixCigar (samCigar s) (samLength s')
return (s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar }:rs)
asum' [] = return f
asum' (x:xs) = case x of
Right{} -> x
Left{} -> asum' xs
in asum' fixed
reinjectSequences' _ f = return f
-- See note above on "Sequence reinjection" about why this function is necessary
fixCigar :: B.ByteString -> Int -> NGLess B.ByteString
fixCigar prev n = do
Expand Down
1 change: 1 addition & 0 deletions tests/fix_cigarSam/fixCigar.ngl
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ mappedBlock = select(mapped) using |mr|:
discard

mappedCall = select(mapped, keep_if=[{mapped}])
write(mappedCall, ofile='output.sam')
10 changes: 10 additions & 0 deletions tests/fix_cigarSam/reinject.sam
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
@SQ SN:SAMN04435864.232 LN:3042917
@SQ SN:SAMN04487766.1000003 LN:272134
@SQ SN:SAMEA4471185.1000011 LN:381280
@SQ SN:SAMN03197959.1000103 LN:43533
@SQ SN:4-2_gene39770 LN:1547
@SQ SN:7-35611_1_gene245044 LN:1547
@SQ SN:5-17872_1_gene78296 LN:1547
Expand Down Expand Up @@ -41,3 +45,9 @@ ER9 377 7-ene66981 13 0 22H45M = 13 0 * * NM:i:1 MD:Z:16A28 AS:i:40
ER9 329 4-ene6183 1494 0 45M22H = 1494 0 * * NM:i:1 MD:Z:28T16 AS:i:40
ER9 377 5-ene109198 13 0 22H45M = 13 0 * * NM:i:1 MD:Z:16A28 AS:i:40
ER9 2121 g944564.1178 1508 0 28H39M = 1508 0 GCTAAAAAAGCTCCTTTCTATCGTATTGTTGTTGCAGAC DFGHGGIJIIGCEGHHFECHF@DEDC@C>CB=AAC>;:A NM:i:2 MD:Z:26C8T3 AS:i:30 XS:i:0 SA:Z:4-12192_1_gene91203,1504,+,44M23S,0,0;
GW1808313582 117 * 136740 0 * * 136740 0 GGCAGTGAAGAGGAGGCGGGGAGGTCCCCTGGCGCTCCCGCTGCATGTATGATCGCGGGGTGGTCAAGCAGATCTGCAAGATCGCCTTTATACGGACAGTCATGCCGGCCGCAGCAGTCGCTCTTATCGCGCTCACCGTCTGGGACTGAT JJJAJA<<JJJJJJJJJJJJFJJJJJJFJJAAJJFJFJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJFJFJJ<JJFAJJJAFJJJJFJFF<JJJJAJJJJJJFJJJJJJJJJJJJJJFFFAA<JJ<JJJFJFFFAA MC:Z:79M AS:i:0 XS:i:0
GW1808313582 189 * 136740 7 * * 136740 0 CGCCCGCGGCTCAGAAAAAGGGACCTGCCACGTGCGCGCGGGCGGCTTTTAGATGAACGTCCACGACTCCGAGCACATG JJJJJJJFJJJJFJJ<AJJJJFF<JF<JAAFFAJJJJJJJJJJJJJJFJJJJJFJJJJFJJFJJJFJJ<<FFJJFFFAA NM:i:3 MD:Z:6A2A0G68 AS:i:68 XS:i:64
GW1808313582 393 SAMN04435864.232 80639 0 64M15H = 80639 0 * * NM:i:0 MD:Z:64 AS:i:64
GW1808313582 393 SAMN04487766.1000003 230499 0 60M19H = 230499 0 * * NM:i:0 MD:Z:60 AS:i:60
GW1808313582 393 SAMEA4471185.1000011 18883 0 61M18H = 18883 0 * * NM:i:1 MD:Z:54A6 AS:i:56
GW1808313582 393 SAMN03197959.1000103 16191 0 60M19H = 16191 0 * * NM:i:1 MD:Z:36C23 AS:i:55

0 comments on commit dad2f06

Please sign in to comment.