Skip to content

Commit

Permalink
ENH Reduce memory usage in count()
Browse files Browse the repository at this point in the history
On a moderately, the memory used by count() drops from ~800 MiB to
540MiB (one third less).

Furthermore, the memory is now consumed by objects rather than thunks.

Larger gains would probably only come from rewriting the code for the
interval map structure as most of the remaining memory is spent there.
  • Loading branch information
luispedro committed Apr 27, 2020
1 parent 25a5d29 commit 4b7939e
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 23 deletions.
93 changes: 73 additions & 20 deletions NGLess/Interpretation/Count.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{- Copyright 2015-2019 NGLess Authors
{- Copyright 2015-2020 NGLess Authors
- License: MIT
-}
{-# LANGUAGE FlexibleContexts, CPP #-}
Expand All @@ -18,6 +18,9 @@ module Interpretation.Count
, loadFunctionalMap
, performCount
, RSV.RefSeqInfo(..)
#ifdef IS_BUILDING_TEST
, AnnotationInfo(..)
#endif
) where

import qualified Data.ByteString as B
Expand All @@ -43,7 +46,6 @@ import qualified Data.Conduit.Algorithms.Async as CAlg
import qualified Data.Conduit.Algorithms.Utils as CAlg
import Data.Conduit.Algorithms.Async (conduitPossiblyCompressedFile)
import Data.Conduit ((.|))
import qualified Data.Strict.Tuple as TU
import Data.Strict.Tuple (Pair(..))
import Control.Monad (when, unless, forM, forM_)

Expand All @@ -53,7 +55,7 @@ import Control.Monad.IO.Class (liftIO)
import Control.Monad.Except (throwError)
import Data.List (foldl1', foldl', sort, sortOn)
import GHC.Conc (getNumCapabilities)
import Control.DeepSeq (NFData(..))
import Control.DeepSeq (NFData(..), ($!!))
import Control.Error (note)
import Control.Applicative ((<|>))
import Data.Maybe
Expand Down Expand Up @@ -113,11 +115,47 @@ toShortest = B8.pack . show
- 3. MOCAT-style "gene name" -> "feature"
-}


-- AnnotationInfo is a C-style embedded list
--
-- The type `AnnotationInfo` is equivalent to `NonEmpty (GffStrand, Int)`, while
-- `Maybe AnnotationInfo` is equivalent to `[(GffString, Int)]` (with `Nothing`
-- representing `[]`).
--
-- The reason to not use those more natural types (which was done until NGLess
-- 1.2) is that some GFF files can get pretty big and it ends up being worth it
-- to save those extra Bytes. Additionally, AnnotationInfo is strict, so
-- `AnnotationInfo` is actually equivalent to a strict version of `NonEmpty
-- (Pair GffStrand Int)`, which avoids any space leaks due to laziness.
--
data AnnotationInfo = AnnotationInfo1 !GffStrand
{-# UNPACK #-} !Int
| AnnotationInfoCons !GffStrand
{-# UNPACK #-} !Int
!AnnotationInfo

instance NFData AnnotationInfo where
rnf (AnnotationInfo1 !_ !_) = ()
rnf (AnnotationInfoCons !_ !_ ais) = rnf ais

aiConcat :: [AnnotationInfo] -> Maybe AnnotationInfo
aiConcat xs = aiFromList . concat . map aiToList $ xs
where
aiToList :: AnnotationInfo -> [(GffStrand, Int)]
aiToList (AnnotationInfo1 s ix) = [(s,ix)]
aiToList (AnnotationInfoCons s ix ais) = (s,ix):aiToList ais

aiFromList :: [(GffStrand, Int)] -> Maybe AnnotationInfo
aiFromList [] = Nothing
aiFromList [(s,ix)] = Just $ AnnotationInfo1 s ix
aiFromList ((s,ix):ais) = AnnotationInfoCons s ix <$> (aiFromList ais)


type GffIMMap = IM.IntervalMap Int AnnotationInfo

-- GFFAnnotationMap maps from `References` (e.g., chromosomes) to positions to (strand/feature-id)
type AnnotationInfo = Pair GffStrand Int
type GffIMMap = IM.IntervalMap Int [AnnotationInfo]
type GFFAnnotationMap = M.Map B.ByteString GffIMMap
type AnnotationRule = GffIMMap -> GffStrand -> (Int, Int) -> [AnnotationInfo]
type AnnotationRule = GffIMMap -> GffStrand -> (Int, Int) -> Maybe AnnotationInfo

-- This implements MOCAT-style "gene name" -> "feature" annotation
type GeneMapAnnotation = M.Map B8.ByteString [Int]
Expand Down Expand Up @@ -767,7 +805,7 @@ loadGFF gffFp opts = do
Just fs -> Just <$> fs]

outputListLno' TraceOutput ["Loading GFF file '", gffFp, "' complete."]
return $! map finishGffAnnotator partials
return $!! map finishGffAnnotator partials
where
singleFeature
| length (optFeatures opts) > 1 = False
Expand Down Expand Up @@ -813,7 +851,10 @@ loadGFF gffFp opts = do
gmap' :: GFFAnnotationMap
gmap' = M.alter insertg' (gffSeqId gline) gmap
insertg' immap = Just $ IM.alter
(\vs -> Just ((gffStrand gline :!: active):fromMaybe [] vs))
(\case
Nothing -> Just $ AnnotationInfo1 (gffStrand gline) active
Just ais -> Just $ AnnotationInfoCons (gffStrand gline) active ais
)
asInterval
(fromMaybe IM.empty immap)

Expand All @@ -834,11 +875,12 @@ loadGFF gffFp opts = do
-- First integer IDs are assigned "first come, first served"
-- `reindex` makes them alphabetical
reindex :: GFFAnnotationMap -> M.Map B.ByteString Int -> Pair GFFAnnotationMap [B.ByteString]
reindex amap namemap = (M.map (fmap (map reindexAI)) amap :!: headers)
reindex amap namemap = (M.map (IM.map reindexAI) amap :!: headers)
where
headers = M.keys namemap -- these are sorted
reindexAI :: AnnotationInfo -> AnnotationInfo
reindexAI (s :!: v) = s :!: (ix2ix VU.! v)
reindexAI (AnnotationInfo1 s v) = AnnotationInfo1 s (ix2ix VU.! v)
reindexAI (AnnotationInfoCons s v ais) = AnnotationInfoCons s (ix2ix VU.! v) (reindexAI ais)
ix2ix :: VU.Vector Int
ix2ix = revnamemap namemap

Expand All @@ -854,8 +896,11 @@ loadGFF gffFp opts = do
annotateSamLineGFF :: CountOpts -> GFFAnnotationMap -> SamLine -> [Int]
annotateSamLineGFF opts amap samline = case M.lookup rname amap of
Nothing -> []
Just im -> TU.snd <$> (optIntersectMode opts) im lineStrand (sStart, sEnd)
Just im -> selectIx $ (optIntersectMode opts) im lineStrand (sStart, sEnd)
where
selectIx Nothing = []
selectIx (Just (AnnotationInfo1 _ ix)) = [ix]
selectIx (Just (AnnotationInfoCons _ ix ais)) = ix:selectIx (Just ais)
rname = samRName samline
sStart = samPos samline
sEnd = sStart + samLength samline - 1
Expand All @@ -870,16 +915,23 @@ annotateSamLineGFF opts amap samline = case M.lookup rname amap of
| isPositive samline -> GffNegStrand
| otherwise -> GffPosStrand

filterStrand :: GffStrand -> IM.IntervalMap Int [AnnotationInfo] -> IM.IntervalMap Int [AnnotationInfo]
filterStrand :: GffStrand -> IM.IntervalMap Int AnnotationInfo -> IM.IntervalMap Int AnnotationInfo
filterStrand GffUnStranded = id
filterStrand strand = IM.mapMaybe $ \ais -> case filter matchStrand ais of
[] -> Nothing
ais' -> Just ais'
filterStrand strand = IM.mapMaybe matchStrand
where
matchStrand (s :!: _) = s == GffUnStranded || s == strand
match1 s = s == GffUnStranded || s == strand
matchStrand ai@(AnnotationInfo1 s _)
| match1 s = Just ai
| otherwise = Nothing
matchStrand (AnnotationInfoCons s ix ais)
| match1 s =
Just $! case matchStrand ais of
Nothing -> AnnotationInfo1 s ix
Just ais' -> AnnotationInfoCons s ix ais'
| otherwise = matchStrand ais

union :: AnnotationRule
union im strand (sS, sE) = concat . IM.elems . filterStrand strand . IM.intersecting im $ IM.ClosedInterval sS sE
union im strand (sS, sE) = aiConcat . IM.elems . filterStrand strand . IM.intersecting im $ IM.ClosedInterval sS sE

intersection_strict :: AnnotationRule
intersection_strict im strand (sS, sE) = intersection' $ map (filterStrand strand . IM.containing im) [sS..sE]
Expand All @@ -889,9 +941,10 @@ intersection_non_empty im strand (sS, sE) = intersection' . filter (not . null)
where
subim = IM.intersecting im (IM.ClosedInterval sS sE)

intersection' :: [GffIMMap] -> [AnnotationInfo]
intersection' [] = []
intersection' im = concat . IM.elems $ foldl1' IM.intersection im
intersection' :: [GffIMMap] -> Maybe AnnotationInfo
intersection' [] = Nothing
intersection' im = aiConcat . IM.elems $ foldl1' IM.intersection im


lookupFilePath context name args = case lookup name args of
Nothing -> return Nothing
Expand Down
10 changes: 7 additions & 3 deletions Tests-Src/Tests/Count.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import qualified Data.Conduit.Combinators as C
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.List as CL
import qualified Data.Conduit as C
import qualified Data.Strict.Tuple as TU
import Data.Conduit ((.|))
import Data.List (elemIndex)
import Control.Monad.IO.Class (liftIO)
Expand Down Expand Up @@ -74,11 +73,16 @@ defCountOpts =
}


extractIds :: [AnnotationInfo] -> [Int]
extractIds [] = []
extractIds (AnnotationInfo1 _ ix:xs) = ix:extractIds xs
extractIds (AnnotationInfoCons _ ix ais:xs) = ix:extractIds (ais:xs)

very_short_gff = "test_samples/very_short.gtf"
case_load_very_short = do
[GFFAnnotator immap headers szmap] <- testNGLessIO
$ loadAnnotator (AnnotateGFF very_short_gff) defCountOpts { optFeatures = ["gene"] }
let usedIDs = map TU.snd $ concat $ concatMap IM.elems $ M.elems immap
let usedIDs = extractIds $ concatMap IM.elems $ M.elems immap
length (listNub usedIDs) @?= length headers
minimum usedIDs @?= 0
maximum usedIDs @?= length headers - 1
Expand All @@ -101,7 +105,7 @@ case_load_gff_order = do
fp <- testNGLessIO $ asTempFile short3 "gtf"
[GFFAnnotator immap headers _] <- testNGLessIO
$ loadAnnotator (AnnotateGFF fp) defCountOpts { optFeatures = ["gene"] }
let [h] = map TU.snd . concat . IM.elems . fromJust $ M.lookup "V" immap
let [h] = extractIds . IM.elems . fromJust $ M.lookup "V" immap
(headers !! h) @?= "WBGene00008825"

short1 :: B.ByteString
Expand Down
2 changes: 2 additions & 0 deletions docs/sources/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Internal improvements

- Modules can now specify their annotation as a URL that NGLess downloads on a
"as needed" basis: in version 1.1, only FASTA files were supported.
- Memory consumption of `count() function <Functions.html#count>`__ has been
improved when using GFF files.

Version 1.1.1
-------------
Expand Down

0 comments on commit 4b7939e

Please sign in to comment.