Skip to content

Commit

Permalink
ENH Implement selecting reads in regions of interest using samtools
Browse files Browse the repository at this point in the history
refs #69
  • Loading branch information
unode committed Dec 12, 2018
1 parent e68da99 commit 55aa365
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 7 deletions.
1 change: 1 addition & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Version 0.10.0+
* Disable Cairo dependency when building statically
* Update minimap2 version to 2.14
* Module samtools (version 0.1) now includes samtools_view

Version 0.10.0 2018-11-12 by luispedro
* Fix to lock1's return value when used with paths (#68 - reopen)
Expand Down
72 changes: 68 additions & 4 deletions NGLess/StandardModules/Samtools.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
- License: MIT
-}

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleContexts, OverloadedStrings #-}

module StandardModules.Samtools
( loadModule
Expand Down Expand Up @@ -173,14 +173,78 @@ samtools_sort_function = Function
}


executeView :: NGLessObject -> KwArgsValues -> NGLessIO NGLessObject
executeView (NGOMappedReadSet name istream rinfo) args = do
outBam <- lookupBoolOrScriptErrorDef (return False) "samtools_view" "__output_bam" args
bedFile <- lookupStringOrScriptError "coordinates in BED format" "bed_file" args

let oformat = if outBam then "bam" else "sam"
fname = case istream of
File f -> f
Stream _ f _ -> f

(rk, (newfp, hout)) <- openNGLTempFile' fname "subset_" ("." ++ oformat)

numCapabilities <- liftIO getNumCapabilities
let cmdargs = ["view", "-h", "-@", show numCapabilities, "-O", oformat, "-L", T.unpack bedFile]
samtoolsPath <- samtoolsBin
outputListLno' TraceOutput ["Calling binary ", samtoolsPath, " with args: ", unwords cmdargs]
(err, exitCode) <- case istream of
File fpsam -> do
let cp = (proc samtoolsPath (snoc cmdargs fpsam)) { std_out = UseHandle hout }
liftIO $ readProcessErrorWithExitCode cp
Stream _ _ istream' -> do
let cp = (proc samtoolsPath cmdargs) { std_in = CreatePipe, std_out = UseHandle hout, std_err = CreatePipe }
(Just pipe_out, Nothing, Just herr, jHandle) <- liftIO $ createProcess cp
samP <- liftIO . A.async $ waitForProcess jHandle
err <- liftIO . A.async $ do
-- In a separate thread, consume all the error input.
-- the same pattern is used in the implementation of
-- readProcessWithErrorCode (which cannot be used here
-- as we want control over stdin/stdout)
err <- hGetContents herr
void (evaluate (length err))
hClose herr
return err
C.runConduit $ istream' .| byteLineVSinkHandle pipe_out
liftIO $ do
hClose pipe_out
A.waitBoth err samP
outputListLno' DebugOutput ["Samtools err output: ", err]
liftIO $ hClose hout
case exitCode of
ExitSuccess -> do
outputListLno' InfoOutput ["Done samtools view"]
return (NGOMappedReadSet name (File newfp) rinfo)
ExitFailure code -> do
release rk
throwSystemError $ concat ["Failed samtools view\n",
"Executable used::\t", samtoolsPath,"\n",
"Command line was::\n\t", unwords cmdargs, "\n",
"Samtools exit code was ", show code, "."]
executeView _ _ = throwScriptError "Unexpected arguments for samtools_view function"

samtools_view_function = Function
{ funcName = FuncName "samtools_view"
, funcArgType = Just NGLMappedReadSet
, funcArgChecks = []
, funcRetType = NGLMappedReadSet
, funcKwArgs = [ArgInformation "bed_file" True NGLString [ArgCheckFileReadable]]
, funcAllowsAutoComprehension = False
, funcChecks = []
}

loadModule :: T.Text -> NGLessIO Module
loadModule _ =
return def
{ modInfo = ModInfo "stdlib.samtools" "0.0"
{ modInfo = ModInfo "stdlib.samtools" "0.1"
, modCitations = [citation]
, modFunctions = [samtools_sort_function]
, modFunctions = [samtools_sort_function, samtools_view_function]
, modTransform = sortOFormat >=> checkUnique
, runFunction = const executeSort
, runFunction = \case
"samtools_sort" -> executeSort
"samtools_view" -> executeView
other -> \_ _ -> throwShouldNotOccur ("samtools runction called with wrong arguments: " ++ show other)
}
where
citation = T.concat
Expand Down
15 changes: 12 additions & 3 deletions docs/sources/stdlib.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,17 @@ interfere in a way that you get erroneous results**.

## Samtools module

This module exposes the samtools sorting functionality through the
`samtools_sort` function.
This module exposes two samtools functionalities: sorting (`samtools_sort`) and
selecting reads in regions of interest (`samtools_view`).

ngless '0.8'
import "samtools" version "0.0"
input = samfile('input.bam')
sam_regions = samtools_view(input, bed_file="interesting_regions.bed")
write(sam_regions, ofile='interesting.sam')

`samtools_view :: mappedreadset -> mappedreadset` returns a subset of the
mapped reads that overlap with the regions specified in the BED file.

ngless '0.8'
import "samtools" version "0.0"
Expand All @@ -111,7 +120,7 @@ This module exposes the samtools sorting functionality through the
`samtools_sort :: mappedreadset -> mappedreadset` returns a sorted version of
the dataset.

Internally, this function calls ngless' version of samtools while respecting
Internally, both function call ngless' version of samtools while respecting
your settings for the use of threads and temporary disk space. When combined
with other functionality, ngless can also often stream data into/from samtools
instead of relying on intermediate files (these optimizations should not change
Expand Down
Binary file added test_samples/short.bed.gz
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/samfile-select-view/check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash

if [ "$(which samtools 2>/dev/null)" == "" ]; then
if [ "$NGLESS_SAMTOOLS_BIN" == "" ]; then
SAMTOOLS="$(ngless --print-path samtools)"
else
SAMTOOLS="$NGLESS_SAMTOOLS_BIN"
fi
else
SAMTOOLS="$(which samtools)"
fi

if ! diff <($SAMTOOLS view -h output.bam) <(zcat texpected.sam.gz) ; then
exit 1
fi
1 change: 1 addition & 0 deletions tests/samfile-select-view/sample.sam.gz
1 change: 1 addition & 0 deletions tests/samfile-select-view/short.bed.gz
Binary file added tests/samfile-select-view/texpected.sam.gz
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/samfile-select-view/view.ngl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
ngless '0.0'
import "samtools" version "0.1"

mapped = samfile('sample.sam.gz')
region_reads = samtools_view(mapped, bed_file="short.bed.gz")
write(region_reads, ofile='output.bam')

0 comments on commit 55aa365

Please sign in to comment.