Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast scan primitive, like attoparsec's #280

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
42 changes: 42 additions & 0 deletions Text/Megaparsec.hs
Expand Up @@ -866,6 +866,20 @@ class (Stream s, A.Alternative m, MonadPlus m)
-> (Token s -> Bool) -- ^ Predicate to use to test tokens
-> m (Tokens s) -- ^ A chunk of matching tokens

-- | Similar to 'takeWhileP', but with a stateful predicate.
-- The predicate consumes and transforms a state argument of type
-- @st@, and each transformed state is passed to successive iterations
-- of the predicate on each 'Token' of the input until one returns
-- 'Nothing' or the input ends.
--
-- This parser never fails, although it may parse an empty chunk.

scanP
:: Maybe String
-> st
-> (st -> Token s -> Maybe st)
-> m (Tokens s)

-- | Extract the specified number of tokens from the input stream and
-- return them packed as a chunk of stream. If there is not enough tokens
-- in the stream, a parse error will be signaled. It's guaranteed that if
Expand Down Expand Up @@ -909,6 +923,7 @@ instance (Ord e, Stream s) => MonadParsec e s (ParsecT e s m) where
tokens = pTokens
takeWhileP = pTakeWhileP
takeWhile1P = pTakeWhile1P
scanP = pScanP
takeP = pTakeP
getParserState = pGetParserState
updateParserState = pUpdateParserState
Expand Down Expand Up @@ -1072,6 +1087,25 @@ pTakeWhileP ml f = ParsecT $ \(State input (pos:|z) tp w) cok _ eok _ ->
else cok ts (State input' (npos:|z) (tp + len) w) hs
{-# INLINE pTakeWhileP #-}

pScanP :: forall e s m st. Stream s
=> Maybe String
-> st
-> (st -> Token s -> Maybe st)
-> ParsecT e s m (Tokens s)
pScanP ml st f = ParsecT $ \(State input (pos:|z) tp w) cok _ eok _ ->
let pxy = Proxy :: Proxy s
(ts, input') = scan_ f st input
!npos = advanceN pxy w pos ts
len = chunkLength pxy ts
hs =
case ml >>= NE.nonEmpty of
Nothing -> mempty
Just l -> (Hints . pure . E.singleton . Label) l
in if chunkEmpty pxy ts
then eok ts (State input' (npos:|z) (tp + len) w) hs
else cok ts (State input' (npos:|z) (tp + len) w) hs
{-# INLINE pScanP #-}

pTakeWhile1P :: forall e s m. Stream s
=> Maybe String
-> (Token s -> Bool)
Expand Down Expand Up @@ -1149,6 +1183,7 @@ instance MonadParsec e s m => MonadParsec e s (L.StateT st m) where
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1171,6 +1206,7 @@ instance MonadParsec e s m => MonadParsec e s (S.StateT st m) where
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1190,6 +1226,7 @@ instance MonadParsec e s m => MonadParsec e s (L.ReaderT r m) where
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1212,6 +1249,7 @@ instance (Monoid w, MonadParsec e s m) => MonadParsec e s (L.WriterT w m) where
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1234,6 +1272,7 @@ instance (Monoid w, MonadParsec e s m) => MonadParsec e s (S.WriterT w m) where
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1260,6 +1299,7 @@ instance (Monoid w, MonadParsec e s m) => MonadParsec e s (L.RWST r w st m) wher
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1286,6 +1326,7 @@ instance (Monoid w, MonadParsec e s m) => MonadParsec e s (S.RWST r w st m) wher
tokens e ts = lift (tokens e ts)
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift (updateParserState f)
Expand All @@ -1305,6 +1346,7 @@ instance MonadParsec e s m => MonadParsec e s (IdentityT m) where
tokens e ts = lift $ tokens e ts
takeWhileP l f = lift (takeWhileP l f)
takeWhile1P l f = lift (takeWhile1P l f)
scanP l st f = lift (scanP l st f)
takeP l n = lift (takeP l n)
getParserState = lift getParserState
updateParserState f = lift $ updateParserState f
Expand Down
134 changes: 126 additions & 8 deletions Text/Megaparsec/Stream.hs
Expand Up @@ -14,10 +14,13 @@
--
-- @since 6.0.0

{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DefaultSignatures #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}

module Text.Megaparsec.Stream
( Stream (..) )
Expand All @@ -27,14 +30,25 @@ import Data.List (foldl')
import Data.Proxy
import Data.Semigroup ((<>))
import Data.Word (Word8)
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Ptr (minusPtr, plusPtr)
import Foreign.Storable (Storable(peek))
import Text.Megaparsec.Pos
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import System.IO.Unsafe
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Internal as BLI
import qualified Data.Text as T
import qualified Data.Text.Array as TI
import qualified Data.Text.Internal as TI
import qualified Data.Text.Internal.Unsafe.Char as TI
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Internal.Lazy as TLI

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
import Data.Monoid (Monoid(mempty))
#endif

-- | Type class for inputs that can be consumed by the library.
Expand Down Expand Up @@ -166,6 +180,28 @@ class (Ord (Token s), Ord (Tokens s)) => Stream s where

takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s)

-- | Efficiently scan through the stream with a stateful predicate,
-- taking tokens while the predicate returns @'Just' st@. Return the
-- chunk and the rest of the stream.
--
-- For many types of streams, the method allows for significant
-- performance improvements, although it is not strictly necessary from
-- conceptual point of view.

scan_ :: (st -> Token s -> Maybe st) -> st -> s -> (Tokens s, s)

default scan_ :: (Monoid s) => (st -> Token s -> Maybe st) -> st -> s -> (Tokens s, s)
scan_ p state str = scan' state str []
where
scan' st s toks = case take1_ s of
Just (tok, rest) ->
case p st tok of
Just st' -> scan' st' rest (tok:toks)
Nothing -> (tokensToChunk proxy (reverse toks), rest)
Nothing -> (tokensToChunk proxy (reverse toks), mempty)
where
proxy = Proxy :: Proxy s

instance Stream String where
type Token String = Char
type Tokens String = String
Expand All @@ -183,6 +219,12 @@ instance Stream String where
| null s = Nothing
| otherwise = Just (splitAt n s)
takeWhile_ = span
scan_ p st str = scan_' st ([], str)
where
scan_' _ (toks, rest@[]) = (reverse toks, rest)
scan_' state (toks, rest@(char:rest')) = case p state char of
Just st' -> scan_' st' (char:toks, rest')
Nothing -> (reverse toks, rest)

instance Stream B.ByteString where
type Token B.ByteString = Word8
Expand All @@ -200,6 +242,24 @@ instance Stream B.ByteString where
| B.null s = Nothing
| otherwise = Just (B.splitAt n s)
takeWhile_ = B.span
-- This is not gonna be pretty.
scan_ p state bs = unsafePerformIO (evilScan bs)
where
evilScan (BI.PS fp off len) = withForeignPtr fp $ \ptr0 -> do
let start = ptr0 `plusPtr` off
end = start `plusPtr` len
scan' !st ptr
| ptr < end = do
char <- peek ptr
case p st char of
Just st' -> scan' st' (ptr `plusPtr` 1)
Nothing -> return (ptr `minusPtr` start)
| otherwise = return len
matched <- scan' state start
let match = BI.PS fp off matched
rest = BI.PS fp (off + matched) (len - matched)

return (match, rest)

instance Stream BL.ByteString where
type Token BL.ByteString = Word8
Expand All @@ -217,6 +277,34 @@ instance Stream BL.ByteString where
| BL.null s = Nothing
| otherwise = Just (BL.splitAt (fromIntegral n) s)
takeWhile_ = BL.span
scan_ p state0 bytes = unsafePerformIO (evilScanL bytes state0 BL.empty)
where
evilScanL BLI.Empty _ existing = return (existing, BL.empty)
evilScanL (BLI.Chunk bs next) state existing = do
(match, rest, done) <- evilScanS bs state
let existing' = BL.append existing (BLI.chunk match BLI.Empty)

case done of
-- If done == Nothing, then evilScanS stopped because it failed to match, so we're done.
Nothing -> return (existing', BLI.chunk rest BLI.Empty)
-- Otherwise, we need to keep going with the next chunk of the lazy string.
Just state' -> evilScanL next state' existing'

evilScanS (BI.PS fp off len) state = withForeignPtr fp $ \ptr0 -> do
let start = ptr0 `plusPtr` off
end = start `plusPtr` len
scan' !st ptr
| ptr < end = do
char <- peek ptr
case p st char of
Just st' -> scan' st' (ptr `plusPtr` 1)
Nothing -> return (ptr `minusPtr` start, Nothing)
| otherwise = return (len, Just st)
(matched, done) <- scan' state start
let match = BI.PS fp off matched
rest = BI.PS fp (off + matched) (len - matched)

return (match, rest, done)

instance Stream T.Text where
type Token T.Text = Char
Expand All @@ -234,6 +322,15 @@ instance Stream T.Text where
| T.null s = Nothing
| otherwise = Just (T.splitAt n s)
takeWhile_ = T.span
scan_ p state orig@(TI.Text array offset len) = uglyScan state offset
where
-- Mucking around with secret internals is merely *ugly*, as opposed to the unalloyed evil of unsafePerformIO
uglyScan st off
| off < len =
case p st (TI.unsafeChr (TI.unsafeIndex array (offset + off))) of
Just st' -> uglyScan st' (off + 1)
Nothing -> (TI.Text array offset off, TI.Text array (offset + off) (len - off))
| otherwise = (orig, T.empty)

instance Stream TL.Text where
type Token TL.Text = Char
Expand All @@ -251,6 +348,27 @@ instance Stream TL.Text where
| TL.null s = Nothing
| otherwise = Just (TL.splitAt (fromIntegral n) s)
takeWhile_ = TL.span
scan_ p state0 text = uglyScanL text state0 TL.empty
where
uglyScanL TLI.Empty _ existing = (existing, TL.empty)
uglyScanL (TLI.Chunk txt next) state existing =
case done of
-- If done == Nothing, then uglyScanS stopped because it failed to match, so we're done.
Nothing -> (existing', TLI.chunk rest TLI.Empty)
-- Otherwise, we need to keep going with the next chunk of the lazy string.
Just state' -> uglyScanL next state' existing'
where
(match, rest, done) = uglyScanS txt state
existing' = TL.append existing (TLI.chunk match TLI.Empty)

uglyScanS orig@(TI.Text array offset len) state = scan' state offset
where
scan' st off
| off < len =
case p st (TI.unsafeChr (TI.unsafeIndex array (offset + off))) of
Just st' -> scan' st' (off + 1)
Nothing -> (TI.Text array offset off, TI.Text array (offset + off) (len - off), Nothing)
| otherwise = (orig, T.empty, Just st)

----------------------------------------------------------------------------
-- Helpers
Expand Down
15 changes: 15 additions & 0 deletions tests/Text/Megaparsec/StreamSpec.hs
Expand Up @@ -90,6 +90,9 @@ spec = do
it "extracts a chunk that is a prefix consisting of matching tokens" $
property $ \s ->
takeWhile_ isLetter s === span isLetter s
describe "scan_" $
it "extracts a chunk that is a prefix consisting of filtered tokens" $
scan_ (\st c -> if (st /= '\\' && c == '!') then Nothing else Just c) '\0' ("aaaa\\!aaaa!" :: String) === ("aaaa\\!aaaa", "!")

describe "ByteString instance of Stream" $ do
describe "tokenToChunk" $
Expand Down Expand Up @@ -165,6 +168,9 @@ spec = do
property $ \s ->
let f = isLetter . chr . fromIntegral
in takeWhile_ f s === B.span f s
describe "scan_" $
it "extracts a chunk that is a prefix consisting of filtered tokens" $
scan_ (\st c -> if (st /= 0x5c && c == 0x21) then Nothing else Just c) 0 ("aaaa\\!aaaa!" :: B.ByteString) === ("aaaa\\!aaaa", "!")

describe "Lazy ByteString instance of Stream" $ do
describe "tokenToChunk" $
Expand Down Expand Up @@ -240,6 +246,9 @@ spec = do
property $ \s ->
let f = isLetter . chr . fromIntegral
in takeWhile_ f s === BL.span f s
describe "scan_" $
it "extracts a chunk that is a prefix consisting of filtered tokens" $
scan_ (\st c -> if (st /= 0x5c && c == 0x21) then Nothing else Just c) 0 ("aaaa\\!aaaa!" :: BL.ByteString) === ("aaaa\\!aaaa", "!")

describe "Text instance of Stream" $ do
describe "tokenToChunk" $
Expand Down Expand Up @@ -314,6 +323,9 @@ spec = do
it "extracts a chunk that is a prefix consisting of matching tokens" $
property $ \s ->
takeWhile_ isLetter s === T.span isLetter s
describe "scan_" $
it "extracts a chunk that is a prefix consisting of filtered tokens" $
scan_ (\st c -> if (st /= '\\' && c == '!') then Nothing else Just c) '\0' ("aaaa\\!aaaa!" :: T.Text) === ("aaaa\\!aaaa", "!")

describe "Lazy Text instance of Stream" $ do
describe "tokenToChunk" $
Expand Down Expand Up @@ -388,6 +400,9 @@ spec = do
it "extracts a chunk that is a prefix consisting of matching tokens" $
property $ \s ->
takeWhile_ isLetter s === TL.span isLetter s
describe "scan_" $
it "extracts a chunk that is a prefix consisting of filtered tokens" $
scan_ (\st c -> if (st /= '\\' && c == '!') then Nothing else Just c) '\0' ("aaaa\\!aaaa!" :: TL.Text) === ("aaaa\\!aaaa", "!")

----------------------------------------------------------------------------
-- Helpers
Expand Down