various
mlang committed Oct 10, 2018
1 parent 47c801d commit 608d819
Showing 4 changed files with 99 additions and 55 deletions.
10 changes: 5 additions & 5 deletions Makefile
@@ -1,18 +1,18 @@
-CHUNKSIZE=10M
+CHUNKSIZE=100M
 GIT=git
 STACK=stack
 WGET=wget
 WIKILANG=de

-wikiwc.$(WIKILANG): extracted
+docs/known.$(WIKILANG).txt docs/unknown.$(WIKILANG).txt docs/subseq.$(WIKILANG).txt: extracted.$(WIKILANG)
 	$(STACK) build
-	$(STACK) exec wikiwc -- 'extracted/*/wiki_*' +RTS -N >$@
+	$(STACK) exec wikiwc -- -w words.$(WIKILANG) -k docs/$(WIKILANG)/known.txt -u docs/$(WIKILANG)/unknown.txt -s docs/$(WIKILANG)/known-subseq.txt extracted.$(WIKILANG)/*/wiki_* +RTS -N

 clean:
 	rm wikiwc.$(WIKILANG) $(WIKILANG)wiki-latest-pages-articles.xml.bz2
-	rm -rf extracted wikiextractor
+	rm -rf extracted.* wikiextractor

-extracted: wikiextractor
+extracted.$(WIKILANG): $(WIKILANG)wiki-latest-pages-articles.xml.bz2 wikiextractor
 	$(PYTHON) wikiextractor/WikiExtractor.py -b$(CHUNKSIZE) -o $@ $(WIKILANG)wiki-latest-pages-articles.xml.bz2

 wikiextractor:
62 changes: 46 additions & 16 deletions app/Main.hs
@@ -1,29 +1,59 @@
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RecordWildCards #-}
 module Main where

 import Control.Concurrent.Async (mapConcurrently)
+import Control.Monad (foldM)
 import Data.Foldable (foldl')
 import Data.Text (Text)
-import qualified Data.Text as Text (lines, null, pack, toLower, words)
-import qualified Data.Text.IO as Text (putStr)
+import qualified Data.Text as Text
+import qualified Data.Text.IO as Text
 import System.FilePath.Glob (glob)
 import System.Environment (getArgs)
 import WordFreq
+import Options.Applicative

-main :: IO ()
-main = do
-  args <- getArgs
-  case args of
-    [glob] -> run glob
-    _ -> run "wiki_*"
+data Options = Options {
+    knownWordListFiles :: [FilePath]
+  , knownOutput :: Maybe FilePath
+  , unknownOutput :: Maybe FilePath
+  , subsequenceOutput :: Maybe FilePath
+  , wikiExtractorFiles :: [FilePath]
+  } deriving (Eq)

+programOptions :: Parser Options
+programOptions =
+  Options <$> many (strOption $ long "known-words" <> short 'w' <> metavar "FILE")
+          <*> optional (strOption $ long "known" <> short 'k' <> metavar "KNOWN-OUTPUT-FILE")
+          <*> optional (strOption $ long "unknown" <> short 'u' <> metavar "UNKNOWN-OUTPUT-FILE")
+          <*> optional (strOption $ long "subseq" <> short 's' <> metavar "OUTPUT-FILE")
+          <*> some (argument str (metavar "FILES..."))
+
-run match = do
-  words <- loadWordList "/usr/share/dict/ngerman"
-  map <- normalize . foldl' mappend mempty <$> (mapConcurrently loadWikiDump =<< glob match)
-  let known = filterWordFreq (knownWord words) (const True) map
-  let unknown = filterWordFreq (not . knownWord words) (> 1) map
-  Text.putStr $ foldWordFreq showWord (commonSequences known)
+wikiwc :: Options -> IO ()
+wikiwc Options{..} = do
+  words <- foldl' mappend mempty <$> mapConcurrently loadWordList knownWordListFiles
+  wc <- normalize <$> foldM loadWikiDump mempty wikiExtractorFiles
+  let known = filterWordFreq (knownWord words) (const True) wc
+  let unknown = filterWordFreq (not . knownWord words) (const True) wc
+  case knownOutput of
+    Just fp -> Text.writeFile fp $ toText . byFrequency $ known
+    Nothing -> pure ()
+  case unknownOutput of
+    Just fp -> Text.writeFile fp $ toText . byFrequency $ unknown
+    Nothing -> pure ()
+  case subsequenceOutput of
+    Just fp -> Text.writeFile fp $ toText . byFrequency $ commonSequences known
+    Nothing -> pure ()
+  pure ()
+
+main :: IO ()
+main = wikiwc =<< execParser opts where
+  opts = info (programOptions <**> helper) $
+    fullDesc
+    <> progDesc "Count words and substrings from Wikipedia dumps."
+    <> header "wikiwc - Wikipedia word counter"

-showWord :: Text -> Int -> Text
-showWord s i = Text.pack (show i) <> " " <> s <> "\n"
+toText :: [(Int, Text)] -> Text
+toText = foldMap f where
+  f (a, w) = Text.pack (show a) <> " " <> w <> "\n"
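
A minimal, self-contained sketch of how the option set introduced above can be exercised with optparse-applicative's pure parser, without touching any files. Opts and optsParser below are illustrative stand-ins for the Options record and programOptions parser from this diff (the commit's own Options only derives Eq), the sample arguments mirror the new Makefile invocation for WIKILANG=de, and extracted.de/AA/wiki_00 is just a hypothetical wikiextractor output path.

import Options.Applicative

-- Illustrative stand-in for the Options record added in app/Main.hs.
data Opts = Opts
  { optKnownWordLists :: [FilePath]     -- --known-words / -w, repeatable
  , optKnownOut       :: Maybe FilePath -- --known / -k
  , optUnknownOut     :: Maybe FilePath -- --unknown / -u
  , optSubseqOut      :: Maybe FilePath -- --subseq / -s
  , optInputs         :: [FilePath]     -- positional wikiextractor files
  } deriving Show

optsParser :: Parser Opts
optsParser = Opts
  <$> many (strOption (long "known-words" <> short 'w' <> metavar "FILE"))
  <*> optional (strOption (long "known" <> short 'k' <> metavar "KNOWN-OUTPUT-FILE"))
  <*> optional (strOption (long "unknown" <> short 'u' <> metavar "UNKNOWN-OUTPUT-FILE"))
  <*> optional (strOption (long "subseq" <> short 's' <> metavar "OUTPUT-FILE"))
  <*> some (argument str (metavar "FILES..."))

-- Parse the same flags the updated Makefile passes, purely, and show the result.
main :: IO ()
main = print . getParseResult $
  execParserPure defaultPrefs (info (optsParser <**> helper) fullDesc)
    [ "-w", "words.de"
    , "-k", "docs/de/known.txt"
    , "-u", "docs/de/unknown.txt"
    , "-s", "docs/de/known-subseq.txt"
    , "extracted.de/AA/wiki_00"
    ]

Running this prints Just (Opts {...}) with all five fields populated; the real program goes through execParser in main, as in the diff above.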
2 changes: 2 additions & 0 deletions package.yaml
@@ -21,6 +21,8 @@ dependencies:
 - containers
 - extra
 - Glob
+- optparse-applicative
+- parallel
 - text

 library:
80 changes: 46 additions & 34 deletions src/WordFreq.hs
@@ -1,44 +1,48 @@
-{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections #-}
 module WordFreq (
     loadWikiDump, WordFreq, filterWordFreq, foldWordFreq, addWord
-  , commonSequences, normalize
+  , commonSequences, normalize, byFrequency
   , loadWordList, knownWord
   ) where

 import Control.Applicative ((<|>))
 import Control.Monad (void)
 import Control.Monad.Extra (ifM)
+import Control.Parallel.Strategies
 import Data.Attoparsec.Text
 import Data.Char (isAlpha)
-import Data.Foldable (foldr)
-import Data.List (sortBy)
+import Data.Foldable (foldl')
+import Data.List (groupBy, sort, sortOn)
 import Data.Map.Strict (Map)
+import Data.IntMap.Strict (IntMap)
 import qualified Data.Map.Strict as Map
+import qualified Data.IntMap.Strict as IntMap
 import Data.Maybe (fromMaybe)
 import Data.Ord (Down(Down), comparing)
 import Data.Set (Set)
-import qualified Data.Set as Set (fromList, member)
+import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as Text
-import qualified Data.Text.IO as Text (readFile, hGetChunk)
-import System.IO (withFile, IOMode(ReadMode))
+import qualified Data.Text.IO as Text
+import GHC.Base (($!))
+import System.IO (withFile, IOMode(ReadMode))

-wikiDump :: Parser (WordFreq Text)
-wikiDump = loop mempty where
-  loop ws = ifM atEnd (pure ws) $ do
-      w <- dirt *> takeWhile1 isWordy <* dirt
-      loop $! addWord w ws
+wikiDump :: WordFreq Text -> Parser (WordFreq Text)
+wikiDump ws = ifM atEnd (pure ws) $ do
+    w <- dirt *> takeWhile1 isWordy <* dirt
+    wikiDump $! addWord w ws
    where
     dirt = many' $ startDoc <|> endDoc <|> notWordy
     startDoc = string "<doc " >> skipWhile (not . isEndOfLine) >> endOfLine
     endDoc = string "</doc>" >> endOfLine
     notWordy = void $ satisfy $ not . isWordy
     isWordy = (||) <$> isAlpha <*> (== '-')

-loadWikiDump :: FilePath -> IO (WordFreq Text)
-loadWikiDump fp = withFile fp ReadMode $ \h ->
-  fromMaybe mempty . maybeResult <$> parseWith (Text.hGetChunk h) wikiDump ""
+loadWikiDump :: WordFreq Text -> FilePath -> IO (WordFreq Text)
+loadWikiDump wc fp = withFile fp ReadMode $ \h ->
+  fromMaybe mempty . maybeResult <$> parseWith (Text.hGetChunk h) (wikiDump wc) ""

 newtype WordFreq a = WordFreq { unWordFreq :: Map a Int } deriving (Eq)
 instance Ord a => Semigroup (WordFreq a) where
@@ -54,31 +58,39 @@ addWord w = WordFreq . add . unWordFreq where

 normalize :: WordFreq Text -> WordFreq Text
 normalize = WordFreq
-  . Map.fromList . Map.elems . fmap merge . Map.foldrWithKey lc mempty
+  . Map.fromList . Map.elems . fmap merge . Map.foldrWithKey' lc mempty
   . unWordFreq where
   lc k a = Map.insertWith mappend (Text.toLower k) [(k, a)]
-  merge xs = let ((k, a):xs') = sortBy (comparing (Down . snd)) xs
-             in (k, a + sum (snd <$> xs'))
+  merge xs = let ((k, a):xs') = sortOn (Down . snd) xs
+                 !a' = a + sum (snd <$> xs')
+             in (k, a')

 commonSequences :: WordFreq Text -> WordFreq Text
 commonSequences = WordFreq
-  . clean . Map.foldrWithKey subseqs mempty
+  . deleteInfix . Map.foldrWithKey' subseqs mempty
   . unWordFreq where
-  subseqs k a m = let l = Text.length k
-                  in foldr (uncurry $ Map.insertWith (+)) m [
-                       (Text.toLower $ Text.take n $ Text.drop i k, a)
-                     | i <- [0 .. l - 1], n <- [1 .. l - i]
-                     ]
-  clean = Map.foldrWithKey' coinvert mempty
-        . fmap reduce
-        . Map.foldrWithKey invert mempty where
-    invert k a = Map.insertWith mappend a [k]
-    coinvert a ks m = foldr (\k m -> Map.insert k a m) m ks
-    reduce d = let xs = sortBy (comparing (Down . Text.length)) d
-               in foldr f xs xs where
-      f x xs = case break (== x) xs of
-        (_, []) -> xs
-        (ls, _:rs) -> ls ++ (x : filter (not . (`Text.isInfixOf` x)) rs)
+  subseqs :: Text -> Int -> Map Text Int -> Map Text Int
+  subseqs k a !m = let l = Text.length k
+                   in foldl' (flip $ uncurry $ Map.insertWith (+)) m [
+                        (Text.toLower $ Text.take n $ Text.drop i k, a)
+                      | i <- [0 .. l - 1], n <- [1 .. l - i]
+                      ]
+  deleteInfix m = Map.withoutKeys m $ mconcat $
+    withStrategy (parList rdeepseq) $ map search $
+    foldMap (breakDownBy Text.length) $
+    Map.foldrWithKey' invert mempty m where
+    invert k a = IntMap.insertWith mappend a [k]
+    search (l, r) = Set.fromList $ concatMap (\x -> filter (`Text.isInfixOf` x) r) l
+    breakDownBy by = go . sortOn (Down . by) where
+      go [] = []
+      go (x:xs) = let b = by x in case break ((b /=) . by) xs of
+        (_, []) -> []
+        (xs', ys) -> (x:xs', ys) : go ys
+
+byFrequency :: WordFreq Text -> [(Int, Text)]
+byFrequency = IntMap.foldlWithKey' g [] . Map.foldrWithKey' f mempty . unWordFreq where
+  f k a = IntMap.insertWith (const (k:)) a [k]
+  g xs a ks = map (a,) (sort ks) ++ xs

 loadWordList :: FilePath -> IO (Set Text)
 loadWordList = fmap (Set.fromList . map Text.toLower . Text.lines)
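
A minimal, self-contained sketch of the reporting path that byFrequency adds, written against the module's exported API. It assumes addWord bumps a word's count by one (its body is collapsed above) and relies on the WordFreq Monoid instance used elsewhere in this diff; the render helper mirrors toText from app/Main.hs.

{-# LANGUAGE OverloadedStrings #-}

import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.IO as Text
import WordFreq (WordFreq, addWord, byFrequency, normalize)

-- Mirrors toText in app/Main.hs: one "<count> <word>" pair per line.
render :: [(Int, Text)] -> Text
render = foldMap (\(a, w) -> Text.pack (show a) <> " " <> w <> "\n")

main :: IO ()
main = do
  -- Count a handful of words by hand instead of parsing a wikiextractor dump
  -- (assuming addWord increments the given word's count by one).
  let wf = foldr addWord (mempty :: WordFreq Text)
             ["die", "die", "Die", "Katze", "Katze", "Maus"]
  -- normalize folds case variants into their most frequent spelling and sums
  -- the counts; byFrequency sorts by descending count, so this should print:
  --   3 die
  --   2 Katze
  --   1 Maus
  Text.putStr . render . byFrequency . normalize $ wf

This is the same "<count> <word>" format that the new -k/-u/-s output files (for example docs/de/known.txt under the updated Makefile) end up containing.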
