Permalink
Browse files

various

  • Loading branch information...
mlang committed Oct 10, 2018
1 parent 47c801d commit 608d8195104e877e7790642c4e83b43f39ac46ae
Showing with 99 additions and 55 deletions.
  1. +5 −5 Makefile
  2. +46 −16 app/Main.hs
  3. +2 −0 package.yaml
  4. +46 −34 src/WordFreq.hs
View
@@ -1,18 +1,18 @@
CHUNKSIZE=10M
CHUNKSIZE=100M
GIT=git
STACK=stack
WGET=wget
WIKILANG=de
wikiwc.$(WIKILANG): extracted
docs/known.$(WIKILANG).txt docs/unknown.$(WIKILANG).txt) docs/subseq.$(WIKILANG).txt: extracted.$(WIKILANG)
$(STACK) build
$(STACK) exec wikiwc -- 'extracted/*/wiki_*' +RTS -N >$@
$(STACK) exec wikiwc -- -w words.$(WIKILANG) -k docs/$(WIKILANG)/known.txt -u docs/$(WIKILANG)/unknown.txt -s docs/$(WIKILANG)/known-subseq.txt extracted.$(WIKILANG)/*/wiki_* +RTS -N
clean:
rm wikiwc.$(WIKILANG) $(WIKILANG)wiki-latest-pages-articles.xml.bz2
rm -rf extracted wikiextractor
rm -rf extracted.* wikiextractor
extracted: wikiextractor
extracted.$(WIKILANG): $(WIKILANG)wiki-latest-pages-articles.xml.bz2 wikiextractor
$(PYTHON) wikiextractor/WikiExtractor.py -b$(CHUNKSIZE) -o $@ $(WIKILANG)wiki-latest-pages-articles.xml.bz2
wikiextractor:
View
@@ -1,29 +1,59 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Main where
import Control.Concurrent.Async (mapConcurrently)
import Control.Monad (foldM)
import Data.Foldable (foldl')
import Data.Text (Text)
import qualified Data.Text as Text (lines, null, pack, toLower, words)
import qualified Data.Text.IO as Text (putStr)
import qualified Data.Text as Text
import qualified Data.Text.IO as Text
import System.FilePath.Glob (glob)
import System.Environment (getArgs)
import WordFreq
import Options.Applicative
main :: IO ()
main = do
args <- getArgs
case args of
[glob] -> run glob
_ -> run "wiki_*"
data Options = Options {
knownWordListFiles :: [FilePath]
, knownOutput :: Maybe FilePath
, unknownOutput :: Maybe FilePath
, subsequenceOutput :: Maybe FilePath
, wikiExtractorFiles :: [FilePath]
} deriving (Eq)
programOptions :: Parser Options
programOptions =
Options <$> many (strOption $ long "known-words" <> short 'w' <> metavar "FILE")
<*> optional (strOption $ long "known" <> short 'k' <> metavar "KNOWN-OUTPUT-FILE")
<*> optional (strOption $ long "unknown" <> short 'u' <> metavar "UNKNOWN-OUTPUT-FILE")
<*> optional (strOption $ long "subseq" <> short 's' <> metavar "OUTPUT-FILE")
<*> some (argument str (metavar "FILES..."))
run match = do
words <- loadWordList "/usr/share/dict/ngerman"
map <- normalize . foldl' mappend mempty <$> (mapConcurrently loadWikiDump =<< glob match)
let known = filterWordFreq (knownWord words) (const True) map
let unknown = filterWordFreq (not . knownWord words) (> 1) map
Text.putStr $ foldWordFreq showWord (commonSequences known)
wikiwc :: Options -> IO ()
wikiwc Options{..} = do
words <- foldl' mappend mempty <$> mapConcurrently loadWordList knownWordListFiles
wc <- normalize <$> foldM loadWikiDump mempty wikiExtractorFiles
let known = filterWordFreq (knownWord words) (const True) wc
let unknown = filterWordFreq (not . knownWord words) (const True) wc
case knownOutput of
Just fp -> Text.writeFile fp $ toText . byFrequency $ known
Nothing -> pure ()
case unknownOutput of
Just fp -> Text.writeFile fp $ toText . byFrequency $ unknown
Nothing -> pure ()
case subsequenceOutput of
Just fp -> Text.writeFile fp $ toText . byFrequency $ commonSequences known
Nothing -> pure ()
pure ()
main :: IO ()
main = wikiwc =<< execParser opts where
opts = info (programOptions <**> helper) $
fullDesc
<> progDesc "Count words and substrings from Wikipedia dumps."
<> header "wikiwc - Wikipedia word counter"
showWord :: Text -> Int -> Text
showWord s i = Text.pack (show i) <> " " <> s <> "\n"
toText :: [(Int, Text)] -> Text
toText = foldMap f where
f (a, w) = Text.pack (show a) <> " " <> w <> "\n"
View
@@ -21,6 +21,8 @@ dependencies:
- containers
- extra
- Glob
- optparse-applicative
- parallel
- text
library:
View
@@ -1,44 +1,48 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module WordFreq (
loadWikiDump, WordFreq, filterWordFreq, foldWordFreq, addWord
, commonSequences, normalize
, commonSequences, normalize, byFrequency
, loadWordList, knownWord
) where
import Control.Applicative ((<|>))
import Control.Monad (void)
import Control.Monad.Extra (ifM)
import Control.Parallel.Strategies
import Data.Attoparsec.Text
import Data.Char (isAlpha)
import Data.Foldable (foldr)
import Data.List (sortBy)
import Data.Foldable (foldl')
import Data.List (groupBy, sort, sortOn)
import Data.Map.Strict (Map)
import Data.IntMap.Strict (IntMap)
import qualified Data.Map.Strict as Map
import qualified Data.IntMap.Strict as IntMap
import Data.Maybe (fromMaybe)
import Data.Ord (Down(Down), comparing)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member)
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.IO as Text (readFile, hGetChunk)
import System.IO (withFile, IOMode(ReadMode))
import qualified Data.Text.IO as Text
import GHC.Base (($!))
import System.IO (withFile, IOMode(ReadMode))
wikiDump :: Parser (WordFreq Text)
wikiDump = loop mempty where
loop ws = ifM atEnd (pure ws) $ do
w <- dirt *> takeWhile1 isWordy <* dirt
loop $! addWord w ws
wikiDump :: WordFreq Text -> Parser (WordFreq Text)
wikiDump ws = ifM atEnd (pure ws) $ do
w <- dirt *> takeWhile1 isWordy <* dirt
wikiDump $! addWord w ws
where
dirt = many' $ startDoc <|> endDoc <|> notWordy
startDoc = string "<doc " >> skipWhile (not . isEndOfLine) >> endOfLine
endDoc = string "</doc>" >> endOfLine
notWordy = void $ satisfy $ not . isWordy
isWordy = (||) <$> isAlpha <*> (== '-')
loadWikiDump :: FilePath -> IO (WordFreq Text)
loadWikiDump fp = withFile fp ReadMode $ \h ->
fromMaybe mempty . maybeResult <$> parseWith (Text.hGetChunk h) wikiDump ""
loadWikiDump :: WordFreq Text -> FilePath -> IO (WordFreq Text)
loadWikiDump wc fp = withFile fp ReadMode $ \h ->
fromMaybe mempty . maybeResult <$> parseWith (Text.hGetChunk h) (wikiDump wc) ""
newtype WordFreq a = WordFreq { unWordFreq :: Map a Int } deriving (Eq)
instance Ord a => Semigroup (WordFreq a) where
@@ -54,31 +58,39 @@ addWord w = WordFreq . add . unWordFreq where
normalize :: WordFreq Text -> WordFreq Text
normalize = WordFreq
. Map.fromList . Map.elems . fmap merge . Map.foldrWithKey lc mempty
. Map.fromList . Map.elems . fmap merge . Map.foldrWithKey' lc mempty
. unWordFreq where
lc k a = Map.insertWith mappend (Text.toLower k) [(k, a)]
merge xs = let ((k, a):xs') = sortBy (comparing (Down . snd)) xs
in (k, a + sum (snd <$> xs'))
merge xs = let ((k, a):xs') = sortOn (Down . snd) xs
!a' = a + sum (snd <$> xs')
in (k, a')
commonSequences :: WordFreq Text -> WordFreq Text
commonSequences = WordFreq
. clean . Map.foldrWithKey subseqs mempty
. deleteInfix . Map.foldrWithKey' subseqs mempty
. unWordFreq where
subseqs k a m = let l = Text.length k
in foldr (uncurry $ Map.insertWith (+)) m [
(Text.toLower $ Text.take n $ Text.drop i k, a)
| i <- [0 .. l - 1], n <- [1 .. l - i]
]
clean = Map.foldrWithKey' coinvert mempty
. fmap reduce
. Map.foldrWithKey invert mempty where
invert k a = Map.insertWith mappend a [k]
coinvert a ks m = foldr (\k m -> Map.insert k a m) m ks
reduce d = let xs = sortBy (comparing (Down . Text.length)) d
in foldr f xs xs where
f x xs = case break (== x) xs of
(_, []) -> xs
(ls, _:rs) -> ls ++ (x : filter (not . (`Text.isInfixOf` x)) rs)
subseqs :: Text -> Int -> Map Text Int -> Map Text Int
subseqs k a !m = let l = Text.length k
in foldl' (flip $ uncurry $ Map.insertWith (+)) m [
(Text.toLower $ Text.take n $ Text.drop i k, a)
| i <- [0 .. l - 1], n <- [1 .. l - i]
]
deleteInfix m = Map.withoutKeys m $ mconcat $
withStrategy (parList rdeepseq) $ map search $
foldMap (breakDownBy Text.length) $
Map.foldrWithKey' invert mempty m where
invert k a = IntMap.insertWith mappend a [k]
search (l, r) = Set.fromList $ concatMap (\x -> filter (`Text.isInfixOf` x) r) l
breakDownBy by = go . sortOn (Down . by) where
go [] = []
go (x:xs) = let b = by x in case break ((b /=) . by) xs of
(_, []) -> []
(xs', ys) -> (x:xs', ys) : go ys
byFrequency :: WordFreq Text -> [(Int, Text)]
byFrequency = IntMap.foldlWithKey' g [] . Map.foldrWithKey' f mempty . unWordFreq where
f k a = IntMap.insertWith (const (k:)) a [k]
g xs a ks = map (a,) (sort ks) ++ xs
loadWordList :: FilePath -> IO (Set Text)
loadWordList = fmap (Set.fromList . map Text.toLower . Text.lines)

0 comments on commit 608d819

Please sign in to comment.