Skip to content
Permalink
Browse files

update microformats2-types

  • Loading branch information...
myfreeweb committed Feb 22, 2015
1 parent 943d845 commit 3a4c422d7c472e7770c3b64c077adce57be426df
@@ -14,7 +14,7 @@ clean:
if test -d .hpc; then rm -r .hpc; fi

configure:
cabal configure --enable-benchmarks --enable-tests --enable-coverage -v2
cabal configure --enable-benchmarks --enable-tests --enable-library-coverage -v2

haddock:
cabal haddock --hyperlink-source
@@ -0,0 +1,16 @@
# microformats2-parser [![Hackage](https://img.shields.io/hackage/v/microformats2-parser.svg?style=flat)](https://hackage.haskell.org/package/microformats2-parser) [![ISC License](https://img.shields.io/badge/license-ISC-red.svg?style=flat)](https://tldrlegal.com/license/-isc-license)

[Microformats 2] parser for Haskell!

Originally created for [sweetroll] :-)

The types are located in a separate package called [microformats2-types].

[Microformats 2]: http://microformats.org/wiki/microformats2
[sweetroll]: https://github.com/myfreeweb/sweetroll
[microformats2-types]: https://github.com/myfreeweb/microformats2-types

## License

Copyright 2015 Greg V <greg@unrelenting.technology>
Available under the ISC license, see the `COPYING` file
@@ -65,8 +65,8 @@ getText e = Just . T.strip <$> T.concat $ e ^.. basicText
getAllText Element Maybe Text
getAllText e = Just . T.strip <$> T.concat $ e ^.. entire . nodes . traverse . _ContentWithAlts

getAllHtml Element Maybe Text
getAllHtml e = Just . T.strip <$> T.concat $ e ^.. nodes . traverse . _InnerHtml
getAllHtml Element [Text]
getAllHtml e = [T.strip <$> T.concat $ e ^.. nodes . traverse . _InnerHtml]

getAbbrTitle Element Maybe Text
getAbbrTitle e = e ^. el "abbr" . attribute "title"
@@ -120,32 +120,35 @@ extractValueClassPattern fs e = if' (isJust $ e ^? valueParts) $ extractValuePar
valueParts Applicative f => (Element f Element) Element f Element
valueParts = entire . hasOneClass ["value", "value-title"]

findProperty Element String Maybe Element
findProperty e n = e ^? entire . hasClass n
findProperty Element String [Element]
findProperty e n = e ^.. entire . hasClass n

data PropType = P | U | Dt | E

extractProperty PropType String Element Maybe Text
extractProperty P n e' = do
e findProperty e' $ "p-" ++ n
asum $ [ extractValueClassPattern [extractValueTitle, extractValue]
, extractValue ] <*> pure e
extractProperty U n e' = do
e findProperty e' $ "u-" ++ n
asum $ [ getAAreaHref, getImgAudioVideoSourceSrc
, extractValueClassPattern [extractValueTitle, extractValue]
, getAbbrTitle, getDataInputValue, getAllText ] <*> pure e
extractProperty Dt n e' = do
e findProperty e' $ "dt-" ++ n
let ms = [ getTimeInsDelDatetime, getAbbrTitle, getDataInputValue ]
asum $ (extractValueClassPattern ms) : ms ++ [getAllText] <*> pure e
extract [Element Maybe Text] Element [Text]
extract ps e = catMaybes $ [ \x -> asum $ ps <*> pure x ] <*> pure e

extractProperty PropType String Element [Text]
extractProperty P n e' =
findProperty e' ("p-" ++ n) >>=
extract [ extractValueClassPattern [extractValueTitle, extractValue]
, extractValue ]
extractProperty U n e' =
findProperty e' ("u-" ++ n) >>=
extract [ getAAreaHref, getImgAudioVideoSourceSrc
, extractValueClassPattern [extractValueTitle, extractValue]
, getAbbrTitle, getDataInputValue, getAllText ]
extractProperty Dt n e' =
findProperty e' ("dt-" ++ n) >>=
extract ((extractValueClassPattern ms) : ms ++ [getAllText])
where ms = [ getTimeInsDelDatetime, getAbbrTitle, getDataInputValue ]
extractProperty E n e' = findProperty e' ("e-" ++ n) >>= getAllHtml

extractPropertyL PropType String Element Maybe TL.Text
extractPropertyL t n e = return . TL.fromStrict =<< extractProperty t n e
extractPropertyL PropType String Element [TL.Text]
extractPropertyL t n e = TL.fromStrict <$> extractProperty t n e

extractPropertyR Read α PropType String Element Maybe α
extractPropertyR t n e = readMay =<< return . T.unpack =<< extractProperty t n e
extractPropertyR Read α PropType String Element [α]
extractPropertyR t n e = catMaybes $ readMay <$> T.unpack <$> extractProperty t n e

implyProperty PropType String Element Maybe Text
implyProperty P "name" e = asum $ [ getImgAreaAlt, getAbbrTitle
@@ -27,7 +27,7 @@ library
, either
, safe
, data-default
, microformats2-types == 0.3.*
, microformats2-types == 0.4.*
, html-conduit
, xml-lens
, blaze-markup
@@ -7,6 +7,7 @@ import TestCommon
import Text.HTML.DOM
import Text.XML.Lens (documentRoot)
import Data.Microformats2.Parser.Internal
import Control.Applicative

{-# ANN module ("HLint: ignore Redundant do"::String) #-}

@@ -15,79 +16,79 @@ spec = do
describe "extractProperty" $ do
it "parses p- properties" $ do
let nm = extractProperty P "name" . documentRoot . parseLBS
nm [xml|<span class="p-name">Hello Basic</span>|] `shouldBe` Just "Hello Basic"
nm [xml|<abbr class="p-name" title="Hello Abbr">HA</abbr>|] `shouldBe` Just "Hello Abbr"
nm [xml|<abbr class="p-name">HA</abbr>|] `shouldBe` Just "HA"
nm [xml|<data class="p-name" value="Hello Data" />|] `shouldBe` Just "Hello Data"
nm [xml|<input class="p-name" value="Hello Input" />|] `shouldBe` Just "Hello Input"
nm [xml|<img class="p-name" alt="Hello Img" />|] `shouldBe` Just "Hello Img"
nm [xml|<area class="p-name" alt="Hello Area" />|] `shouldBe` Just "Hello Area"
nm [xml|<span class="p-name"> ignore <i class="value">Hello</i> <img class="value" alt="ValuePattern" src="x.png"> </span>|] `shouldBe` Just "HelloValuePattern"
nm [xml|<span class="p-name"> ignore <em class="value-title" title="Hello">Hi</em> <span class="value">Value-Title</span></span>|] `shouldBe` Just "HelloValue-Title"
nm [xml|<span class="p-name"> Hello <img alt="Span With Img" src="x.png"> </span>|] `shouldBe` Just "Hello Span With Img"
nm [xml|<span class="p-name">Hello Basic</span>|] `shouldBe` pure "Hello Basic"
nm [xml|<abbr class="p-name" title="Hello Abbr">HA</abbr>|] `shouldBe` pure "Hello Abbr"
nm [xml|<abbr class="p-name">HA</abbr>|] `shouldBe` pure "HA"
nm [xml|<data class="p-name" value="Hello Data" />|] `shouldBe` pure "Hello Data"
nm [xml|<input class="p-name" value="Hello Input" />|] `shouldBe` pure "Hello Input"
nm [xml|<img class="p-name" alt="Hello Img" />|] `shouldBe` pure "Hello Img"
nm [xml|<area class="p-name" alt="Hello Area" />|] `shouldBe` pure "Hello Area"
nm [xml|<span class="p-name"> ignore <i class="value">Hello</i> <img class="value" alt="ValuePattern" src="x.png"> </span>|] `shouldBe` pure "HelloValuePattern"
nm [xml|<span class="p-name"> ignore <em class="value-title" title="Hello">Hi</em> <span class="value">Value-Title</span></span>|] `shouldBe` pure "HelloValue-Title"
nm [xml|<span class="p-name"> Hello <img alt="Span With Img" src="x.png"> </span>|] `shouldBe` pure "Hello Span With Img"
nm [xml|<span class="p-name"> <span class="value"> Hello
<img alt="Span With Img" src="x.png"> </span> <em class="value-title" title="&& Value Title">nope</em> </span>|] `shouldBe` Just "Hello Span With Img&& Value Title"
<img alt="Span With Img" src="x.png"> </span> <em class="value-title" title="&& Value Title">nope</em> </span>|] `shouldBe` pure "Hello Span With Img&& Value Title"

it "parses u- properties" $ do
let ur = extractProperty U "url" . documentRoot . parseLBS
ur [xml|<a class="u-url" href="/yo/a">link</a>|] `shouldBe` Just "/yo/a"
ur [xml|<area class="u-url" href="/yo/area"/>|] `shouldBe` Just "/yo/area"
ur [xml|<img class="u-url" src="/yo/img"/>|] `shouldBe` Just "/yo/img"
ur [xml|<audio class="u-url" src="/yo/audio"/>|] `shouldBe` Just "/yo/audio"
ur [xml|<video class="u-url" src="/yo/video"/>|] `shouldBe` Just "/yo/video"
ur [xml|<source class="u-url" src="/yo/source"/>|] `shouldBe` Just "/yo/source"
ur [xml|<span class="u-url"><b class=value>/yo</b><em class="value">/vcp</span>|] `shouldBe` Just "/yo/vcp"
ur [xml|<abbr class="u-url" title="/yo/abbr"/>|] `shouldBe` Just "/yo/abbr"
ur [xml|<data class="u-url" value="/yo/data"/>|] `shouldBe` Just "/yo/data"
ur [xml|<input class="u-url" value="/yo/input"/>|] `shouldBe` Just "/yo/input"
ur [xml|<span class="u-url">/yo/span</span>|] `shouldBe` Just "/yo/span"
ur [xml|<a class="u-url" href="/yo/a">link</a>|] `shouldBe` pure "/yo/a"
ur [xml|<area class="u-url" href="/yo/area"/>|] `shouldBe` pure "/yo/area"
ur [xml|<img class="u-url" src="/yo/img"/>|] `shouldBe` pure "/yo/img"
ur [xml|<audio class="u-url" src="/yo/audio"/>|] `shouldBe` pure "/yo/audio"
ur [xml|<video class="u-url" src="/yo/video"/>|] `shouldBe` pure "/yo/video"
ur [xml|<source class="u-url" src="/yo/source"/>|] `shouldBe` pure "/yo/source"
ur [xml|<span class="u-url"><b class=value>/yo</b><em class="value">/vcp</span>|] `shouldBe` pure "/yo/vcp"
ur [xml|<abbr class="u-url" title="/yo/abbr"/>|] `shouldBe` pure "/yo/abbr"
ur [xml|<data class="u-url" value="/yo/data"/>|] `shouldBe` pure "/yo/data"
ur [xml|<input class="u-url" value="/yo/input"/>|] `shouldBe` pure "/yo/input"
ur [xml|<span class="u-url">/yo/span</span>|] `shouldBe` pure "/yo/span"

it "parses u- properties" $ do
it "parses dt- properties" $ do
let dt = extractProperty Dt "updated" . documentRoot . parseLBS
dt [xml|<time class="dt-updated" datetime="ti.me">someday</time>|] `shouldBe` Just "ti.me"
dt [xml|<ins class="dt-updated" datetime="i.ns">someday</ins>|] `shouldBe` Just "i.ns"
dt [xml|<del class="dt-updated" datetime="d.el">someday</del>|] `shouldBe` Just "d.el"
dt [xml|<abbr class="dt-updated" title="ab.br">AB</abbr>|] `shouldBe` Just "ab.br"
dt [xml|<data class="dt-updated" value="da.ta"/>|] `shouldBe` Just "da.ta"
dt [xml|<input class="dt-updated" value="i.np.ut"/>|] `shouldBe` Just "i.np.ut"
dt [xml|<time class="dt-updated" datetime="ti.me">someday</time>|] `shouldBe` pure "ti.me"
dt [xml|<ins class="dt-updated" datetime="i.ns">someday</ins>|] `shouldBe` pure "i.ns"
dt [xml|<del class="dt-updated" datetime="d.el">someday</del>|] `shouldBe` pure "d.el"
dt [xml|<abbr class="dt-updated" title="ab.br">AB</abbr>|] `shouldBe` pure "ab.br"
dt [xml|<data class="dt-updated" value="da.ta"/>|] `shouldBe` pure "da.ta"
dt [xml|<input class="dt-updated" value="i.np.ut"/>|] `shouldBe` pure "i.np.ut"
dt [xml|<span class="dt-updated">
<abbr class="value" title="vcp">VCP</abbr>
<time class="value" datetime="ti">TIME</time>
<ins class="value" datetime="me">lol</time>
</span>|] `shouldBe` Just "vcptime"
dt [xml|<span class="dt-updated">date</span>|] `shouldBe` Just "date"
</span>|] `shouldBe` pure "vcptime"
dt [xml|<span class="dt-updated">date</span>|] `shouldBe` pure "date"

it "parses e- properties" $ do
let ct = extractProperty E "content" . documentRoot . parseLBS
ct [xml|<div class="e-content"><em>hello html</em>!</div>|] `shouldBe` Just "<em>hello html</em>!"
ct [xml|<div class="e-content"><em>hello html</em>!</div>|] `shouldBe` pure "<em>hello html</em>!"

describe "implyProperty" $ do
it "parses implied p-name" $ do
let nm = implyProperty P "name" . documentRoot . parseLBS
nm [xml|<img class="h-blah" alt="Hello Img!">|] `shouldBe` Just "Hello Img!"
nm [xml|<abbr class="h-blah" title="Hello Abbr!">HA</abbr>|] `shouldBe` Just "Hello Abbr!"
nm [xml|<p class="h-blah"><img alt="Hello Only Img!"></p>|] `shouldBe` Just "Hello Only Img!"
nm [xml|<p class="h-blah"><img alt="DOING"><img alt="IT"><img alt="WRONG">Goodbye Img!</p>|] `shouldBe` Just "Goodbye Img!"
nm [xml|<p class="h-blah"><abbr title="Hello Only Abbr!">HOA</abbr></p>|] `shouldBe` Just "Hello Only Abbr!"
nm [xml|<p class="h-blah"><abbr title="DOING"/><abbr title="IT"/><abbr title="WRONG"/>Goodbye Abbr!</p>|] `shouldBe` Just "Goodbye Abbr!"
nm [xml|<p class="h-blah"><em><img alt="Hello Only Nested Img!"></p>|] `shouldBe` Just "Hello Only Nested Img!"
nm [xml|<p class="h-blah"><em><img alt="DOING"><img alt="WRONG">Goodbye Nested Img!</p>|] `shouldBe` Just "Goodbye Nested Img!"
nm [xml|<p class="h-blah"><em><abbr title="Hello Only Nested Abbr!">HOA</abbr></p>|] `shouldBe` Just "Hello Only Nested Abbr!"
nm [xml|<p class="h-blah"><em><abbr title="DOING"/><abbr title="WRONG"/>Goodbye Nested Abbr!</p>|] `shouldBe` Just "Goodbye Nested Abbr!"
nm [xml|<p class="h-blah">Hello Text!</p>|] `shouldBe` Just "Hello Text!"
nm [xml|<img class="h-blah" alt="Hello Img!">|] `shouldBe` pure "Hello Img!"
nm [xml|<abbr class="h-blah" title="Hello Abbr!">HA</abbr>|] `shouldBe` pure "Hello Abbr!"
nm [xml|<p class="h-blah"><img alt="Hello Only Img!"></p>|] `shouldBe` pure "Hello Only Img!"
nm [xml|<p class="h-blah"><img alt="DOING"><img alt="IT"><img alt="WRONG">Goodbye Img!</p>|] `shouldBe` pure "Goodbye Img!"
nm [xml|<p class="h-blah"><abbr title="Hello Only Abbr!">HOA</abbr></p>|] `shouldBe` pure "Hello Only Abbr!"
nm [xml|<p class="h-blah"><abbr title="DOING"/><abbr title="IT"/><abbr title="WRONG"/>Goodbye Abbr!</p>|] `shouldBe` pure "Goodbye Abbr!"
nm [xml|<p class="h-blah"><em><img alt="Hello Only Nested Img!"></p>|] `shouldBe` pure "Hello Only Nested Img!"
nm [xml|<p class="h-blah"><em><img alt="DOING"><img alt="WRONG">Goodbye Nested Img!</p>|] `shouldBe` pure "Goodbye Nested Img!"
nm [xml|<p class="h-blah"><em><abbr title="Hello Only Nested Abbr!">HOA</abbr></p>|] `shouldBe` pure "Hello Only Nested Abbr!"
nm [xml|<p class="h-blah"><em><abbr title="DOING"/><abbr title="WRONG"/>Goodbye Nested Abbr!</p>|] `shouldBe` pure "Goodbye Nested Abbr!"
nm [xml|<p class="h-blah">Hello Text!</p>|] `shouldBe` pure "Hello Text!"

it "parses implied u-photo" $ do
let ph = implyProperty U "photo" . documentRoot . parseLBS
ph [xml|<img class="h-blah" src="selfie.png">|] `shouldBe` Just "selfie.png"
ph [xml|<object class="h-blah" data="art.svg">|] `shouldBe` Just "art.svg"
ph [xml|<div class="h-blah"><img src="selfie.png"/><em>yo</em>|] `shouldBe` Just "selfie.png"
ph [xml|<div class="h-blah"><object data="art.svg"/><em>yo</em>|] `shouldBe` Just "art.svg"
ph [xml|<div class="h-blah"><p class="onlychild"><img src="selfie.png"/><em>yo</em>|] `shouldBe` Just "selfie.png"
ph [xml|<div class="h-blah"><p class="onlychild"><object data="art.svg"/><em>yo</em>|] `shouldBe` Just "art.svg"
ph [xml|<img class="h-blah" src="selfie.png">|] `shouldBe` pure "selfie.png"
ph [xml|<object class="h-blah" data="art.svg">|] `shouldBe` pure "art.svg"
ph [xml|<div class="h-blah"><img src="selfie.png"/><em>yo</em>|] `shouldBe` pure "selfie.png"
ph [xml|<div class="h-blah"><object data="art.svg"/><em>yo</em>|] `shouldBe` pure "art.svg"
ph [xml|<div class="h-blah"><p class="onlychild"><img src="selfie.png"/><em>yo</em>|] `shouldBe` pure "selfie.png"
ph [xml|<div class="h-blah"><p class="onlychild"><object data="art.svg"/><em>yo</em>|] `shouldBe` pure "art.svg"

it "parses implied u-url" $ do
let ur = implyProperty U "url" . documentRoot . parseLBS
ur [xml|<a class="h-blah" href="/hello/a">|] `shouldBe` Just "/hello/a"
ur [xml|<area class="h-blah" href="/hello/area">|] `shouldBe` Just "/hello/area"
ur [xml|<div class="h-blah"><em>what</em><a href="/hello/n/a">|] `shouldBe` Just "/hello/n/a"
ur [xml|<div class="h-blah"><em>what</em><area href="/hello/n/area">|] `shouldBe` Just "/hello/n/area"
ur [xml|<a class="h-blah" href="/hello/a">|] `shouldBe` pure "/hello/a"
ur [xml|<area class="h-blah" href="/hello/area">|] `shouldBe` pure "/hello/area"
ur [xml|<div class="h-blah"><em>what</em><a href="/hello/n/a">|] `shouldBe` pure "/hello/n/a"
ur [xml|<div class="h-blah"><em>what</em><area href="/hello/n/area">|] `shouldBe` pure "/hello/n/area"
@@ -7,6 +7,7 @@ import TestCommon
import Data.Default
import Data.Microformats2
import Data.Microformats2.Parser
import Control.Applicative

{-# ANN module ("HLint: ignore Redundant do"::String) #-}

@@ -24,16 +25,17 @@ spec = do
<span class="p-altitude">1.2345</span>
</p>
<p class="h-geo">
<data class="p-latitude" value="1.2345">
<data class="p-latitude" value="123.45">
<input class="p-latitude" value="678.9">
</p>
</div>|] `shouldBe` [ def { geoLatitude = Just 37.33168, geoLongitude = Just (-122.03016), geoAltitude = Just 1.2345 }
, def { geoLatitude = Just 1.2345 } ]
</div>|] `shouldBe` [ def { geoLatitude = pure 37.33168, geoLongitude = pure (-122.03016), geoAltitude = pure 1.2345 }
, def { geoLatitude = [123.45, 678.9] } ]

it "ignores invalid properties" $ do
parseGeo' [xml|<p class="h-geo">
<span class="p-latitude">HELLO WORLD!!</span>
<span class="p-altitude">1.2345</span>
</p>|] `shouldBe` [ def { geoAltitude = Just 1.2345 } ]
</p>|] `shouldBe` [ def { geoAltitude = pure 1.2345 } ]

describe "parseAdr" $ do
let parseAdr' = parseAdr . documentRoot . parseLBS
@@ -50,8 +52,8 @@ spec = do
<span class="p-country-name">C</span>
<span class="p-label">LB</span>
</article>
</div>|] `shouldBe` [ def { adrStreetAddress = Just "SA", adrExtendedAddress = Just "EA"
, adrPostOfficeBox = Just "PO", adrLocality = Just "L"
, adrRegion = Just "R", adrPostalCode = Just "PC"
, adrCountryName = Just "C", adrLabel = Just "LB"
</div>|] `shouldBe` [ def { adrStreetAddress = pure "SA", adrExtendedAddress = pure "EA"
, adrPostOfficeBox = pure "PO", adrLocality = pure "L"
, adrRegion = pure "R", adrPostalCode = pure "PC"
, adrCountryName = pure "C", adrLabel = pure "LB"
} ]

0 comments on commit 3a4c422

Please sign in to comment.
You can’t perform that action at this time.