Skip to content
Permalink
Browse files

add value-class-pattern date normalization

  • Loading branch information...
myfreeweb committed Aug 28, 2015
1 parent f9f9843 commit 28dd7a9a8136330eba540bf0cd1509135a4e09ad
@@ -6,6 +6,7 @@ Originally created for [sweetroll].

- parses `items`, `rels`, `rel-urls`
- resolves relative URLs (with support for the `<base>` tag)
- parses the [value-class-pattern](http://microformats.org/wiki/value-class-pattern), including date and time normalization
- handles malformed HTML (the actual HTML parser is [tagstream-conduit])
- high performance
- extensively tested
@@ -0,0 +1,159 @@
{-# OPTIONS_GHC -fno-warn-unused-do-bind #-}
{-# LANGUAGE OverloadedStrings, UnicodeSyntax, CPP, TypeFamilies #-}

module Data.Microformats2.Parser.Date where

#if !MIN_VERSION_base(4,8,0)
import Prelude hiding (sequence)
import Data.Traversable
#endif
import Control.Applicative
import Control.Monad
import Control.Error.Util (hush)
import Text.Printf
import Data.Maybe
import Data.Foldable
import Data.Attoparsec.Text
import qualified Data.Time.Calendar as C
import qualified Data.Time.Calendar.OrdinalDate as O
import qualified Data.Text as T

data Date = Date Int Int Int
instance Show Date where
show (Date y m d) = printf "%d-%02d-%02d" y m d

data HourType = TwentyFourHour | AMHour | PMHour
data Time = Time Int Int Int
instance Show Time where
show (Time h m s) = printf "%02d:%02d:%02d" h m s

data DateTime = DateTime Date Time
instance Show DateTime where
show (DateTime d t) = show d ++ "T" ++ show t

data ZoneType = Plus | Minus
data Zone = Zone ZoneType Int Int
instance Show Zone where
show (Zone Plus h m) = printf "+%02d:%02d" h m
show (Zone Minus h m) = printf "-%02d:%02d" h m

data TimeZone = TimeZone Time Zone
instance Show TimeZone where
show (TimeZone t z) = show t ++ show z

data DateTimeZone = DateTimeZone DateTime Zone
instance Show DateTimeZone where
show (DateTimeZone dt z) = show dt ++ show z

data DTPart = DatePart Date | TimePart Time | ZonePart Zone | TimeZonePart TimeZone | DateTimePart DateTime | DateTimeZonePart DateTimeZone
instance Show DTPart where
show (DatePart d) = show d
show (TimePart t) = show t
show (ZonePart z) = show z
show (TimeZonePart tz) = show tz
show (DateTimePart dt) = show dt
show (DateTimeZonePart dtz) = show dtz

isDatePart, isTimePart, isZonePart, isTimeZonePart, isDateTimePart, isDateTimeZonePart DTPart Bool
isDatePart (DatePart _) = True
isDatePart _ = False
isTimePart (TimePart _) = True
isTimePart _ = False
isZonePart (ZonePart _) = True
isZonePart _ = False
isTimeZonePart (TimeZonePart _) = True
isTimeZonePart _ = False
isDateTimePart (DateTimePart _) = True
isDateTimePart _ = False
isDateTimeZonePart (DateTimeZonePart _) = True
isDateTimeZonePart _ = False

parseDate Parser Date
parseDate = parseDate'
where parseDate' = do
year read <$> count 4 digit
char '-'
parseMMDD year <|> parseDDD year
parseMMDD year = do
mm read <$> count 2 digit
char '-'
dd read <$> count 2 digit
return $ Date year mm dd
parseDDD year = do
ddd read <$> count 3 digit
let (_, mm, dd) = C.toGregorian $ O.fromOrdinalDate (fromIntegral year) ddd
return $ Date year mm dd

parseHourType Parser HourType
parseHourType =
((char 'a' <|> char 'A') >> option '.' (char '.') >> (char 'm' <|> char 'M') >> option '.' (char '.') >> return AMHour)
<|> ((char 'p' <|> char 'P') >> option '.' (char '.') >> (char 'm' <|> char 'M') >> option '.' (char '.') >> return PMHour)

parseTime Parser Time
parseTime = do
hrs read <$> count 2 digit
mins option 0 $ char ':' >> read <$> count 2 digit
secs option 0 $ char ':' >> read <$> count 2 digit
htyp option TwentyFourHour $ parseHourType
let hrs' = case (hrs, htyp) of
(12, AMHour) 00
(x, PMHour) | x < 12 x + 12
(x, _) x
return $ Time hrs' mins secs

parseZone Parser Zone
parseZone = (char 'Z' >> return (Zone Plus 0 0)) <|> parseZone'
where parseZone' = do
htyp (char '+' >> return Plus) <|> (char '-' >> return Minus)
hrs read <$> count 2 digit
mins option 0 $ option ':' (char ':') >> read <$> count 2 digit
return $ Zone htyp hrs mins

parseTimeZone Parser TimeZone
parseTimeZone = do
t parseTime
z parseZone
return $ TimeZone t z

parseDateTime Parser DateTime
parseDateTime = do
d parseDate
option 'T' $ char 'T' <|> char ' '
t parseTime
return $ DateTime d t

parseDateTimeZone Parser DateTimeZone
parseDateTimeZone = do
dt parseDateTime
z parseZone
return $ DateTimeZone dt z

parseDTPart Parser DTPart
parseDTPart =
(liftM DateTimeZonePart parseDateTimeZone)
<|> (liftM DateTimePart parseDateTime)
<|> (liftM DatePart parseDate)
<|> (liftM TimeZonePart parseTimeZone)
<|> (liftM TimePart parseTime)
<|> (liftM ZonePart parseZone)

parseDTParts (Traversable φ, Monoid (φ DTPart)) φ T.Text φ DTPart
parseDTParts = fromMaybe mempty . sequence . fmap (hush . parseOnly parseDTPart)

normalizeDTParts (Foldable φ) φ DTPart Maybe DTPart
normalizeDTParts ps = asum [ find isDateTimeZonePart ps, findDateTime, findDateAndTime, find isDatePart ps, find isTimeZonePart ps, find isTimePart ps ]
where findDateTime = do
(DateTimePart dt) find isDateTimePart ps
return $ case find isZonePart ps of
Just (ZonePart z) DateTimeZonePart $ DateTimeZone dt z
_ DateTimePart dt
findDateAndTime = do
(DatePart d) find isDatePart ps
case find isTimeZonePart ps of
Just (TimeZonePart (TimeZone t z)) return $ DateTimeZonePart $ DateTimeZone (DateTime d t) z
_ findTime d
findTime d = do
(TimePart t) find isTimePart ps
return $ case find isZonePart ps of
Just (ZonePart z) DateTimeZonePart $ DateTimeZone (DateTime d t) z
_ DateTimePart $ DateTime d t
@@ -13,10 +13,11 @@ import Data.Foldable (asum)
import qualified Data.Map as M
import Data.Maybe
import Text.XML.Lens hiding (re)
import Data.Microformats2.Parser.Date (normalizeDTParts, parseDTParts)
import Data.Microformats2.Parser.HtmlUtil
import Data.Microformats2.Parser.Util

unwrapName (Name, a) (Text, a)
unwrapName (Name, α) (Text, α)
unwrapName (Name n _ _, val) = (n, val)

classes Element [Text]
@@ -72,16 +73,23 @@ getOnlyOfTypeAAreaHref e = (^. attribute "href") =<< asum (getOnlyOfType <$>
extractValue e = asum $ [ getAbbrTitle, getDataInputValue, getImgAreaAlt, getInnerTextRaw ] <*> pure e
extractValueTitle e = if' (isJust $ e ^? hasClass "value-title") $ e ^. attribute "title"

extractValueClassPattern [Element Maybe Text] Element Maybe Text
extractValueClassPattern [Element Maybe Text] Element Maybe [Text]
extractValueClassPattern fs e = if' (isJust $ e ^? valueParts) extractValueParts
where extractValueParts = Just . T.concat . catMaybes $ e ^.. valueParts . to extractValuePart
where extractValueParts = Just . catMaybes $ e ^.. valueParts . to extractValuePart
extractValuePart e' = asum $ fs <*> pure e'
valueParts Applicative f => (Element f Element) Element f Element
valueParts Applicative φ => (Element φ Element) Element φ Element
valueParts = entire . hasOneClass ["value", "value-title"]

extractValueClassPatternConcat [Element Maybe Text] Element Maybe Text
extractValueClassPatternConcat fs e = T.concat <$> extractValueClassPattern fs e

extractValueClassPatternDate [Element Maybe Text] Element Maybe Text
extractValueClassPatternDate fs e = asum [ T.pack . show <$> (normalizeDTParts $ parseDTParts $ fromMaybe [] valueParts), T.concat <$> valueParts ]
where valueParts = extractValueClassPattern fs e

extractP Element Maybe Text
extractP e =
asum $ [ extractValueClassPattern [extractValueTitle, extractValue]
asum $ [ extractValueClassPatternConcat [extractValueTitle, extractValue]
, getAbbrTitle, getDataInputValue, getImgAreaAlt, getInnerTextWithImgs ] <*> pure e

extractU Element
@@ -90,15 +98,15 @@ extractU e =
asum $ [ (, True) <$> getAAreaHref e
, (, True) <$> getImgAudioVideoSourceSrc e
, (, True) <$> getObjectData e
, (, False) <$> extractValueClassPattern [extractValueTitle, extractValue] e
, (, False) <$> extractValueClassPatternConcat [extractValueTitle, extractValue] e
, (, False) <$> getAbbrTitle e
, (, False) <$> getDataInputValue e
, (, False) <$> getInnerTextRaw e ]

extractDt Element Maybe Text
extractDt e =
asum $ (extractValueClassPattern ms : ms ++ [getInnerTextRaw]) <*> pure e
where ms = [ getTimeInsDelDatetime, getAbbrTitle, getDataInputValue ]
asum $ (extractValueClassPatternDate ms : ms ++ [getInnerTextRaw]) <*> pure e
where ms = [ getTimeInsDelDatetime, extractValueTitle, extractValue ]

implyProperty String Element Maybe Text
implyProperty "name" e = asum $ [ getImgAreaAlt, getAbbrTitle
@@ -28,6 +28,7 @@ library
, time
, either
, safe
, errors
, containers
, unordered-containers
, vector
@@ -41,10 +42,12 @@ library
, blaze-markup
, xss-sanitize
, pcre-heavy
, attoparsec
default-language: Haskell2010
exposed-modules:
Data.Microformats2.Parser
Data.Microformats2.Parser.Property
Data.Microformats2.Parser.Date
Data.Microformats2.Parser.HtmlUtil
Data.Microformats2.Parser.Util
ghc-options: -Wall
@@ -58,6 +58,22 @@ spec = do
<time class="value" datetime="ti">TIME</time>
<ins class="value" datetime="me">lol</time>
</span>|] `shouldBe` pure "vcptime"
dt [xml|<span class="dt-updated">
<time class="value" datetime="05:55-0700">VCP</time>
<time class="value">2015-08-28</time>
</span>|] `shouldBe` pure "2015-08-28T05:55:00-07:00"
dt [xml|<span class="dt-updated">
<span class="value-title" title="Z"></span>
<time class="value" datetime="05:55"></time>
<time class="value">2015-08-28</time>
</span>|] `shouldBe` pure "2015-08-28T05:55:00+00:00"
dt [xml|<span class="dt-updated">
<span class="value-title" title="+01:00"></span>
<time class="value">2015-08-28</time>
</span>|] `shouldBe` pure "2015-08-28"
dt [xml|<span class="dt-updated">
<time class="value" datetime="2015-08-28 05:55:00+00"></time>
</span>|] `shouldBe` pure "2015-08-28T05:55:00+00:00"
dt [xml|<span class="dt-updated">date</span>|] `shouldBe` pure "date"

describe "implyProperty" $ do

0 comments on commit 28dd7a9

Please sign in to comment.
You can’t perform that action at this time.