Skip to content

Commit

Permalink
feat: add \u{unicode} sequences for characters and fix \uABCD ones
Browse files Browse the repository at this point in the history
  • Loading branch information
aboeglin committed Apr 15, 2024
1 parent 69b907a commit d382cf3
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 26 deletions.
13 changes: 12 additions & 1 deletion compiler/main/Canonicalize/Canonicalize.hs
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,22 @@ instance Canonicalizable Src.Exp Can.Exp where
return $ Can.Canonical area (Can.LInt x)

Src.LFloat x ->
return $ Can.Canonical area (Can.LFloat x)
if "_f" `List.isSuffixOf` x then
return $ Can.Canonical area (Can.LFloat (init $ init x))
else
return $ Can.Canonical area (Can.LFloat x)

Src.LStr x ->
return $ Can.Canonical area (Can.LStr x)

Src.LChar ('\\':'u':'{':chars) -> do
let char' = head $ fst $ last $ charParser ("\\" ++ show (read ('0':'x':init chars) :: Int))
return $ Can.Canonical area (Can.LChar char')

Src.LChar ('\\':'u':chars) -> do
let char' = head $ fst $ last $ charParser ("\\x" ++ chars)
return $ Can.Canonical area (Can.LChar char')

Src.LChar char -> do
let char' = head $ fst $ last $ charParser char
return $ Can.Canonical area (Can.LChar char')
Expand Down
4 changes: 2 additions & 2 deletions compiler/main/Explain/Format.hs
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ createSimpleErrorDiagnostic color _ typeError = case typeError of

BadEscapeSequence ->
"This escape sequence is not valid\n\n"
<> "Hint: Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD"
<> "Hint: Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD or \\u{ABCDEF} up to 10FFFF"

EmptyChar ->
"Empty char\n\n"
Expand Down Expand Up @@ -1710,7 +1710,7 @@ createErrorDiagnostic color context typeError = case typeError of
)
]
[
Diagnose.Hint "Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD"
Diagnose.Hint "Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD or \\u{ABCDEF} up to 10FFFF"
]

NoContext ->
Expand Down
4 changes: 2 additions & 2 deletions compiler/main/Generate/Javascript.hs
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ instance Compilable Exp where
"`" <> escapeStringLiteral v <> "`"

Literal (LChar v) ->
"String.fromCharCode(" <> (show . fromEnum) v <> ")"
"String.fromCodePoint(" <> (show . fromEnum) v <> ")"

Literal (LBool v) ->
v
Expand Down Expand Up @@ -611,7 +611,7 @@ instance Compilable Exp where
scope <> " === " <> n

PChar n ->
scope <> " === " <> "String.fromCharCode(" <> (show . fromEnum) n <> ")"
scope <> " === " <> "String.fromCodePoint(" <> (show . fromEnum) n <> ")"

PBool n ->
scope <> " === " <> n
Expand Down
2 changes: 1 addition & 1 deletion compiler/main/Parse/Madlib/Grammar.y
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ literal :: { Src.Exp }
| byte %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LByte $ init $ init (strV $1)) }
| short %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LShort $ init $ init (strV $1)) }
| int %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LInt $ init $ init (strV $1)) }
| float %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LFloat $ if "_f" `List.isSuffixOf` (strV $1) then init (init (strV $1)) else strV $1) }
| float %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LFloat $ strV $1) }
| str %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LStr $ strV $1) }
| char %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LChar $ charData $1) }
| true %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LBool $ strV $1) }
Expand Down
52 changes: 35 additions & 17 deletions compiler/main/Parse/Madlib/Lexer.x
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ tokens :-
<0, stringTemplateMadlib> $head*\<\= { mapToken (\_ -> TokenLeftChevronEq) }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \! { mapToken (\_ -> TokenExclamationMark) }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \"(($printable # \")|\\\")*\" { mapToken (\s -> TokenStr (sanitizeStr s)) }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \' ($printable # [\'\\] | " " | \\. | \\x | \\x. | \\x.. | \\u | \\u. | \\u.. | \\u... | \\u.... | \') \' { mapCharToken }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \'\' { mapCharToken }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \' ($printable # [\'\\] | " " | \\. | \\x | \\x.{1,2} | \\u\{.{1,6}\} | \\u | \\u.{1,4} | \') \' { mapCharToken }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \'\' { mapCharToken }
<0, jsxOpeningTag> \#\- ([$alpha $digit \" \_ \' \` \$ \ \% \+ \- \* \. \, \( \) \; \: \{ \} \[ \] \! \? \| \& \n \= \< \> \\ \/\^]|\\\#)* \-\#
{ mapToken (\s -> TokenJSBlock (sanitizeJSBlock s)) }
<0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed, instanceHeader> $empty+ ;
Expand Down Expand Up @@ -530,22 +530,23 @@ charParser = ReadP.readP_to_S $ ReadP.many $ ReadP.readS_to_P Char.readLitChar
mapCharToken :: AlexInput -> Int -> Alex Token
mapCharToken inputData@(posn@((AlexPn _ l1 c1)), prevChar, pending, input) len = do
let src = take len input
when (src == "''") (alexError (printf "EmptyChar\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + 2)))
charData <- processHexaEscapes posn src
let parsed = fst $ last $ charParser charData
-- 1 because we need the character between ' and '
charData' = parsed !! 1
token = TokenChar (init $ tail src)

if length parsed == 3 then do
sourceTarget <- getCurrentSourceTarget
return $ Token (makeArea posn src) sourceTarget token
-- if "\\x" `List.isPrefixOf` src || "\\u" `List.isPrefixOf` src then
-- else
-- return $ Token (makeArea posn src) sourceTarget token
isFormatter <- getIsFormatter
sourceTarget <- getCurrentSourceTarget
if isFormatter then
return $ Token (makeArea posn src) sourceTarget (TokenChar (init $ tail src))
else do
let Area (Loc a l c) _ = makeArea posn src
alexError (printf "%d\n%d\nSyntax error - line: %d, column: %d\nThe following token is not valid: %s" l c l c (show token))
when (src == "''") (alexError (printf "EmptyChar\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + 2)))
charData <- processHexaEscapes posn src
let parsed = fst $ last $ charParser charData
-- 1 because we need the character between ' and '
charData' = parsed !! 1
token = TokenChar (init $ tail src)

if length parsed == 3 then do
return $ Token (makeArea posn src) sourceTarget token
else do
let Area (Loc a l c) _ = makeArea posn src
alexError (printf "%d\n%d\nSyntax error - line: %d, column: %d\nThe following token is not valid: %s" l c l c (show token))


interpretChars :: [Char] -> [Char]
Expand All @@ -565,6 +566,23 @@ processHexaEscapes (AlexPn a l c) input = do
if isFormatter then
return input
else case input of
'\\':'u':'{':more -> do
let hexa = takeWhile (/= '}') more
let hexaLength = length hexa
if hexaLength < 1 || hexaLength > 6 then do
let Area (Loc _ l1 c1) _ = makeArea (AlexPn a l c) ""
alexError (printf "BadEscape\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + hexaLength + 4))
else case interpretChars ("\\" ++ show (read ('0':'x':hexa) :: Int)) of
[] -> do
-- TODO: the range is not correctly interpreted yet, we possibly need to
-- update the error type and the parsing of it in AST.hs
let Area (Loc _ l1 c1) _ = makeArea (AlexPn a l c) ""
alexError (printf "BadEscape\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + hexaLength + 4))

chars -> do
next <- processHexaEscapes (AlexPn (a + 6) l (c + 6)) (tail $ dropWhile (/= '}') more)
return $ chars ++ next

'\\':'u':a1:b1:c1:d1:more -> do
next <- processHexaEscapes (AlexPn (a + 6) l (c + 6)) more

Expand Down
1 change: 1 addition & 0 deletions compiler/test/Blackbox/RunnerSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ spec = do
, "compiler/test/Blackbox/test-cases/recursion-from-closure"
, "compiler/test/Blackbox/test-cases/record-instances"
, "compiler/test/Blackbox/test-cases/record-instance-not-found"
, "compiler/test/Blackbox/test-cases/string-and-char-literals"
, "compiler/test/Blackbox/test-cases/derive-comparable"
, "compiler/test/Blackbox/test-cases/number-inference-error"
, "compiler/test/Blackbox/test-cases/while"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import IO from "IO"

main = () => {
IO.putLine(`\u{1f210}`)
IO.putLine("\u{1f210}")
IO.trace("char", '\u{1f210}')

IO.trace("max", '\u{10ffff}')
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
🈐
🈐
char '🈐'
max '\1114111'
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
🈐
🈐
char '🈐'
max '\1114111'
4 changes: 2 additions & 2 deletions prelude/__internal__/Char.mad
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ import String from "String"
#iftarget js

toShort :: Char -> Short
export toShort = (c) => #- c.charCodeAt(0) -#
export toShort = (c) => #- c.codePointAt(0) -#

fromShort :: Short -> Char
export fromShort = (s) => #- String.fromCharCode(s) -#
export fromShort = (s) => #- String.fromCodePoint(s) -#

#elseif llvm

Expand Down
2 changes: 1 addition & 1 deletion prelude/__internal__/Show.mad
Original file line number Diff line number Diff line change
Expand Up @@ -497,4 +497,4 @@ cShow :: Show a => a -> String
export cShow = (a) => pipe(
show,
printMadlibValue(80, true),
)(a)
)(a)

0 comments on commit d382cf3

Please sign in to comment.