feat: add \u{unicode} sequences for characters and fix \uABCD ones

madlib-lang · Apr 15, 2024 · d382cf3 · d382cf3
1 parent 69b907a
commit d382cf3
Show file tree

Hide file tree

Showing 11 changed files with 73 additions and 26 deletions.
diff --git a/compiler/main/Canonicalize/Canonicalize.hs b/compiler/main/Canonicalize/Canonicalize.hs
@@ -105,11 +105,22 @@ instance Canonicalizable Src.Exp Can.Exp where
       return $ Can.Canonical area (Can.LInt x)
 
     Src.LFloat x ->
-      return $ Can.Canonical area (Can.LFloat x)
+      if "_f" `List.isSuffixOf` x then
+        return $ Can.Canonical area (Can.LFloat (init $ init x))
+      else
+        return $ Can.Canonical area (Can.LFloat x)
 
     Src.LStr  x ->
       return $ Can.Canonical area (Can.LStr x)
 
+    Src.LChar ('\\':'u':'{':chars) -> do
+      let char' = head $ fst $ last $ charParser ("\\" ++ show (read ('0':'x':init chars) :: Int))
+      return $ Can.Canonical area (Can.LChar char')
+
+    Src.LChar ('\\':'u':chars) -> do
+      let char' = head $ fst $ last $ charParser ("\\x" ++ chars)
+      return $ Can.Canonical area (Can.LChar char')
+
     Src.LChar char -> do
       let char' = head $ fst $ last $ charParser char
       return $ Can.Canonical area (Can.LChar char')

diff --git a/compiler/main/Explain/Format.hs b/compiler/main/Explain/Format.hs
@@ -616,7 +616,7 @@ createSimpleErrorDiagnostic color _ typeError = case typeError of
 
   BadEscapeSequence ->
     "This escape sequence is not valid\n\n"
-    <> "Hint: Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD"
+    <> "Hint: Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD or \\u{ABCDEF} up to 10FFFF"
 
   EmptyChar ->
     "Empty char\n\n"
@@ -1710,7 +1710,7 @@ createErrorDiagnostic color context typeError = case typeError of
             )
           ]
           [
-            Diagnose.Hint "Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD"
+            Diagnose.Hint "Valid escape sequences are either a byte: \\xAB or a unicode: \\uABCD or \\u{ABCDEF} up to 10FFFF"
           ]
 
       NoContext ->

diff --git a/compiler/main/Generate/Javascript.hs b/compiler/main/Generate/Javascript.hs
@@ -178,7 +178,7 @@ instance Compilable Exp where
           "`" <> escapeStringLiteral v <> "`"
 
         Literal (LChar v) ->
-          "String.fromCharCode(" <> (show . fromEnum) v <> ")"
+          "String.fromCodePoint(" <> (show . fromEnum) v <> ")"
 
         Literal (LBool v) ->
           v
@@ -611,7 +611,7 @@ instance Compilable Exp where
               scope <> " === " <> n
 
             PChar n ->
-              scope <> " === " <> "String.fromCharCode(" <> (show . fromEnum) n <> ")"
+              scope <> " === " <> "String.fromCodePoint(" <> (show . fromEnum) n <> ")"
 
             PBool n ->
               scope <> " === " <> n

diff --git a/compiler/main/Parse/Madlib/Grammar.y b/compiler/main/Parse/Madlib/Grammar.y
@@ -673,7 +673,7 @@ literal :: { Src.Exp }
   | byte    %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LByte $ init $ init (strV $1)) }
   | short   %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LShort $ init $ init (strV $1)) }
   | int     %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LInt $ init $ init (strV $1)) }
-  | float   %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LFloat $ if "_f" `List.isSuffixOf` (strV $1) then init (init (strV $1)) else strV $1) }
+  | float   %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LFloat $ strV $1) }
   | str     %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LStr $ strV $1) }
   | char    %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LChar $ charData $1) }
   | true    %shift { Src.Source (tokenArea $1) (tokenTarget $1) (Src.LBool $ strV $1) }

diff --git a/compiler/main/Parse/Madlib/Lexer.x b/compiler/main/Parse/Madlib/Lexer.x
@@ -167,8 +167,8 @@ tokens :-
   <0, stringTemplateMadlib> $head*\<\=                                                                  { mapToken (\_ -> TokenLeftChevronEq) }
   <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \!                                            { mapToken (\_ -> TokenExclamationMark) }
   <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \"(($printable # \")|\\\")*\"                 { mapToken (\s -> TokenStr (sanitizeStr s)) }
-  <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \' ($printable # [\'\\] | " " | \\. | \\x | \\x. | \\x.. | \\u | \\u. | \\u.. | \\u... | \\u.... | \') \'  { mapCharToken }
-  <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \'\'                                                                                                       { mapCharToken }
+  <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \' ($printable # [\'\\] | " " | \\. | \\x | \\x.{1,2} | \\u\{.{1,6}\} | \\u | \\u.{1,4} | \') \'        { mapCharToken }
+  <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed> \'\'                                                                                                    { mapCharToken }
   <0, jsxOpeningTag> \#\- ([$alpha $digit \" \_ \' \` \$ \ \% \+ \- \* \. \, \( \) \; \: \{ \} \[ \] \! \? \| \& \n \= \< \> \\ \/\^]|\\\#)* \-\#
     { mapToken (\s -> TokenJSBlock (sanitizeJSBlock s)) }
   <0, stringTemplateMadlib, jsxOpeningTag, jsxAutoClosed, instanceHeader> $empty+                       ;
@@ -530,22 +530,23 @@ charParser = ReadP.readP_to_S $ ReadP.many $ ReadP.readS_to_P Char.readLitChar
 mapCharToken :: AlexInput -> Int -> Alex Token
 mapCharToken inputData@(posn@((AlexPn _ l1 c1)), prevChar, pending, input) len = do
   let src = take len input
-  when (src == "''") (alexError (printf "EmptyChar\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + 2)))
-  charData <- processHexaEscapes posn src
-  let parsed = fst $ last $ charParser charData
-      -- 1 because we need the character between ' and '
-      charData' = parsed !! 1
-      token = TokenChar (init $ tail src)
-
-  if length parsed == 3 then do
-    sourceTarget <- getCurrentSourceTarget
-    return $ Token (makeArea posn src) sourceTarget token
-    -- if "\\x" `List.isPrefixOf` src || "\\u" `List.isPrefixOf` src then
-    -- else 
-    --   return $ Token (makeArea posn src) sourceTarget token
+  isFormatter <- getIsFormatter
+  sourceTarget <- getCurrentSourceTarget
+  if isFormatter then
+    return $ Token (makeArea posn src) sourceTarget (TokenChar (init $ tail src))
   else do
-    let Area (Loc a l c) _ = makeArea posn src
-    alexError (printf "%d\n%d\nSyntax error - line: %d, column: %d\nThe following token is not valid: %s" l c l c (show token))
+    when (src == "''") (alexError (printf "EmptyChar\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + 2)))
+    charData <- processHexaEscapes posn src
+    let parsed = fst $ last $ charParser charData
+        -- 1 because we need the character between ' and '
+        charData' = parsed !! 1
+        token = TokenChar (init $ tail src)
+
+    if length parsed == 3 then do
+      return $ Token (makeArea posn src) sourceTarget token
+    else do
+      let Area (Loc a l c) _ = makeArea posn src
+      alexError (printf "%d\n%d\nSyntax error - line: %d, column: %d\nThe following token is not valid: %s" l c l c (show token))
 
 
 interpretChars :: [Char] -> [Char]
@@ -565,6 +566,23 @@ processHexaEscapes (AlexPn a l c) input = do
   if isFormatter then
     return input
   else case input of
+    '\\':'u':'{':more -> do
+      let hexa = takeWhile (/= '}') more
+      let hexaLength = length hexa
+      if hexaLength < 1 || hexaLength > 6 then do
+        let Area (Loc _ l1 c1) _ = makeArea (AlexPn a l c) ""
+        alexError (printf "BadEscape\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + hexaLength + 4))
+      else case interpretChars ("\\" ++ show (read ('0':'x':hexa) :: Int)) of
+        [] -> do
+          -- TODO: the range is not correctly interpreted yet, we possibly need to
+          -- update the error type and the parsing of it in AST.hs
+          let Area (Loc _ l1 c1) _ = makeArea (AlexPn a l c) ""
+          alexError (printf "BadEscape\n%d\n%d\n%d\n%d" l1 c1 l1 (c1 + hexaLength + 4))
+
+        chars -> do
+          next <- processHexaEscapes (AlexPn (a + 6) l (c + 6)) (tail $ dropWhile (/= '}') more)
+          return $ chars ++ next
+
     '\\':'u':a1:b1:c1:d1:more -> do
       next <- processHexaEscapes (AlexPn (a + 6) l (c + 6)) more
 

diff --git a/compiler/test/Blackbox/RunnerSpec.hs b/compiler/test/Blackbox/RunnerSpec.hs
@@ -148,6 +148,7 @@ spec = do
         , "compiler/test/Blackbox/test-cases/recursion-from-closure"
         , "compiler/test/Blackbox/test-cases/record-instances"
         , "compiler/test/Blackbox/test-cases/record-instance-not-found"
+        , "compiler/test/Blackbox/test-cases/string-and-char-literals"
         , "compiler/test/Blackbox/test-cases/derive-comparable"
         , "compiler/test/Blackbox/test-cases/number-inference-error"
         , "compiler/test/Blackbox/test-cases/while"

diff --git a/compiler/test/Blackbox/test-cases/string-and-char-literals/Entrypoint.mad b/compiler/test/Blackbox/test-cases/string-and-char-literals/Entrypoint.mad
@@ -0,0 +1,9 @@
+import IO from "IO"
+
+main = () => {
+  IO.putLine(`\u{1f210}`)
+  IO.putLine("\u{1f210}")
+  IO.trace("char", '\u{1f210}')
+
+  IO.trace("max", '\u{10ffff}')
+}
diff --git a/compiler/test/Blackbox/test-cases/string-and-char-literals/expected-js b/compiler/test/Blackbox/test-cases/string-and-char-literals/expected-js
@@ -0,0 +1,4 @@
+🈐
+🈐
+char '🈐'
+max '\1114111'
diff --git a/compiler/test/Blackbox/test-cases/string-and-char-literals/expected-llvm b/compiler/test/Blackbox/test-cases/string-and-char-literals/expected-llvm
@@ -0,0 +1,4 @@
+🈐
+🈐
+char '🈐'
+max '\1114111'
diff --git a/prelude/__internal__/Char.mad b/prelude/__internal__/Char.mad
@@ -5,10 +5,10 @@ import String from "String"
 #iftarget js
 
 toShort :: Char -> Short
-export toShort = (c) => #- c.charCodeAt(0) -#
+export toShort = (c) => #- c.codePointAt(0) -#
 
 fromShort :: Short -> Char
-export fromShort = (s) => #- String.fromCharCode(s) -#
+export fromShort = (s) => #- String.fromCodePoint(s) -#
 
 #elseif llvm
 

diff --git a/prelude/__internal__/Show.mad b/prelude/__internal__/Show.mad
@@ -497,4 +497,4 @@ cShow :: Show a => a -> String
 export cShow = (a) => pipe(
   show,
   printMadlibValue(80, true),
-)(a)
+)(a)
-Original file line number
+Diff line change
@@ Expand Up / @@ -497,4 +497,4 @@ cShow :: Show a => a -> String @@
     export cShow = (a) => pipe(
       show,
       printMadlibValue(80, true),
-    )(a)
+    )(a)