openqasm · levbishop · Jun 7, 2023 · Jun 21, 2023 · jakelishman · Jul 21, 2023
diff --git a/source/grammar/qasm3Lexer.g4 b/source/grammar/qasm3Lexer.g4
@@ -130,12 +130,11 @@ OctalIntegerLiteral: '0o' ([0-7] '_'?)* [0-7];
 DecimalIntegerLiteral: ([0-9] '_'?)* [0-9];
 HexIntegerLiteral: ('0x' | '0X') ([0-9a-fA-F] '_'?)* [0-9a-fA-F];
 
-fragment ValidUnicode: [\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]; // valid unicode chars
-fragment Letter: [A-Za-z];
-fragment FirstIdCharacter: '_' | ValidUnicode | Letter;
-fragment GeneralIdCharacter: FirstIdCharacter | [0-9];
-
+// Identifiers comply with UAX31-R1-2
+fragment FirstIdCharacter: [\p{XID_Start}_];
+fragment GeneralIdCharacter: [\p{XID_Continue}];
 Identifier: FirstIdCharacter GeneralIdCharacter*;
+
 HardwareQubit: '$' [0-9]+;
 
 fragment FloatLiteralExponent: [eE] (PLUS | MINUS)? DecimalIntegerLiteral;
@@ -147,45 +146,54 @@ FloatLiteral:
     // 123.456, 123. or 145.32e+1_00
     | DecimalIntegerLiteral DOT DecimalIntegerLiteral? FloatLiteralExponent?;
 
-fragment TimeUnit: 'dt' | 'ns' | 'us' | 'µs' | 'ms' | 's';
+fragment TimeUnit: 'dt' | 'ns' | 'us' | '\u{00B5}s' | '\u{03BC}s' | 'ms' | 's';
 // represents explicit time value in SI or backend units
 TimingLiteral: (DecimalIntegerLiteral | FloatLiteral) TimeUnit;
 
 
 BitstringLiteral: '"' ([01] '_'?)* [01] '"';
 // allow ``"str"`` and ``'str'``
 StringLiteral
-    : '"' ~["\r\t\n]+? '"'
-    | '\'' ~['\r\t\n]+? '\''
+    : '"' NotUAX31NewlineDoubleQuote+? '"'
+    | '\'' NotUAX31NewlineSingleQuote+? '\''
     ;
 
+// UAX31 Whitespace handling
+fragment UAX31Newline: [\u000A\u000B\u000C\u000D\u0085\u2028\u2029];
+fragment NotUAX31Newline: ~[\u000A\u000B\u000C\u000D\u0085\u2028\u2029];
+fragment NotUAX31NewlineSingleQuote: ~['\u000A\u000B\u000C\u000D\u0085\u2028\u2029];
+fragment NotUAX31NewlineDoubleQuote: ~["\u000A\u000B\u000C\u000D\u0085\u2028\u2029];
+fragment UAX31HorizontalSpace: [ \t\u200E\u200F]; // Pattern_White_Space minus newlines
+fragment NotUAX31WhiteSpace: ~[\p{Pattern_White_Space}];
+fragment UAX31IgnorableSpace: [\u200E\u200F];  // Pattern_White_Space AND Default_Ignorable_Code_Point
+
 // Ignore whitespace between tokens, and define C++-style comments.
-Whitespace: [ \t]+ -> skip ;
-Newline: [\r\n]+ -> skip ;
-LineComment : '//' ~[\r\n]* -> skip;
+HorizontalSpace: UAX31HorizontalSpace+ -> skip;
+Newline: UAX31Newline+ -> skip;
+LineComment : '//' NotUAX31Newline* -> skip;
 BlockComment : '/*' .*? '*/' -> skip;
 
 
 // The version identifier token would be ambiguous between itself and
 // integer/floating-point literals, so we use a special mode to ensure it's
 // lexed correctly.
 mode VERSION_IDENTIFIER;
-    VERSION_IDENTIFER_WHITESPACE: [ \t\r\n]+ -> skip;
+    VERSION_IDENTIFER_WHITESPACE: (UAX31HorizontalSpace|UAX31Newline)+ -> skip;
     VersionSpecifier: [0-9]+ ('.' [0-9]+)? -> popMode;
 
 
 // A different lexer mode to swap to when we need handle tokens on a line basis
 // rather than the default arbitrary-whitespace-based tokenisation.  This is
 // used by the annotation and pragma rules.
 mode EAT_TO_LINE_END;
-    EAT_INITIAL_SPACE: [ \t]+ -> skip;
-    EAT_LINE_END: [\r\n] -> popMode, skip;
+    EAT_INITIAL_SPACE: UAX31HorizontalSpace+ -> skip;
+    EAT_LINE_END: UAX31Newline -> popMode, skip;
 
     // The line content must be a non-empty token to satisfy ANTLR (otherwise it
     // would be able to produce an infinite number of tokens).  We could include
     // the line ending to guarantee that this is always non-empty, but that just
     // puts an annoying burden on consumers to remove it again.
-    RemainingLineContent: ~[ \t\r\n] ~[\r\n]*;
+    RemainingLineContent: NotUAX31WhiteSpace NotUAX31Newline*;
 
 
 // We need to do a little context-aware lexing when we hit a `cal` or `defcal`
@@ -195,12 +203,12 @@ mode EAT_TO_LINE_END;
 // manage the state of the lexer, so instead we need to do a little duplication
 // of the tokens, because ANTLR doesn't allow us to inherit rules directly.
 mode CAL_PRELUDE;
-    CAL_PRELUDE_WHITESPACE: [ \t\r\n]+ -> skip;
+    CAL_PRELUDE_WHITESPACE: (UAX31HorizontalSpace|UAX31Newline)+ -> skip;
     CAL_PRELUDE_COMMENT: (LineComment | BlockComment) -> skip;
     CAL_PRELUDE_LBRACE: LBRACE -> type(LBRACE), mode(CAL_BLOCK);
 
 mode DEFCAL_PRELUDE;
-    DEFCAL_PRELUDE_WHITESPACE: [ \t\r\n]+ -> skip;
+    DEFCAL_PRELUDE_WHITESPACE: (UAX31HorizontalSpace|UAX31Newline)+ -> skip;
     DEFCAL_PRELUDE_COMMENT: (LineComment | BlockComment) -> skip;
     DEFCAL_PRELUDE_LBRACE: LBRACE -> type(LBRACE), mode(CAL_BLOCK);
 

diff --git a/source/grammar/tests/reference/declaration/declaration.yaml b/source/grammar/tests/reference/declaration/declaration.yaml
@@ -21,6 +21,9 @@ source: |
   float[32] f = .1e+3;
   duration dur = 1000dt;
   duration dur2 = dur + 200ns;
+  duration dur3a = 1µs;  // unicode MICRO SIGN
+  duration dur3b = 1μs;  // unicode MU
+  duration dur3c = 1us;
   stretch s;
 reference: |
   program
@@ -290,6 +293,36 @@ reference: |
             expression
               200ns
         ;
+    statement
+      classicalDeclarationStatement
+        scalarType
+          duration
+        dur3a
+        =
+        declarationExpression
+          expression
+            1µs
+        ;
+    statement
+      classicalDeclarationStatement
+        scalarType
+          duration
+        dur3b
+        =
+        declarationExpression
+          expression
+            1μs
+        ;
+    statement
+      classicalDeclarationStatement
+        scalarType
+          duration
+        dur3c
+        =
+        declarationExpression
+          expression
+            1us
+        ;
     statement
       classicalDeclarationStatement
         scalarType

diff --git a/source/language/types.rst b/source/language/types.rst
@@ -10,10 +10,33 @@ Types and Casting
 Identifiers
 -----------
 
-Identifiers must begin with a letter [A-Za-z], an underscore or an element from
-the Unicode character categories Lu/Ll/Lt/Lm/Lo/Nl :cite:`wikipediaUnicode`.
-The set of permissible continuation characters consists of all members of the
-aforementioned character sets with the addition of decimal numerals [0-9].
+Roughly, OpenQASM identifiers start with an alphabetic character or underscore and continue with alphanumeric characters and underscores.
+A precise statement of the Unicode compatibility is:
+
+- `UAX31-C1 <https://www.unicode.org/reports/tr31/tr31-37.html#C1>`_: The OpenQASM language conforms to version 37 of the Unicode® Standard Annex #⁠31
+- `UAX31-C2 <https://www.unicode.org/reports/tr31/tr31-37.html#C2>`_: It observes the following requirements:
+   - `UAX31-R1-2 <https://www.unicode.org/reports/tr31/tr31-37.html#R1-2>`_: Default Identifiers: To determine whether a string is an identifier it uses `UAX31-D1 <https://www.unicode.org/reports/tr31/tr31-37.html#D1>`_ with the following profile:
+      - ``Start := [[:XID_Start:]_]``
+      - ``Continue := [:XID_Continue:]``
+      - ``Medial := []``
+   - `UAX31-R1b <https://www.unicode.org/reports/tr31/tr31-37.html#R1b>`_: Stable Identifiers: Once a string qualifies as an identifier, it does so in all future versions.
+   - `UAX31-R4 <https://www.unicode.org/reports/tr31/tr31-37.html#R4>`_: Equivalent Normalized Identifiers using normalization form C (NFC).
+
+Additionally, to avoid line-break spoofing, we comply with the proposed
+
+- `UAX31-R3a-1`. Use ``Pattern_White_Space`` characters as all and only the set of characters interpreted as whitespace in parsing, as follows:
+   - A sequence of one or more of any of the following characters shall be interpreted as a sequence of one or more end of line:
+      -  ``U+000A`` (line feed)
+      -  ``U+000B`` (vertical tabulation)
+      -  ``U+000C`` (form feed)
+      -  ``U+000D`` (carriage return)
+      -  ``U+0085`` (next line)
+      -  ``U+2028`` LINE SEPARATOR
+      -  ``U+2029`` PARAGRAPH SEPARATOR
+   - The ``Pattern_White_Space`` characters with the property ``Default_Ignorable_Code_Point`` shall be treated as ignorable format controls
+   - All other characters in ``Pattern_White_Space`` shall be interpreted as horizontal space.
+
+
 Identifiers may not override a reserved identifier.
 
 .. _variables:
@@ -361,10 +384,10 @@ type).  All scalar literals are ``const`` types.
 .. code-block::
 
    // Valid statements
-   
+
    const uint SIZE = 32;  // Declares a compile-time unsigned integer.
 
-   qubit[SIZE] q1;  // Declares a 32-qubit register called `q1`. 
+   qubit[SIZE] q1;  // Declares a 32-qubit register called `q1`.
    int[SIZE] i1;    // Declares a signed integer called `i1` with 32 bits.
 
 
@@ -487,15 +510,16 @@ single-Unicode-character identifier.
 
    .. table:: [tab:real-constants] Built-in real constants in OpenQASM3 of type ``float[64]``.
 
-      +-------------------------------+--------+--------------+---------------------+
-      | Constant                      | ASCII  | Unicode      | Approximate Base 10 |
-      +===============================+========+==============+=====================+
-      | :math:`\pi`                   | pi     | π            | 3.1415926535...     |
-      +-------------------------------+--------+--------------+---------------------+
-      | :math:`\tau = 2\pi`           | tau    | τ            | 6.283185...         |
-      +-------------------------------+--------+--------------+---------------------+
-      | Euler’s number :math:`e`      | euler  | ℇ            | 2.7182818284...     |
-      +-------------------------------+--------+--------------+---------------------+
+      +-------------------------------+--------+----------------------------------+---------------------+
+      | Constant                      | ASCII  | Unicode                          | Approximate Base 10 |
+      +===============================+========+==================================+=====================+
+      | :math:`\pi`                   | pi     | - π U+03C0 GREEK SMALL LETTER PI | 3.1415926535...     |
+      |                               |        |                                  |                     |
+      +-------------------------------+--------+----------------------------------+---------------------+
+      | :math:`\tau = 2\pi`           | tau    | - τ U+03C4 GREEK SMALL LETTER TAU| 6.283185...         |
+      +-------------------------------+--------+----------------------------------+---------------------+
+      | Euler’s number :math:`e`      | euler  | - ℇ U+2107 EULER CONSTANT        | 2.7182818284...     |
+      +-------------------------------+--------+----------------------------------+---------------------+
 
 
 .. _const-expression-functions:
@@ -909,8 +933,8 @@ should be explicitly declared and assigned the concatenation.
    subroutine_call(first ++ third) // forbidden
    subroutine_call(selfConcat) // allowed
 
-Arrays can be sliced just like quantum registers using a range ``a:b:c`` 
-and can be indexed using an integer but cannot be indexed by a a comma-separated 
+Arrays can be sliced just like quantum registers using a range ``a:b:c``
+and can be indexed using an integer but cannot be indexed by a comma-separated
 list of integers contained in braces ``{a,b,c,…}``. Slicing uses
 the subscript operator ``[]``, but produces an array (or reference in the case
 of assignment) with the same number of dimensions as the given identifier.