Merge pull request #506 from kamil-tekiela/UtfString

Optimize offsetGet
phpmyadmin · Sep 16, 2023 · d57481d · d57481d
2 parents 6fd2c59 + be2ca97
commit d57481d
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 228 deletions.
diff --git a/src/Tools/CustomJsonSerializer.php b/src/Tools/CustomJsonSerializer.php
@@ -43,7 +43,6 @@ class CustomJsonSerializer extends JsonSerializer
         'viewOptions',
         'eventOptions',
         'userOptions',
-        'asciiMap',
     ];
 
     /**

diff --git a/src/UtfString.php b/src/UtfString.php
@@ -10,7 +10,10 @@
 
 use function mb_check_encoding;
 use function mb_strlen;
+use function mb_substr;
 use function ord;
+use function strlen;
+use function substr;
 
 /**
  * Implementation for UTF-8 strings.
@@ -68,119 +71,6 @@ class UtfString implements ArrayAccess, Stringable
      */
     public $charLen = 0;
 
-    /**
-     * A map of ASCII binary values to their ASCII code
-     * This is to improve performance and avoid calling ord($byte)
-     *
-     * Source: https://www.freecodecamp.org/news/ascii-table-hex-to-ascii-value-character-code-chart-2/
-     *
-     * @var array<int|string,int>
-     */
-    protected static $asciiMap = [
-        "\0" => 0, // (00000000) NUL Null
-        "\t" => 9, // (00001001) HT Horizontal Tab
-        "\n" => 10, // (00001010) LF Newline / Line Feed
-        "\v" => 11, // (00001011) VT Vertical Tab
-        "\f" => 12, // (00001100) FF Form Feed
-        "\r" => 13, // (00001101) CR Carriage Return
-        ' ' => 32, // (00100000) SP Space
-        '!' => 33, // (00100001) ! Exclamation mark
-        '"' => 34, // (00100010) " Double quote
-        '#' => 35, // (00100011) # Number
-        '$' => 36, // (00100100) $ Dollar
-        '%' => 37, // (00100101) % Percent
-        '&' => 38, // (00100110) & Ampersand
-        '\'' => 39, // (00100111) ' Single quote
-        '(' => 40, // (00101000) ( Left parenthesis
-        ')' => 41, // (00101001) ) Right parenthesis
-        '*' => 42, // (00101010) * Asterisk
-        '+' => 43, // (00101011) + Plus
-        ',' => 44, // (00101100) , Comma
-        '-' => 45, // (00101101) - Minus
-        '.' => 46, // (00101110) . Period
-        '/' => 47, // (00101111) / Slash
-        '0' => 48, // (00110000) 0 Zero
-        '1' => 49, // (00110001) 1 One
-        '2' => 50, // (00110010) 2 Two
-        '3' => 51, // (00110011) 3 Three
-        '4' => 52, // (00110100) 4 Four
-        '5' => 53, // (00110101) 5 Five
-        '6' => 54, // (00110110) 6 Six
-        '7' => 55, // (00110111) 7 Seven
-        '8' => 56, // (00111000) 8 Eight
-        '9' => 57, // (00111001) 9 Nine
-        ':' => 58, // (00111010) : Colon
-        ';' => 59, // (00111011) ; Semicolon
-        '<' => 60, // (00111100) < Less than
-        '=' => 61, // (00111101) = Equal sign
-        '>' => 62, // (00111110) > Greater than
-        '?' => 63, // (00111111) ? Question mark
-        '@' => 64, // (01000000) @ At sign
-        'A' => 65, // (01000001) A Uppercase A
-        'B' => 66, // (01000010) B Uppercase B
-        'C' => 67, // (01000011) C Uppercase C
-        'D' => 68, // (01000100) D Uppercase D
-        'E' => 69, // (01000101) E Uppercase E
-        'F' => 70, // (01000110) F Uppercase F
-        'G' => 71, // (01000111) G Uppercase G
-        'H' => 72, // (01001000) H Uppercase H
-        'I' => 73, // (01001001) I Uppercase I
-        'J' => 74, // (01001010) J Uppercase J
-        'K' => 75, // (01001011) K Uppercase K
-        'L' => 76, // (01001100) L Uppercase L
-        'M' => 77, // (01001101) M Uppercase M
-        'N' => 78, // (01001110) N Uppercase N
-        'O' => 79, // (01001111) O Uppercase O
-        'P' => 80, // (01010000) P Uppercase P
-        'Q' => 81, // (01010001) Q Uppercase Q
-        'R' => 82, // (01010010) R Uppercase R
-        'S' => 83, // (01010011) S Uppercase S
-        'T' => 84, // (01010100) T Uppercase T
-        'U' => 85, // (01010101) U Uppercase U
-        'V' => 86, // (01010110) V Uppercase V
-        'W' => 87, // (01010111) W Uppercase W
-        'X' => 88, // (01011000) X Uppercase X
-        'Y' => 89, // (01011001) Y Uppercase Y
-        'Z' => 90, // (01011010) Z Uppercase Z
-        '[' => 91, // (01011011) [ Left square bracket
-        '\\' => 92, // (01011100) \ backslash
-        ']' => 93, // (01011101) ] Right square bracket
-        '^' => 94, // (01011110) ^ Caret / circumflex
-        '_' => 95, // (01011111) _ Underscore
-        '`' => 96, // (01100000) ` Grave / accent
-        'a' => 97, // (01100001) a Lowercase a
-        'b' => 98, // (01100010) b Lowercase b
-        'c' => 99, // (01100011) c Lowercase c
-        'd' => 100, // (01100100) d Lowercase d
-        'e' => 101, // (01100101) e Lowercase e
-        'f' => 102, // (01100110) f Lowercase
-        'g' => 103, // (01100111) g Lowercase g
-        'h' => 104, // (01101000) h Lowercase h
-        'i' => 105, // (01101001) i Lowercase i
-        'j' => 106, // (01101010) j Lowercase j
-        'k' => 107, // (01101011) k Lowercase k
-        'l' => 108, // (01101100) l Lowercase l
-        'm' => 109, // (01101101) m Lowercase m
-        'n' => 110, // (01101110) n Lowercase n
-        'o' => 111, // (01101111) o Lowercase o
-        'p' => 112, // (01110000) p Lowercase p
-        'q' => 113, // (01110001) q Lowercase q
-        'r' => 114, // (01110010) r Lowercase r
-        's' => 115, // (01110011) s Lowercase s
-        't' => 116, // (01110100) t Lowercase t
-        'u' => 117, // (01110101) u Lowercase u
-        'v' => 118, // (01110110) v Lowercase v
-        'w' => 119, // (01110111) w Lowercase w
-        'x' => 120, // (01111000) x Lowercase x
-        'y' => 121, // (01111001) y Lowercase y
-        'z' => 122, // (01111010) z Lowercase z
-        '{' => 123, // (01111011) { Left curly bracket
-        '|' => 124, // (01111100) | Vertical bar
-        '}' => 125, // (01111101) } Right curly bracket
-        '~' => 126, // (01111110) ~ Tilde
-        "\x7f" => 127, // (01111111) DEL Delete
-    ];
-
     /**
      * @param string $str the string
      */
@@ -212,6 +102,12 @@ public function offsetExists($offset): bool
      */
     public function offsetGet($offset): string|null
     {
+        // This method moves the internal byte and character pointers to the requested offset.
+        // It is part of hot code, so the aim is to do the following operations as efficiently
+        // as possible. UTF-8 is a variable-length encoding that stores each Unicode character
+        // in 1-4 bytes, so $delta characters span at most 4 * $delta bytes: we fetch that many
+        // bytes from the current offset, let mb_substr keep the first $delta characters, and
+        // use strlen to get their size in bytes. The result is the first character of the next 4 bytes.
         if (($offset < 0) || ($offset >= $this->charLen)) {
             return null;
         }
@@ -220,13 +116,13 @@ public function offsetGet($offset): string|null
 
         if ($delta > 0) {
             // Fast forwarding.
-            while ($delta-- > 0) {
-                $this->byteIdx += static::getCharLength($this->str[$this->byteIdx]);
-                ++$this->charIdx;
-            }
+            $this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));
+            $this->charIdx += $delta;
         } elseif ($delta < 0) {
             // Rewinding.
             while ($delta++ < 0) {
+                // We rewind byte by byte and only count bytes that are not continuation bytes,
+                // i.e. ASCII characters and the leading bytes of multibyte characters.
                 do {
                     $byte = ord($this->str[--$this->byteIdx]);
                 } while (($byte >= 128) && ($byte < 192));
@@ -235,14 +131,8 @@ public function offsetGet($offset): string|null
             }
         }
 
-        $bytesCount = static::getCharLength($this->str[$this->byteIdx]);
-
-        $ret = '';
-        for ($i = 0; $bytesCount-- > 0; ++$i) {
-            $ret .= $this->str[$this->byteIdx + $i];
-        }
-
-        return $ret;
+        // Fetch the first Unicode character within the next 4 bytes in the string.
+        return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
     }
 
     /**
@@ -270,52 +160,6 @@ public function offsetUnset($offset): void
         throw new Exception('Not implemented.');
     }
 
-    /**
-     * Gets the length of an UTF-8 character.
-     *
-     * According to RFC 3629, a UTF-8 character can have at most 4 bytes.
-     * However, this implementation supports UTF-8 characters containing up to 6
-     * bytes.
-     *
-     * @see https://tools.ietf.org/html/rfc3629
-     *
-     * @param string $byte the byte to be analyzed
-     */
-    public static function getCharLength($byte): int
-    {
-        // Use the default ASCII map as queries are mostly ASCII chars
-        // ord($byte) has a performance cost
-
-        if (! isset(static::$asciiMap[$byte])) {
-            // Complete the cache with missing items
-            static::$asciiMap[$byte] = ord($byte);
-        }
-
-        $byte = static::$asciiMap[$byte];
-
-        if ($byte < 128) {
-            return 1;
-        }
-
-        if ($byte < 224) {
-            return 2;
-        }
-
-        if ($byte < 240) {
-            return 3;
-        }
-
-        if ($byte < 248) {
-            return 4;
-        }
-
-        if ($byte < 252) {
-            return 5; // unofficial
-        }
-
-        return 6; // unofficial
-    }
-
     /**
      * Returns the length in characters of the string.
      */

diff --git a/tests/Misc/UtfStringTest.php b/tests/Misc/UtfStringTest.php
@@ -9,8 +9,6 @@
 use PHPUnit\Framework\Attributes\DataProvider;
 use Throwable;
 
-use function chr;
-
 class UtfStringTest extends TestCase
 {
     /**
@@ -55,27 +53,6 @@ public function testUnset(): void
         unset($str[0]);
     }
 
-    public function testGetCharLength(): void
-    {
-        $this->assertEquals(1, UtfString::getCharLength(chr(0x00))); // 00000000
-        $this->assertEquals(1, UtfString::getCharLength(chr(0x7F))); // 01111111
-
-        $this->assertEquals(2, UtfString::getCharLength(chr(0xC0))); // 11000000
-        $this->assertEquals(2, UtfString::getCharLength(chr(0xDF))); // 11011111
-
-        $this->assertEquals(3, UtfString::getCharLength(chr(0xE0))); // 11100000
-        $this->assertEquals(3, UtfString::getCharLength(chr(0xEF))); // 11101111
-
-        $this->assertEquals(4, UtfString::getCharLength(chr(0xF0))); // 11110000
-        $this->assertEquals(4, UtfString::getCharLength(chr(0xF7))); // 11110111
-
-        $this->assertEquals(5, UtfString::getCharLength(chr(0xF8))); // 11111000
-        $this->assertEquals(5, UtfString::getCharLength(chr(0xFB))); // 11111011
-
-        $this->assertEquals(6, UtfString::getCharLength(chr(0xFC))); // 11111100
-        $this->assertEquals(6, UtfString::getCharLength(chr(0xFD))); // 11111101
-    }
-
     public function testToString(): void
     {
         $str = new UtfString(self::TEST_PHRASE);
@@ -112,7 +89,7 @@ public static function utf8StringsProvider(): array
                 'č',
             ],
             'emoji' => [
-                '😂😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
+                '🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
                 '😂',
                 '😋',
             ],
@@ -121,6 +98,11 @@ public static function utf8StringsProvider(): array
                 null,
                 null,
             ],
+            'random' => [
+                'xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞',
+                'ℾ',
+                '⅞',
+            ],
         ];
     }
 }
diff --git a/tests/benchmarks/UtfStringBench.php b/tests/benchmarks/UtfStringBench.php
@@ -6,7 +6,6 @@
 
 use PhpMyAdmin\SqlParser\UtfString;
 
-use function chr;
 use function file_get_contents;
 
 class UtfStringBench
@@ -19,8 +18,7 @@ class UtfStringBench
      * @Iterations(20)
      * @Revs(4)
      * @OutputTimeUnit("milliseconds")
-     * @Assert("mode(variant.time.avg) < 100 milliseconds +/- 10%")
-     * @Assert("mode(variant.time.avg) > 30 milliseconds +/- 10%")
+     * @Assert("mode(variant.time.avg) < 40 milliseconds +/- 10%")
      */
     public function benchBuildUtfString(): void
     {
@@ -30,38 +28,30 @@ public function benchBuildUtfString(): void
         }
     }
 
-    /**
-     * @BeforeMethods("setUp")
-     * @Iterations(2)
-     * @Revs(2)
-     * @OutputTimeUnit("microseconds")
-     * @Assert("mode(variant.time.avg) < 800 microseconds +/- 20%")
-     * @Assert("mode(variant.time.avg) > 100 microseconds +/- 10%")
-     */
-    public function benchGetCharLength(): void
-    {
-        UtfString::getCharLength(chr(0x00)); // 00000000
-        UtfString::getCharLength(chr(0x7F)); // 01111111
-
-        UtfString::getCharLength(chr(0xC0)); // 11000000
-        UtfString::getCharLength(chr(0xDF)); // 11011111
-
-        UtfString::getCharLength(chr(0xE0)); // 11100000
-        UtfString::getCharLength(chr(0xEF)); // 11101111
-
-        UtfString::getCharLength(chr(0xF0)); // 11110000
-        UtfString::getCharLength(chr(0xF7)); // 11110111
-
-        UtfString::getCharLength(chr(0xF8)); // 11111000
-        UtfString::getCharLength(chr(0xFB)); // 11111011
-
-        UtfString::getCharLength(chr(0xFC)); // 11111100
-        UtfString::getCharLength(chr(0xFD)); // 11111101
-    }
-
     public function setUp(): void
     {
         $contentsPath = __DIR__ . '/../../LICENSE.txt';
         $this->testContents = (string) file_get_contents($contentsPath);
     }
+
+    /**
+     * @Iterations(20)
+     * @Revs(4)
+     * @OutputTimeUnit("microseconds")
+     * @Assert("mode(variant.time.avg) < 120 microseconds +/- 10%")
+     */
+    public function benchUtfStringRandomAccessWithUnicode(): void
+    {
+        $text = 'abcdefghijklmnopqrstuvwxyz
+        áéíóúýěřťǔǐǒǎšďȟǰǩľžčǚň
+        🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯
+        P\xf8\xed\xb9ern\xec \xbelu\xbbou\xe8k\xfd k\xf3d \xfap\xecl \xef\xe1belsk\xe9 k\xf3dy
+        xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞';
+
+        $str1 = new UtfString($text);
+        $str1->offsetGet(10);
+        $str1->offsetGet(100);
+        $str1->offsetGet(20);
+        $str1->offsetGet(0);
+    }
 }