Support decoding UTF-16 paths (#50)

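In at least some pak files, a negative path length in the index flags a UTF-16 encoded path: the magnitude of the length counts 2-byte UTF-16 code units, so the decoder reads twice that many bytes and decodes them as UTF-16LE. This patch changes the decoding logic (`read_path`) to handle that case, but doesn't change index encoding (`pack_path`), which still uses UTF-8.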
panzi · Jul 12, 2021 · 5c8f61f · 5c8f61f
1 parent 33a5ccd
commit 5c8f61f
Showing 1 changed file with 6 additions and 2 deletions.
diff --git a/u4pak.py b/u4pak.py
@@ -603,8 +603,12 @@ def base_offset(self):
 		return self.offset
 
 def read_path(stream: io.BufferedReader, encoding: str = 'utf-8') -> str:
-	path_len, = st_unpack('<I',stream.read(4))
-	return stream.read(path_len).rstrip(b'\0').decode(encoding).replace('/',os.path.sep)
+	path_len, = st_unpack('<i',stream.read(4))
+	if path_len < 0:
+		# in at least some format versions, this indicates a UTF-16 path
+		path_len = -2 * path_len
+		encoding = 'utf-16le'
+	return stream.read(path_len).decode(encoding).rstrip('\0').replace('/',os.path.sep)
 
 def pack_path(path: str, encoding: str = 'utf-8') -> bytes:
 	encoded_path = path.replace(os.path.sep, '/').encode('utf-8') + b'\0'