Revert of Add deduplication logic to .pak files (patchset #10 id:180001 of https://codereview.chromium.org/2969123002/ )

Reason for revert:
https://bugs.chromium.org/p/chromium/issues/detail?id=747171

Original issue's description:
> Add deduplication logic to .pak files
>
> Now, when multiple entries contain the same content, multiple
> table-of-contents entries will be created, but only one data region.
>
> As of now, en-US.pak has ~3200 entries, and 350 of them are duplicates.
>
> For MonochromePublic.apk, shrinks uncompressed .pak size by ~130kb, and
> compressed .pak size by 32kb.
>
> BUG=738566
>
> Review-Url: https://codereview.chromium.org/2969123002
> Cr-Commit-Position: refs/heads/master@{#488215}
> Committed: https://chromium.googlesource.com/chromium/src/+/ade347f539a378fb37500f6d17e8835edc1d8ec0
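
For scale, a back-of-envelope check of those numbers (an editorial sketch, not part of the CL; figures taken from the description above):

# Rough sanity check: ~130kb saved across 350 duplicates implies an
# average duplicate payload of roughly 380 bytes (ignoring the small
# index-entry overhead delta).
uncompressed_savings = 130 * 1024
duplicates = 350
print(uncompressed_savings / duplicates)  # -> 380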

TBR=flackr@chromium.org,sadrul@chromium.org,agrieve@chromium.org
# Skipping CQ checks because original CL landed less than 1 day ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=738566

Review-Url: https://codereview.chromium.org/2989443002
Cr-Original-Commit-Position: refs/heads/master@{#488554}
Change-Id: I33eff508d4177907d050424bee967d643fe022e4
Reviewed-on: https://chromium-review.googlesource.com/583542
Reviewed-by: Bernie Thompson <bhthompson@chromium.org>
Cr-Commit-Position: refs/branch-heads/3163@{#13}
Cr-Branched-From: ff259ba-refs/heads/master@{#488528}
achuith authored and Bernie Thompson committed Jul 24, 2017
1 parent 48aa689 commit 1595ed4
Showing 11 changed files with 213 additions and 396 deletions.
13 changes: 4 additions & 9 deletions build/android/resource_sizes.py
@@ -562,17 +562,12 @@ def PrintPakAnalysis(apk_filename, min_pak_resource_size):
   # Calculate aggregate stats about resources across pak files.
   resource_count_map = collections.defaultdict(int)
   resource_size_map = collections.defaultdict(int)
-  seen_data_ids = set()
-  alias_overhead_bytes = 4
   resource_overhead_bytes = 6
   for pak in paks:
-    for k, v in pak.resources.iteritems():
-      resource_count_map[k] += 1
-      if id(v) not in seen_data_ids:
-        seen_data_ids.add(id(v))
-        resource_size_map[k] += resource_overhead_bytes + len(v)
-      else:
-        resource_size_map[k] += alias_overhead_bytes
+    for r in pak.resources:
+      resource_count_map[r] += 1
+      resource_size_map[r] += len(pak.resources[r]) + resource_overhead_bytes

   # Output the overall resource summary.
   total_resource_size = sum(resource_size_map.values())
   total_resource_count = len(resource_count_map)
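For illustration, a minimal sketch (editorial, not part of the diff) of the accounting the revert removes: aliased entries in a loaded pak share one payload string object, so keying on id(v) charges each distinct payload once and each alias only its 4-byte table entry. Pak here is a hypothetical stand-in for the real loaded pak object:

import collections

Pak = collections.namedtuple('Pak', 'resources')  # stand-in for a loaded .pak

def aggregate_sizes(paks, resource_overhead_bytes=6, alias_overhead_bytes=4):
  resource_size_map = collections.defaultdict(int)
  seen_data_ids = set()
  for pak in paks:
    for k, v in pak.resources.items():
      if id(v) not in seen_data_ids:  # first sighting: charge entry + payload
        seen_data_ids.add(id(v))
        resource_size_map[k] += resource_overhead_bytes + len(v)
      else:                           # alias: charge only the table entry
        resource_size_map[k] += alias_overhead_bytes
  return resource_size_map

shared = 'same payload'
sizes = aggregate_sizes([Pak(resources={4: shared, 10: shared})])
assert sum(sizes.values()) == (6 + len(shared)) + 4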
114 changes: 34 additions & 80 deletions tools/grit/grit/format/data_pack.py
@@ -21,18 +21,16 @@
 from grit.node import structure


-PACK_FILE_VERSION = 5
+PACK_FILE_VERSION = 4
+HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries) and
+                           # one uint8 (encoding of text resources)
 BINARY, UTF8, UTF16 = range(3)


 class WrongFileVersion(Exception):
   pass


-class CorruptDataPack(Exception):
-  pass
-
-
 DataPackContents = collections.namedtuple(
     'DataPackContents', 'resources encoding')

@@ -51,100 +49,56 @@ def Format(root, lang='en', output_dir='.'):


 def ReadDataPack(input_file):
-  return ReadDataPackFromString(util.ReadFile(input_file, util.BINARY))
-
-
-def ReadDataPackFromString(data):
   """Reads a data pack file and returns a dictionary."""
+  data = util.ReadFile(input_file, util.BINARY)
   original_data = data

   # Read the header.
-  version = struct.unpack('<I', data[:4])[0]
-  if version == 4:
-    resource_count, encoding = struct.unpack('<IB', data[4:9])
-    alias_count = 0
-    data = data[9:]
-  elif version == 5:
-    encoding, resource_count, alias_count = struct.unpack('<BxxxHH', data[4:12])
-    data = data[12:]
-  else:
-    raise WrongFileVersion('Found version: ' + str(version))
+  version, num_entries, encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
+  if version != PACK_FILE_VERSION:
+    print 'Wrong file version in ', input_file
+    raise WrongFileVersion

   resources = {}
+  if num_entries == 0:
+    return DataPackContents(resources, encoding)
+
+  # Read the index and data.
+  data = data[HEADER_LENGTH:]
   kIndexEntrySize = 2 + 4  # Each entry is a uint16 and a uint32.
-  def entry_at_index(idx):
-    offset = idx * kIndexEntrySize
-    return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])
-
-  prev_resource_id, prev_offset = entry_at_index(0)
-  for i in xrange(1, resource_count + 1):
-    resource_id, offset = entry_at_index(i)
-    resources[prev_resource_id] = original_data[prev_offset:offset]
-    prev_resource_id, prev_offset = resource_id, offset
-
-  # Read the alias table.
-  alias_data = data[(resource_count + 1) * kIndexEntrySize:]
-  kAliasEntrySize = 2 + 2  # uint16, uint16
-  def alias_at_index(idx):
-    offset = idx * kAliasEntrySize
-    return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])
-
-  for i in xrange(alias_count):
-    resource_id, index = alias_at_index(i)
-    aliased_id = entry_at_index(index)[0]
-    resources[resource_id] = resources[aliased_id]
+  for _ in range(num_entries):
+    id, offset = struct.unpack('<HI', data[:kIndexEntrySize])
+    data = data[kIndexEntrySize:]
+    next_id, next_offset = struct.unpack('<HI', data[:kIndexEntrySize])
+    resources[id] = original_data[offset:next_offset]

   return DataPackContents(resources, encoding)
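
To make the restored v4 layout concrete, a self-contained sketch (editorial, Python 2 to match the module; ids and payloads made up) that packs two resources and reads them back. The extra terminal index entry exists so that entry i's payload length is simply offset[i+1] - offset[i]:

import struct

UTF8 = 1
payloads = {4: 'this is id 4', 6: 'this is id 6'}
ids = sorted(payloads)
header = struct.pack('<IIB', 4, len(ids), UTF8)  # version, count, encoding
data_offset = len(header) + (len(ids) + 1) * 6   # each index entry is 6 bytes
index = ''
for rid in ids:
  index += struct.pack('<HI', rid, data_offset)
  data_offset += len(payloads[rid])
index += struct.pack('<HI', 0, data_offset)      # terminal sentinel entry
pack = header + index + ''.join(payloads[rid] for rid in ids)

count = struct.unpack('<I', pack[4:8])[0]
entries = [struct.unpack('<HI', pack[9 + 6 * i: 15 + 6 * i])
           for i in range(count + 1)]
parsed = dict((entries[i][0], pack[entries[i][1]:entries[i + 1][1]])
              for i in range(count))
assert parsed == payloads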


 def WriteDataPackToString(resources, encoding):
   """Returns a string with a map of id=>data in the data pack format."""
+  ids = sorted(resources.keys())
   ret = []

-  # Compute alias map.
-  resource_ids = sorted(resources)
-  # Use reversed() so that for duplicates lower IDs clobber higher ones.
-  id_by_data = {resources[k]: k for k in reversed(resource_ids)}
-  # Map of resource_id -> resource_id, where value < key.
-  alias_map = {k: id_by_data[v] for k, v in resources.iteritems()
-               if id_by_data[v] != k}
-
   # Write file header.
-  resource_count = len(resources) - len(alias_map)
-  # Padding bytes added for alignment.
-  ret.append(struct.pack('<IBxxxHH', PACK_FILE_VERSION, encoding,
-                         resource_count, len(alias_map)))
-  HEADER_LENGTH = 4 + 4 + 2 + 2
-
-  # Each main table entry is: uint16 + uint32 (and an extra entry at the end).
-  # Each alias table entry is: uint16 + uint16.
-  data_offset = HEADER_LENGTH + (resource_count + 1) * 6 + len(alias_map) * 4
-
-  # Write main table.
-  index_by_id = {}
-  deduped_data = []
-  index = 0
-  for resource_id in resource_ids:
-    if resource_id in alias_map:
-      continue
-    data = resources[resource_id]
-    index_by_id[resource_id] = index
-    ret.append(struct.pack('<HI', resource_id, data_offset))
-    data_offset += len(data)
-    deduped_data.append(data)
-    index += 1
-
-  assert index == resource_count
-  # Add an extra entry at the end.
-  ret.append(struct.pack('<HI', 0, data_offset))
+  ret.append(struct.pack('<IIB', PACK_FILE_VERSION, len(ids), encoding))
+  HEADER_LENGTH = 2 * 4 + 1  # Two uint32s and one uint8.

-  # Write alias table.
-  for resource_id in sorted(alias_map):
-    index = index_by_id[alias_map[resource_id]]
-    ret.append(struct.pack('<HH', resource_id, index))
+  # Each entry is a uint16 + a uint32s. We have one extra entry for the last
+  # item.
+  index_length = (len(ids) + 1) * (2 + 4)
+
+  # Write index.
+  data_offset = HEADER_LENGTH + index_length
+  for id in ids:
+    ret.append(struct.pack('<HI', id, data_offset))
+    data_offset += len(resources[id])
+
+  ret.append(struct.pack('<HI', 0, data_offset))

   # Write data.
-  ret.extend(deduped_data)
+  for id in ids:
+    ret.append(resources[id])
   return ''.join(ret)
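
The subtle step in the deleted writer is the alias map: building the data-to-id map from reversed(sorted(ids)) lets the lowest id claim each distinct payload, so every higher id with identical bytes is emitted as a 4-byte alias entry instead of another copy. A standalone sketch of just that step (editorial, made-up ids):

resources = {1: 'x', 4: 'dup', 6: 'other', 10: 'dup'}
resource_ids = sorted(resources)
# reversed() so that for duplicates, lower ids clobber higher ones.
id_by_data = dict((resources[k], k) for k in reversed(resource_ids))
alias_map = dict((k, id_by_data[v]) for k, v in resources.items()
                 if id_by_data[v] != k)
assert alias_map == {10: 4}  # id 10 aliases id 4; ids 1 and 6 stay canonical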


49 changes: 7 additions & 42 deletions tools/grit/grit/format/data_pack_unittest.py
@@ -17,8 +17,8 @@


 class FormatDataPackUnittest(unittest.TestCase):
-  def testReadDataPackV4(self):
-    expected_data = (
+  def testWriteDataPack(self):
+    expected = (
         '\x04\x00\x00\x00'  # header(version
         '\x04\x00\x00\x00'  # no. entries,
         '\x01'              # encoding)
@@ -28,42 +28,9 @@ def testReadDataPackV4(self):
         '\x0a\x00\x3f\x00\x00\x00'  # index entry 10
         '\x00\x00\x3f\x00\x00\x00'  # extra entry for the size of last
         'this is id 4this is id 6')  # data
-    expected_resources = {
-        1: '',
-        4: 'this is id 4',
-        6: 'this is id 6',
-        10: '',
-    }
-    expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
-    loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
-
-  def testReadWriteDataPackV5(self):
-    expected_data = (
-        '\x05\x00\x00\x00'          # version
-        '\x01\x00\x00\x00'          # encoding & padding
-        '\x03\x00'                  # resource_count
-        '\x01\x00'                  # alias_count
-        '\x01\x00\x28\x00\x00\x00'  # index entry 1
-        '\x04\x00\x28\x00\x00\x00'  # index entry 4
-        '\x06\x00\x34\x00\x00\x00'  # index entry 6
-        '\x00\x00\x40\x00\x00\x00'  # extra entry for the size of last
-        '\x0a\x00\x01\x00'          # alias table
-        'this is id 4this is id 6')  # data
-    expected_resources = {
-        1: '',
-        4: 'this is id 4',
-        6: 'this is id 6',
-        10: 'this is id 4',
-    }
-    data = data_pack.WriteDataPackToString(expected_resources, data_pack.UTF8)
-    self.assertEquals(data, expected_data)
-
-    expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
-    loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
+    input = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''}
+    output = data_pack.WriteDataPackToString(input, data_pack.UTF8)
+    self.failUnless(output == expected)
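
As a cross-check of the deleted v5 test data (an editorial sketch, not part of the test): the v5 header is packed as '<IBxxxHH' (version, encoding, three alignment pad bytes, then uint16 resource_count and uint16 alias_count), which reproduces the first twelve bytes quoted above:

import struct

header = struct.pack('<IBxxxHH', 5, 1, 3, 1)
assert header == ('\x05\x00\x00\x00'  # version
                  '\x01\x00\x00\x00'  # encoding & padding
                  '\x03\x00'          # resource_count
                  '\x01\x00')         # alias_count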

   def testRePackUnittest(self):
     expected_with_whitelist = {
@@ -83,14 +50,12 @@ def testRePackUnittest(self):
               in inputs]

     # RePack using whitelist
-    output, _ = data_pack.RePackFromDataPackStrings(
-        inputs, whitelist, suppress_removed_key_output=True)
+    output, _ = data_pack.RePackFromDataPackStrings(inputs, whitelist)
     self.assertDictEqual(expected_with_whitelist, output,
                          'Incorrect resource output')

     # RePack a None whitelist
-    output, _ = data_pack.RePackFromDataPackStrings(
-        inputs, None, suppress_removed_key_output=True)
+    output, _ = data_pack.RePackFromDataPackStrings(inputs, None)
     self.assertDictEqual(expected_without_whitelist, output,
                          'Incorrect resource output')

10 changes: 2 additions & 8 deletions tools/grit/pak_util.py
@@ -33,7 +33,6 @@ def _ExtractMain(args):

 def _PrintMain(args):
   pak = data_pack.ReadDataPack(args.pak_file)
-  id_map = {id(v): k for k, v in sorted(pak.resources.items(), reverse=True)}
   encoding = 'binary'
   if pak.encoding == 1:
     encoding = 'utf-8'
@@ -58,13 +57,8 @@ def _PrintMain(args):
       except UnicodeDecodeError:
         pass
     sha1 = hashlib.sha1(data).hexdigest()[:10]
-    canonical_id = id_map[id(data)]
-    if resource_id == canonical_id:
-      line = u'Entry(id={}, len={}, sha1={}): {}'.format(
-          resource_id, len(data), sha1, desc)
-    else:
-      line = u'Entry(id={}, alias_of={}): {}'.format(
-          resource_id, canonical_id, desc)
+    line = u'Entry(id={}, len={}, sha1={}): {}'.format(
+        resource_id, len(data), sha1, desc)
     print line.encode('utf-8')
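
For reference, the deleted id_map line relied on the v5 reader giving aliased entries the very same string object: sorting the items in reverse order and letting later (smaller) keys overwrite maps each payload object to its lowest, canonical resource id. A standalone sketch (editorial, made-up data):

shared = 'duplicated payload'
resources = {4: shared, 6: 'unique', 10: shared}
id_map = dict((id(v), k) for k, v in sorted(resources.items(), reverse=True))
assert id_map[id(shared)] == 4  # ids 4 and 10 share data; 4 is canonical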


1 change: 0 additions & 1 deletion ui/base/BUILD.gn
@@ -788,7 +788,6 @@ test("ui_base_unittests") {
"material_design/material_design_controller_unittest.cc",
"models/tree_node_iterator_unittest.cc",
"resource/data_pack_literal.cc",
"resource/data_pack_literal.h",
"resource/data_pack_unittest.cc",
"resource/resource_bundle_unittest.cc",
"resource/scale_factor_unittest.cc",