Revert of Add deduplication logic to .pak files (patchset #10 id:180001 of https://codereview.chromium.org/2969123002/ )

Reason for revert:
https://bugs.chromium.org/p/chromium/issues/detail?id=747171

Original issue's description:
> Add deduplication logic to .pak files
>
> Now, when multiple entries contain the same content, multiple
> table-of-contents entries will be created, but only one data region.
>
> As of now, en-US.pak has ~3200 entries, and 350 of them are duplicates.
>
> For MonochromePublic.apk, shrinks uncompressed .pak size by ~130kb, and
> compressed .pak size by 32kb.
>
> BUG=738566
>
> Review-Url: https://codereview.chromium.org/2969123002
> Cr-Commit-Position: refs/heads/master@{#488215}
> Committed: https://chromium.googlesource.com/chromium/src/+/ade347f539a378fb37500f6d17e8835edc1d8ec0
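
For scale, a back-of-envelope check of those numbers (an editorial sketch, not part of the CL; figures taken from the description above):

# Rough sanity check: ~130kb saved across 350 duplicates implies an
# average duplicate payload of roughly 380 bytes (ignoring the small
# index-entry overhead delta).
uncompressed_savings = 130 * 1024
duplicates = 350
print(uncompressed_savings / duplicates)  # -> 380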

TBR=flackr@chromium.org,sadrul@chromium.org,agrieve@chromium.org
# Skipping CQ checks because original CL landed less than 1 day ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=738566

Review-Url: https://codereview.chromium.org/2989443002
Cr-Original-Commit-Position: refs/heads/master@{#488554}
Change-Id: I33eff508d4177907d050424bee967d643fe022e4
Reviewed-on: https://chromium-review.googlesource.com/583542
Reviewed-by: Bernie Thompson <bhthompson@chromium.org>
Cr-Commit-Position: refs/branch-heads/3163@{#13}
Cr-Branched-From: ff259ba-refs/heads/master@{#488528}
achuith authored and Bernie Thompson committed Jul 24, 2017
1 parent 48aa689 commit 1595ed4
Showing 11 changed files with 213 additions and 396 deletions.
13 changes: 4 additions & 9 deletions build/android/resource_sizes.py
@@ -562,17 +562,12 @@ def PrintPakAnalysis(apk_filename, min_pak_resource_size):
   # Calculate aggregate stats about resources across pak files.
   resource_count_map = collections.defaultdict(int)
   resource_size_map = collections.defaultdict(int)
-  seen_data_ids = set()
-  alias_overhead_bytes = 4
   resource_overhead_bytes = 6
   for pak in paks:
-    for k, v in pak.resources.iteritems():
-      resource_count_map[k] += 1
-      if id(v) not in seen_data_ids:
-        seen_data_ids.add(id(v))
-        resource_size_map[k] += resource_overhead_bytes + len(v)
-      else:
-        resource_size_map[k] += alias_overhead_bytes
+    for r in pak.resources:
+      resource_count_map[r] += 1
+      resource_size_map[r] += len(pak.resources[r]) + resource_overhead_bytes

   # Output the overall resource summary.
   total_resource_size = sum(resource_size_map.values())
   total_resource_count = len(resource_count_map)
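For illustration, a minimal sketch (editorial, not part of the diff) of the accounting the revert removes: aliased entries in a loaded pak share one payload string object, so keying on id(v) charges each distinct payload once and each alias only its 4-byte table entry. Pak here is a hypothetical stand-in for the real loaded pak object:

import collections

Pak = collections.namedtuple('Pak', 'resources')  # stand-in for a loaded .pak

def aggregate_sizes(paks, resource_overhead_bytes=6, alias_overhead_bytes=4):
  resource_size_map = collections.defaultdict(int)
  seen_data_ids = set()
  for pak in paks:
    for k, v in pak.resources.items():
      if id(v) not in seen_data_ids:  # first sighting: charge entry + payload
        seen_data_ids.add(id(v))
        resource_size_map[k] += resource_overhead_bytes + len(v)
      else:                           # alias: charge only the table entry
        resource_size_map[k] += alias_overhead_bytes
  return resource_size_map

shared = 'same payload'
sizes = aggregate_sizes([Pak(resources={4: shared, 10: shared})])
assert sum(sizes.values()) == (6 + len(shared)) + 4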
114 changes: 34 additions & 80 deletions tools/grit/grit/format/data_pack.py
@@ -21,18 +21,16 @@
 from grit.node import structure


-PACK_FILE_VERSION = 5
+PACK_FILE_VERSION = 4
+HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries) and
+                           # one uint8 (encoding of text resources)
 BINARY, UTF8, UTF16 = range(3)


 class WrongFileVersion(Exception):
   pass


-class CorruptDataPack(Exception):
-  pass
-
-
 DataPackContents = collections.namedtuple(
     'DataPackContents', 'resources encoding')

@@ -51,100 +49,56 @@ def Format(root, lang='en', output_dir='.'):


 def ReadDataPack(input_file):
-  return ReadDataPackFromString(util.ReadFile(input_file, util.BINARY))
-
-
-def ReadDataPackFromString(data):
   """Reads a data pack file and returns a dictionary."""
+  data = util.ReadFile(input_file, util.BINARY)
   original_data = data

   # Read the header.
-  version = struct.unpack('<I', data[:4])[0]
-  if version == 4:
-    resource_count, encoding = struct.unpack('<IB', data[4:9])
-    alias_count = 0
-    data = data[9:]
-  elif version == 5:
-    encoding, resource_count, alias_count = struct.unpack('<BxxxHH', data[4:12])
-    data = data[12:]
-  else:
-    raise WrongFileVersion('Found version: ' + str(version))
+  version, num_entries, encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
+  if version != PACK_FILE_VERSION:
+    print 'Wrong file version in ', input_file
+    raise WrongFileVersion

   resources = {}
+  if num_entries == 0:
+    return DataPackContents(resources, encoding)
+
+  # Read the index and data.
+  data = data[HEADER_LENGTH:]
   kIndexEntrySize = 2 + 4  # Each entry is a uint16 and a uint32.
-  def entry_at_index(idx):
-    offset = idx * kIndexEntrySize
-    return struct.unpack('<HI', data[offset:offset + kIndexEntrySize])
-
-  prev_resource_id, prev_offset = entry_at_index(0)
-  for i in xrange(1, resource_count + 1):
-    resource_id, offset = entry_at_index(i)
-    resources[prev_resource_id] = original_data[prev_offset:offset]
-    prev_resource_id, prev_offset = resource_id, offset
-
-  # Read the alias table.
-  alias_data = data[(resource_count + 1) * kIndexEntrySize:]
-  kAliasEntrySize = 2 + 2  # uint16, uint16
-  def alias_at_index(idx):
-    offset = idx * kAliasEntrySize
-    return struct.unpack('<HH', alias_data[offset:offset + kAliasEntrySize])
-
-  for i in xrange(alias_count):
-    resource_id, index = alias_at_index(i)
-    aliased_id = entry_at_index(index)[0]
-    resources[resource_id] = resources[aliased_id]
+  for _ in range(num_entries):
+    id, offset = struct.unpack('<HI', data[:kIndexEntrySize])
+    data = data[kIndexEntrySize:]
+    next_id, next_offset = struct.unpack('<HI', data[:kIndexEntrySize])
+    resources[id] = original_data[offset:next_offset]

   return DataPackContents(resources, encoding)
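
To make the restored v4 layout concrete, a self-contained sketch (editorial, Python 2 to match the module; ids and payloads made up) that packs two resources and reads them back. The extra terminal index entry exists so that entry i's payload length is simply offset[i+1] - offset[i]:

import struct

UTF8 = 1
payloads = {4: 'this is id 4', 6: 'this is id 6'}
ids = sorted(payloads)
header = struct.pack('<IIB', 4, len(ids), UTF8)  # version, count, encoding
data_offset = len(header) + (len(ids) + 1) * 6   # each index entry is 6 bytes
index = ''
for rid in ids:
  index += struct.pack('<HI', rid, data_offset)
  data_offset += len(payloads[rid])
index += struct.pack('<HI', 0, data_offset)      # terminal sentinel entry
pack = header + index + ''.join(payloads[rid] for rid in ids)

count = struct.unpack('<I', pack[4:8])[0]
entries = [struct.unpack('<HI', pack[9 + 6 * i: 15 + 6 * i])
           for i in range(count + 1)]
parsed = dict((entries[i][0], pack[entries[i][1]:entries[i + 1][1]])
              for i in range(count))
assert parsed == payloads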


 def WriteDataPackToString(resources, encoding):
   """Returns a string with a map of id=>data in the data pack format."""
+  ids = sorted(resources.keys())
   ret = []

-  # Compute alias map.
-  resource_ids = sorted(resources)
-  # Use reversed() so that for duplicates lower IDs clobber higher ones.
-  id_by_data = {resources[k]: k for k in reversed(resource_ids)}
-  # Map of resource_id -> resource_id, where value < key.
-  alias_map = {k: id_by_data[v] for k, v in resources.iteritems()
-               if id_by_data[v] != k}
-
   # Write file header.
-  resource_count = len(resources) - len(alias_map)
-  # Padding bytes added for alignment.
-  ret.append(struct.pack('<IBxxxHH', PACK_FILE_VERSION, encoding,
-                         resource_count, len(alias_map)))
-  HEADER_LENGTH = 4 + 4 + 2 + 2
-
-  # Each main table entry is: uint16 + uint32 (and an extra entry at the end).
-  # Each alias table entry is: uint16 + uint16.
-  data_offset = HEADER_LENGTH + (resource_count + 1) * 6 + len(alias_map) * 4
-
-  # Write main table.
-  index_by_id = {}
-  deduped_data = []
-  index = 0
-  for resource_id in resource_ids:
-    if resource_id in alias_map:
-      continue
-    data = resources[resource_id]
-    index_by_id[resource_id] = index
-    ret.append(struct.pack('<HI', resource_id, data_offset))
-    data_offset += len(data)
-    deduped_data.append(data)
-    index += 1
-
-  assert index == resource_count
-  # Add an extra entry at the end.
-  ret.append(struct.pack('<HI', 0, data_offset))
+  ret.append(struct.pack('<IIB', PACK_FILE_VERSION, len(ids), encoding))
+  HEADER_LENGTH = 2 * 4 + 1  # Two uint32s and one uint8.

-  # Write alias table.
-  for resource_id in sorted(alias_map):
-    index = index_by_id[alias_map[resource_id]]
-    ret.append(struct.pack('<HH', resource_id, index))
+  # Each entry is a uint16 + a uint32s. We have one extra entry for the last
+  # item.
+  index_length = (len(ids) + 1) * (2 + 4)
+
+  # Write index.
+  data_offset = HEADER_LENGTH + index_length
+  for id in ids:
+    ret.append(struct.pack('<HI', id, data_offset))
+    data_offset += len(resources[id])
+
+  ret.append(struct.pack('<HI', 0, data_offset))

   # Write data.
-  ret.extend(deduped_data)
+  for id in ids:
+    ret.append(resources[id])
   return ''.join(ret)
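
The subtle step in the deleted writer is the alias map: building the data-to-id map from reversed(sorted(ids)) lets the lowest id claim each distinct payload, so every higher id with identical bytes is emitted as a 4-byte alias entry instead of another copy. A standalone sketch of just that step (editorial, made-up ids):

resources = {1: 'x', 4: 'dup', 6: 'other', 10: 'dup'}
resource_ids = sorted(resources)
# reversed() so that for duplicates, lower ids clobber higher ones.
id_by_data = dict((resources[k], k) for k in reversed(resource_ids))
alias_map = dict((k, id_by_data[v]) for k, v in resources.items()
                 if id_by_data[v] != k)
assert alias_map == {10: 4}  # id 10 aliases id 4; ids 1 and 6 stay canonical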


49 changes: 7 additions & 42 deletions tools/grit/grit/format/data_pack_unittest.py
@@ -17,8 +17,8 @@


 class FormatDataPackUnittest(unittest.TestCase):
-  def testReadDataPackV4(self):
-    expected_data = (
+  def testWriteDataPack(self):
+    expected = (
         '\x04\x00\x00\x00'  # header(version
         '\x04\x00\x00\x00'  # no. entries,
         '\x01'              # encoding)
@@ -28,42 +28,9 @@ def testReadDataPackV4(self):
         '\x0a\x00\x3f\x00\x00\x00'  # index entry 10
         '\x00\x00\x3f\x00\x00\x00'  # extra entry for the size of last
         'this is id 4this is id 6')  # data
-    expected_resources = {
-        1: '',
-        4: 'this is id 4',
-        6: 'this is id 6',
-        10: '',
-    }
-    expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
-    loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
-
-  def testReadWriteDataPackV5(self):
-    expected_data = (
-        '\x05\x00\x00\x00'          # version
-        '\x01\x00\x00\x00'          # encoding & padding
-        '\x03\x00'                  # resource_count
-        '\x01\x00'                  # alias_count
-        '\x01\x00\x28\x00\x00\x00'  # index entry 1
-        '\x04\x00\x28\x00\x00\x00'  # index entry 4
-        '\x06\x00\x34\x00\x00\x00'  # index entry 6
-        '\x00\x00\x40\x00\x00\x00'  # extra entry for the size of last
-        '\x0a\x00\x01\x00'          # alias table
-        'this is id 4this is id 6')  # data
-    expected_resources = {
-        1: '',
-        4: 'this is id 4',
-        6: 'this is id 6',
-        10: 'this is id 4',
-    }
-    data = data_pack.WriteDataPackToString(expected_resources, data_pack.UTF8)
-    self.assertEquals(data, expected_data)
-
-    expected_data_pack = data_pack.DataPackContents(
-        expected_resources, data_pack.UTF8)
-    loaded = data_pack.ReadDataPackFromString(expected_data)
-    self.assertEquals(loaded, expected_data_pack)
+    input = {1: '', 4: 'this is id 4', 6: 'this is id 6', 10: ''}
+    output = data_pack.WriteDataPackToString(input, data_pack.UTF8)
+    self.failUnless(output == expected)
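
As a cross-check of the deleted v5 test data (an editorial sketch, not part of the test): the v5 header is packed as '<IBxxxHH' (version, encoding, three alignment pad bytes, then uint16 resource_count and uint16 alias_count), which reproduces the first twelve bytes quoted above:

import struct

header = struct.pack('<IBxxxHH', 5, 1, 3, 1)
assert header == ('\x05\x00\x00\x00'  # version
                  '\x01\x00\x00\x00'  # encoding & padding
                  '\x03\x00'          # resource_count
                  '\x01\x00')         # alias_count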

   def testRePackUnittest(self):
     expected_with_whitelist = {
@@ -83,14 +50,12 @@ def testRePackUnittest(self):
               in inputs]

     # RePack using whitelist
-    output, _ = data_pack.RePackFromDataPackStrings(
-        inputs, whitelist, suppress_removed_key_output=True)
+    output, _ = data_pack.RePackFromDataPackStrings(inputs, whitelist)
     self.assertDictEqual(expected_with_whitelist, output,
                          'Incorrect resource output')

     # RePack a None whitelist
-    output, _ = data_pack.RePackFromDataPackStrings(
-        inputs, None, suppress_removed_key_output=True)
+    output, _ = data_pack.RePackFromDataPackStrings(inputs, None)
     self.assertDictEqual(expected_without_whitelist, output,
                          'Incorrect resource output')

10 changes: 2 additions & 8 deletions tools/grit/pak_util.py
@@ -33,7 +33,6 @@ def _ExtractMain(args):

 def _PrintMain(args):
   pak = data_pack.ReadDataPack(args.pak_file)
-  id_map = {id(v): k for k, v in sorted(pak.resources.items(), reverse=True)}
   encoding = 'binary'
   if pak.encoding == 1:
     encoding = 'utf-8'
@@ -58,13 +57,8 @@ def _PrintMain(args):
       except UnicodeDecodeError:
         pass
     sha1 = hashlib.sha1(data).hexdigest()[:10]
-    canonical_id = id_map[id(data)]
-    if resource_id == canonical_id:
-      line = u'Entry(id={}, len={}, sha1={}): {}'.format(
-          resource_id, len(data), sha1, desc)
-    else:
-      line = u'Entry(id={}, alias_of={}): {}'.format(
-          resource_id, canonical_id, desc)
+    line = u'Entry(id={}, len={}, sha1={}): {}'.format(
+        resource_id, len(data), sha1, desc)
     print line.encode('utf-8')
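
For reference, the deleted id_map line relied on the v5 reader giving aliased entries the very same string object: sorting the items in reverse order and letting later (smaller) keys overwrite maps each payload object to its lowest, canonical resource id. A standalone sketch (editorial, made-up data):

shared = 'duplicated payload'
resources = {4: shared, 6: 'unique', 10: shared}
id_map = dict((id(v), k) for k, v in sorted(resources.items(), reverse=True))
assert id_map[id(shared)] == 4  # ids 4 and 10 share data; 4 is canonical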


1 change: 0 additions & 1 deletion ui/base/BUILD.gn
@@ -788,7 +788,6 @@ test("ui_base_unittests") {
"material_design/material_design_controller_unittest.cc",
"models/tree_node_iterator_unittest.cc",
"resource/data_pack_literal.cc",
"resource/data_pack_literal.h",
"resource/data_pack_unittest.cc",
"resource/resource_bundle_unittest.cc",
"resource/scale_factor_unittest.cc",