Skip to content

Commit

Permalink
Alternative cache invalidation approach (fixes #62)
Browse files Browse the repository at this point in the history
  • Loading branch information
xolox committed Nov 7, 2015
1 parent c6613a1 commit 3b16ce4
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 57 deletions.
2 changes: 1 addition & 1 deletion pip_accel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
from pip.exceptions import DistributionNotFound

# Semi-standard module versioning.
__version__ = '0.35'
__version__ = '0.36'

# Initialize a logger for this module.
logger = logging.getLogger(__name__)
Expand Down
78 changes: 69 additions & 9 deletions pip_accel/bdist.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Accelerator for pip, the Python package manager.
#
# Author: Peter Odding <peter.odding@paylogic.com>
# Last Change: October 31, 2015
# Last Change: November 7, 2015
# URL: https://github.com/paylogic/pip-accel

"""
Expand All @@ -13,6 +13,7 @@
"""

# Standard library modules.
import errno
import fnmatch
import logging
import os
Expand All @@ -34,7 +35,7 @@
from pip_accel.caches import CacheManager
from pip_accel.deps import SystemPackageManager
from pip_accel.exceptions import BuildFailed, InvalidSourceDistribution, NoBuildOutput
from pip_accel.utils import compact, makedirs
from pip_accel.utils import AtomicReplace, compact, makedirs

# Initialize a logger for this module.
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -74,11 +75,13 @@ def get_binary_dist(self, requirement):
packages were installed.
"""
cache_file = self.cache.get(requirement)
if cache_file and requirement.last_modified > os.path.getmtime(cache_file):
logger.info("Invalidating old %s binary (source is newer) ..", requirement)
cache_file = None
if not cache_file:
if cache_file:
if self.needs_invalidation(requirement, cache_file):
logger.info("Invalidating old %s binary (source has changed) ..", requirement)
cache_file = None
else:
logger.debug("%s hasn't been cached yet, doing so now.", requirement)
if not cache_file:
# Build the binary distribution.
try:
raw_file = self.build_binary_dist(requirement)
Expand Down Expand Up @@ -109,10 +112,67 @@ def get_binary_dist(self, requirement):
os.remove(transformed_file)
# Get the absolute pathname of the file in the local cache.
cache_file = self.cache.get(requirement)
# Enable checksum based cache invalidation.
self.persist_checksum(requirement, cache_file)
archive = tarfile.open(cache_file, 'r:gz')
for member in archive.getmembers():
yield member, archive.extractfile(member.name)
archive.close()
try:
for member in archive.getmembers():
yield member, archive.extractfile(member.name)
finally:
archive.close()

def needs_invalidation(self, requirement, cache_file):
    """
    Check whether a cached binary distribution needs to be invalidated.

    :param requirement: A :class:`.Requirement` object.
    :param cache_file: The pathname of a cached binary distribution (a string).
    :returns: :data:`True` if the cached binary distribution needs to be
              invalidated, :data:`False` otherwise.

    When :attr:`~.Config.trust_mod_times` is enabled the decision is based on
    file modification times: the cache is stale when the requirement's
    ``last_modified`` value is newer than the cached archive. Otherwise the
    checksum recalled for the cached archive is compared with the
    requirement's current checksum.
    """
    if self.config.trust_mod_times:
        return requirement.last_modified > os.path.getmtime(cache_file)
    checksum = self.recall_checksum(cache_file)
    # When no checksum was recorded there is nothing to compare against, so
    # the cached archive is kept. The bool() call guarantees that callers
    # get a real boolean (as documented) instead of None or an empty string.
    return bool(checksum) and checksum != requirement.checksum

def recall_checksum(self, cache_file):
    """
    Get the checksum of the input used to generate a binary distribution archive.

    :param cache_file: The pathname of the binary distribution archive (a string).
    :returns: The checksum (a string) or :data:`None` (when no checksum is
              available, i.e. the checksum file is missing or blank).
    :raises: :exc:`IOError` when the checksum file exists but cannot be
             read (any error other than ``ENOENT`` is propagated).
    """
    # EAFP instead of LBYL because of concurrency between pip-accel
    # processes (https://docs.python.org/2/glossary.html#term-lbyl).
    checksum_file = '%s.txt' % cache_file
    try:
        # Keep the try body limited to the operation that is expected to fail.
        with open(checksum_file) as handle:
            contents = handle.read()
    except IOError as e:
        if e.errno == errno.ENOENT:
            # Gracefully handle missing checksum files.
            return None
        # Don't swallow exceptions we don't expect!
        raise
    # A blank checksum file carries no usable checksum, so report it the
    # same way as a missing file instead of returning an empty string.
    return contents.strip() or None

def persist_checksum(self, requirement, cache_file):
    """
    Persist the checksum of the input used to generate a binary distribution.

    :param requirement: A :class:`.Requirement` object.
    :param cache_file: The pathname of a cached binary distribution (a string).

    The checksum is written (followed by a newline) to a file named after
    the cached archive with a ``.txt`` suffix appended.

    .. note:: The checksum is only calculated and persisted when
              :attr:`~.Config.trust_mod_times` is :data:`False`.
    """
    if self.config.trust_mod_times:
        # Modification times are trusted, so no checksum is needed.
        return
    target = '%s.txt' % cache_file
    # NOTE(review): AtomicReplace presumably swaps the temporary file into
    # place once the block exits without an error -- confirm in pip_accel.utils.
    with AtomicReplace(target) as scratch_file:
        with open(scratch_file, 'w') as stream:
            stream.write('%s\n' % requirement.checksum)

def build_binary_dist(self, requirement):
"""
Expand Down
21 changes: 20 additions & 1 deletion pip_accel/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Accelerator for pip, the Python package manager.
#
# Author: Peter Odding <peter.odding@paylogic.com>
# Last Change: October 31, 2015
# Last Change: November 7, 2015
# URL: https://github.com/paylogic/pip-accel

"""
Expand Down Expand Up @@ -297,6 +297,25 @@ def max_retries(self):
except:
return 3

@cached_property
def trust_mod_times(self):
    """
    Whether to trust file modification times for cache invalidation.

    - Environment variable: ``$PIP_ACCEL_TRUST_MOD_TIMES``
    - Configuration option: ``trust-mod-times``
    - Default: :data:`True` unless the AppVeyor_ continuous integration
      environment is detected (see `issue 62`_).

    .. _AppVeyor: http://www.appveyor.com
    .. _issue 62: https://github.com/paylogic/pip-accel/issues/62
    """
    # The ``$APPVEYOR`` environment variable is what signals AppVeyor here.
    appveyor_flag = os.environ.get('APPVEYOR', 'False')
    fallback = not coerce_boolean(appveyor_flag)
    configured = self.get(
        property_name='trust_mod_times',
        environment_variable='PIP_ACCEL_TRUST_MOD_TIMES',
        configuration_option='trust-mod-times',
        default=fallback,
    )
    return coerce_boolean(configured)

@cached_property
def s3_cache_url(self):
"""
Expand Down
43 changes: 27 additions & 16 deletions pip_accel/req.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Accelerator for pip, the Python package manager.
#
# Author: Peter Odding <peter.odding@paylogic.com>
# Last Change: October 31, 2015
# Last Change: November 7, 2015
# URL: https://github.com/paylogic/pip-accel

"""
Expand Down Expand Up @@ -34,6 +34,7 @@

# Modules included in our package.
from pip_accel.exceptions import UnknownDistributionFormat
from pip_accel.utils import hash_files

# External dependencies.
from cached_property import cached_property
Expand Down Expand Up @@ -83,14 +84,14 @@ def version(self):
@cached_property
def related_archives(self):
"""
Try to find the source distribution archive(s) for this requirement.
Returns a list of pathnames (strings).
This property is very new in pip-accel and its logic may need some time
to mature. For now any misbehavior by this property shouldn't be too
much of a problem because the pathnames reported by this property are
only used for cache invalidation (see :attr:`last_modified`).
The pathnames of the source distribution(s) for this requirement (a list of strings).
.. note:: This property is very new in pip-accel and its logic may need
some time to mature. For now any misbehavior by this property
shouldn't be too much of a problem because the pathnames
reported by this property are only used for cache
invalidation (see the :attr:`last_modified` and
:attr:`checksum` properties).
"""
# Escape the requirement's name for use in a regular expression.
name_pattern = escape_name(self.name)
Expand All @@ -112,18 +113,28 @@ def related_archives(self):
@cached_property
def last_modified(self):
    """
    The last modified time of the requirement's source distribution archive(s) (a number).

    The value is the most recent modification time among the pathnames
    reported by :attr:`related_archives`. If no related archives are found
    the current time is reported. In the balance between not invalidating
    cached binary distributions enough and invalidating them too
    frequently, this property causes the latter to happen.
    """
    timestamps = [os.path.getmtime(pathname) for pathname in self.related_archives]
    if not timestamps:
        # Nothing to inspect: err on the side of invalidating the cache.
        return time.time()
    return max(timestamps)

@cached_property
def checksum(self):
    """
    The SHA1 checksum of the requirement's source distribution archive(s) (a string).

    The pathnames reported by :attr:`related_archives` are sorted before
    being hashed, so the order in which they were discovered does not
    matter. If no related archives are found the result is expected to be
    the SHA1 digest of the empty string.
    """
    ordered_archives = sorted(self.related_archives)
    return hash_files('sha1', *ordered_archives)

@cached_property
def source_directory(self):
"""
Expand Down

0 comments on commit 3b16ce4

Please sign in to comment.