# Getting to know CacheControl

CacheControl is a port of the caching algorithms in httplib2 for use with the requests `Session` object.

These are my initial examples of using CacheControl.
For more details, please refer to the
project site https://github.com/ionrock/cachecontrol
and the
documentation http://cachecontrol.readthedocs.org/en/latest/.

Install the Cache Control package using
`(cd ~/proj/CheatSheetsAndOtherRecipes; pipenv install CacheControl)`


In [1]:
import requests
from cachecontrol import CacheControl


# Plain session used as the uncached baseline. requests.session() is only a
# legacy alias, so instantiate the Session class directly.
sess = requests.Session()

# CacheControl() mounts its caching adapter on the session it is handed and
# returns that very same object. Wrapping `sess` itself would therefore turn the
# baseline into a cached session too, so give the wrapper a session of its own.
cachedSess = CacheControl(requests.Session())

# Fetch the same URL through both sessions so the responses can be compared.
r       = sess.get('http://google.com')
print(r)
rCached = cachedSess.get('http://google.com')
print(rCached)

<Response [200]>
<Response [200]>


In [2]:
import datetime

def changedResponseHeaders(rOrig, rNew):
    """Compare the headers of two responses.

    Both arguments must expose a mapping-like ``headers`` attribute.

    Returns a dict with three lists of header names:
      'new'     -- present in rNew but not in rOrig,
      'missing' -- present in rOrig but not in rNew,
      'changed' -- present in both, with different values
                   (listed in rNew's header order).
    """
    before = rOrig.headers
    after = rNew.headers

    added = [name for name in after.keys() if name not in before]
    dropped = [name for name in before.keys() if name not in after]
    differing = [
        name
        for name in after.keys()
        if name in before.keys() and after[name] != before[name]
    ]

    return {'new': added, 'missing': dropped, 'changed': differing}

def printChangedResponseHeaders(rOrig, rNew):
    """Print how the headers of rNew differ from those of rOrig.

    Prints the current UTC time followed by three sections: headers only in
    rNew, headers only in rOrig, and headers whose value differs.

    Returns the dict produced by changedResponseHeaders(rOrig, rNew) so the
    caller can inspect it further.
    """
    ch = changedResponseHeaders(rOrig, rNew)

    print('Current time:', datetime.datetime.now(tz=datetime.timezone.utc))
    print('Extra headers in the cached response: ', ch['new'])
    for k in ch['new']:
        print('...  {0}: "{1}"'.format(k, rNew.headers[k]))

    print('Missing headers in the cached response: ', ch['missing'])
    for k in ch['missing']:
        print('...  {0}: "{1}"'.format(k, rOrig.headers[k]))

    print('Changed headers in the cached response: ', ch['changed'])
    for k in ch['changed']:
        # 'changed' only lists headers present in both responses, so the
        # original value can be looked up without a fallback.
        print('...  "{0}":'.format(k))
        print('...      Original   : "{0}"'.format(rOrig.headers[k]))
        print('...      Changed to : "{0}"'.format(rNew.headers[k]))

    return ch


crh = printChangedResponseHeaders(r, rCached)



Current time: 2020-07-21 09:13:05.231052+00:00
Extra headers in the cached response:  []
Missing headers in the cached response:  ['P3P']
...  P3P: "CP="This is not a P3P policy! See g.co/p3phelp for more info.""
Changed headers in the cached response:  ['Content-Length', 'Set-Cookie']
...  "Content-Length":
...      Original   : "5614"
...      Changed to : "5611"
...  "Set-Cookie":
...      Original   : "1P_JAR=2020-07-21-09; expires=Thu, 20-Aug-2020 09:13:05 GMT; path=/; domain=.google.com; Secure, NID=204=EXxpbUoYS_CtkuQ9i0B468nVlaslasJfDOdQO8fwBEwqtaa1JFMmlP8Xq9WXOt4Y6fVhIlQgCZ79Yc-At9q7WILHKsrUYDkli0xMFH516rFOzvGYhkpOdYNBgUIQRUtff1rpLrv8kMRwc1_tNod3ow3p6HF5zVG-ocVXbffGqHM; expires=Wed, 20-Jan-2021 09:13:05 GMT; path=/; domain=.google.com; HttpOnly"
...      Changed to : "1P_JAR=2020-07-21-09; expires=Thu, 20-Aug-2020 09:13:05 GMT; path=/; domain=.google.com; Secure"


## Housekeeping and infrastructure setup.

Some boilerplate code.

In [3]:
# Project name; the scratch directory and the log file are named after it.
P_NAME = 'gettingToKnowCacheControl'
# Scratch locations used by this notebook, all rooted under /tmp.
P_TMP_DIR = '/tmp/' + P_NAME
P_CACHE_DIR = P_TMP_DIR + '/cache'
P_PERSISTENCE_DIR = P_TMP_DIR + '/persistence'


def typename(x):
    """Return the (unqualified) name of the type of x."""
    kind = type(x)
    return kind.__name__


def classname(x):
    """Return the qualified name of the class of x."""
    cls = x.__class__
    return cls.__qualname__

#
# Set up logging
#
import os
import logging, logging.handlers

LOGFORMAT = '%(asctime)s %(name)s %(levelname)-8s: %(message)s'
LOGDATEFORMAT = '%Y-%m-%d %H:%M:%S'
# Logger name. Taken from P_NAME so the logger and the log file share one
# correctly spelled name.
NAME = P_NAME

logFormatter = logging.Formatter(fmt=LOGFORMAT, datefmt=LOGDATEFORMAT)
#
logger = logging.getLogger(NAME)
logger.setLevel(logging.DEBUG)

# log to file
logdir = P_TMP_DIR
os.makedirs(logdir, exist_ok=True)
logfile = '{0}/{1}.log'.format(logdir, P_NAME)

# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell must reuse a handler that is already attached; adding a
# second one would write every message to the file twice.
# RotatingFileHandler stores its target as an absolute path in baseFilename.
logFileHandler = next(
    (h for h in logger.handlers
     if isinstance(h, logging.handlers.RotatingFileHandler)
     and h.baseFilename == os.path.abspath(logfile)),
    None,
)
if logFileHandler is None:
    logFileHandler = logging.handlers.RotatingFileHandler(filename=logfile,
                                                          mode='a',
                                                          maxBytes=100*1024, backupCount=5)
    logger.addHandler(logFileHandler)
logFileHandler.setLevel(logging.DEBUG)
logFileHandler.setFormatter(logFormatter)
#



## Use files for persistent caching

The FileCache is similar to the caching mechanism provided by httplib2. 
It requires the `lockfile` package to be installed,
as it prevents multiple threads from writing to the same file at the same time.

You can install this dependency automatically with pip by requesting the filecache extra 
`pipenv install cachecontrol[filecache]`

In [4]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# Directory that holds the on-disk cache; create it if it does not exist yet.
cachedir = P_CACHE_DIR
os.makedirs(cachedir, exist_ok=True)

# Back the caching session with files under cachedir.
fileBackedCache = FileCache(cachedir)
cachedSess = CacheControl(requests.Session(), cache=fileBackedCache)

response = cachedSess.get('http://google.com')
print(response)

<Response [200]>


## Explicit control of Caching Strategy

In [7]:
import datetime
import requests
from cachecontrol import CacheControl
from cachecontrol.heuristics import BaseHeuristic
from cachecontrol.caches.file_cache import FileCache
from email.utils import parsedate, formatdate
import calendar


class OneWeekHeuristic(BaseHeuristic):
    """Caching heuristic that declares every response cacheable for one week.

    The expiry time is derived from the response's own "date" header.
    NOTE(review): BaseHeuristic is expected to merge the dict returned by
    update_headers() into the response headers - confirm against the
    installed cachecontrol version.
    """

    def __init__(self):
        logger.debug('Class "%s" instantiated.',
                     classname(self))

    def update_headers(self, response):
        """Return the header overrides for response.

        Returns a dict with "expires" set to the "date" header plus one week
        and "cache-control" set to "public". Returns an empty dict (no
        overrides) when the response has no parseable "date" header.
        """
        headers = response.headers

        # Log each header under its own name. .get() is used because these
        # headers are optional and a debug message must never raise KeyError.
        for name in ('date', 'expires', 'cache-control'):
            logger.debug('"%s": response header "%s" = %s.',
                         classname(self), name, headers.get(name))

        rawDate = headers.get('date')
        # parsedate() returns None when the value cannot be parsed.
        date = parsedate(rawDate) if rawDate else None
        if date is None:
            logger.debug('"%s": no usable "date" header, leaving headers unchanged.',
                         classname(self))
            return {}

        expires = datetime.datetime(*date[:6]) + datetime.timedelta(weeks=1)
        logger.debug('"%s": modified header "expires" = %s.',
                     classname(self),
                     str(expires))

        return {
            'expires' : formatdate(calendar.timegm(expires.timetuple())),
            'cache-control' : 'public',
        }

    def warning(self, response):
        """Return the warning text that flags a response as heuristically cached."""
        msg = 'Automatically cached! Response is Stale.'
        return '110 - "%s"' % msg

cachedir = P_CACHE_DIR
os.makedirs(cachedir, exist_ok=True)

# Baseline response, fetched through the session created in the first cell.
r       = sess.get('http://google.com')

cachedSess = CacheControl(requests.Session(),
                          heuristic=OneWeekHeuristic(),
                          cache=FileCache(cachedir))

# Prime the cache first. With an empty cache directory the first request has to
# go to the network, so asserting from_cache on it would fail on a clean run
# (e.g. after /tmp has been wiped). When the cache is already warm this extra
# request is simply served from disk.
cachedSess.get('http://google.com')

rCached = cachedSess.get('http://google.com')
assert rCached.from_cache, 'expected the second request to be served from the file cache'


In [8]:
# Report how the cached response's headers differ from the baseline response.
crh = changedResponseHeaders(r, rCached)

utcNow = datetime.datetime.now(tz=datetime.timezone.utc)
print('Current time:', utcNow)

# Headers found on only one side: (section title, result key, response holding the value).
for title, bucket, source in (
    ('Extra headers in the cached response: ', 'new', rCached),
    ('Missing headers in the cached response: ', 'missing', r),
):
    print(title, crh[bucket])
    for k in crh[bucket]:
        print('...  {0}: "{1}"'.format(k, source.headers[k]))

# Headers present on both sides whose values differ.
print('Changed headers in the cached response: ', crh['changed'])
for k in crh['changed']:
    original = r.headers[k] if k in r.headers.keys() else ''
    print('...  "{0}":'.format(k))
    print('...      Original   : "{0}"'.format(original))
    print('...      Changed to : "{0}"'.format(rCached.headers[k]))


Current time: 2020-07-21 09:14:46.273339+00:00
...  P3P: "CP="This is not a P3P policy! See g.co/p3phelp for more info.""
Missing headers in the cached response:  []
Changed headers in the cached response:  ['Date', 'expires', 'cache-control', 'Content-Length', 'Set-Cookie']
...  "Date":
...      Original   : "Tue, 21 Jul 2020 09:14:42 GMT"
...      Changed to : "Tue, 21 Jul 2020 07:10:49 GMT"
...  "expires":
...      Original   : "-1"
...      Changed to : "Tue, 28 Jul 2020 07:10:49 -0000"
...  "cache-control":
...      Original   : "private, max-age=0"
...      Changed to : "public"
...  "Content-Length":
...      Original   : "5616"
...      Changed to : "5639"
...  "Set-Cookie":
...      Original   : "1P_JAR=2020-07-21-09; expires=Thu, 20-Aug-2020 09:14:42 GMT; path=/; domain=.google.com; Secure"
...      Changed to : "NID=204=G3q-wj68d-u3bZi_K_q3VB3o3XtrM45Sn3j9qX-kjka1xe3AF_ie4oRWmdkR6GE_K6JxultbztXhYyGirWVYdIAmBTY3Gt4ruhRzQSF1IT4w0EGVxG-xSrZbJfZ9wp7MV_3TBFfe_E9EJ0ue2gCzroHQqjisU

## Scratchpad for inspecting the caching policy of a resource

In [25]:
# Resources whose caching-related response headers should be inspected.
urls = [
    'http://google.com',
    'http://coop.dk',
    'https://www.iso20022.org/market-identifier-codes',
]
# Header names to show for each resource, when the response carries them.
hk = ['Cache-Control', 'Date', 'Expires', 'Set-Cookie']

for url in urls:
    print('')
    print('url = "{0}"'.format(url))
    r = sess.get(url)
    for k in hk:
        if k in r.headers.keys():
            print(' * {0}: {1}'.format(k, r.headers[k]))
    # To list every header name of the response instead, uncomment:
    # print(r.headers.keys())


url = "http://google.com"
 * Cache-Control: private, max-age=0
 * Date: Tue, 21 Jul 2020 11:24:49 GMT
 * Expires: -1
 * Set-Cookie: 1P_JAR=2020-07-21-11; expires=Thu, 20-Aug-2020 11:24:49 GMT; path=/; domain=.google.com; Secure

url = "http://coop.dk"
 * Cache-Control: private
 * Date: Tue, 21 Jul 2020 11:24:48 GMT
 * Set-Cookie: RichRelSession=545eb02e-08c6-4e45-8058-5b83477e0689; domain=local.dk; expires=Thu, 20-Aug-2020 11:24:49 GMT; path=/, rr_sessionId=ce053fec-9194-4c03-86bf-d6cab69ef43e; domain=local.dk; expires=Thu, 20-Aug-2020 11:24:49 GMT; path=/

url = "https://www.iso20022.org/market-identifier-codes"
 * Cache-Control: private, no-cache, must-revalidate
 * Date: Tue, 21 Jul 2020 11:24:49 GMT
 * Expires: Tue, 21 Jul 2020 11:24:49 GMT
