# Data generation

We are going to download GitHub activity from the [GitHub archive](https://www.githubarchive.org).

Each hour of each day gets a file, in which each line is a JSON-serialized record of an event on GitHub - comments, pull requests, commits, etc.

We define a `fetch_one` function to fetch a single file into our `../data` directory,
and parallelize many calls to `fetch_one` with
our `fetch_range` function via `ThreadPoolExecutor.map`.

In [1]:
from datetime import datetime, timedelta
from glob import glob
import gzip
import os
import shutil

import requests

# Directory (relative to the notebook) where decompressed hourly files are stored.
data = os.path.join('..', 'data')
# exist_ok avoids the check-then-create race and is a no-op on re-runs.
os.makedirs(data, exist_ok=True)


# Templates for the remote archive URL and the local filename of one hour of data.
# The hour is deliberately not zero-padded in either template.
url_t = 'http://data.githubarchive.org/{year:04}-{month:02}-{day:02}-{hour}.json.gz'
fn_t =  os.path.join(data, 'data-{year:04}-{month:02}-{day:02}-{hour}.json')


# Pick 11 July 2015, last year's SciPy sprints
day1 = datetime(year=2015, month=7, day=11)

def fetch_one(timestamp=day1):
    """Fetch a single hour of github data.

    Downloads the gzip-compressed archive for the hour identified by
    ``timestamp`` and writes it, decompressed, to the data directory.

    timestamp: datetime (only year, month, day and hour are used)

    Returns filename downloaded.

    No-op if already downloaded.

    Raises requests.HTTPError if the server responds with an error status.
    """
    # build URL and filename from date
    ns = dict(
        year = timestamp.year,
        month = timestamp.month,
        day = timestamp.day,
        hour = timestamp.hour,
    )
    url = url_t.format(**ns)
    fn = fn_t.format(**ns)

    if os.path.exists(fn):
        # already downloaded
        return fn

    # Write to a temporary name first: if the download is interrupted, no
    # truncated file is left under the final name to be mistaken for a
    # completed download by the check above.
    tmp_fn = fn + '.part'

    # download the gzip-compressed json data
    resp = requests.get(url, stream=True)
    try:
        resp.raise_for_status()
        with gzip.open(resp.raw, mode='rb') as r:
            with open(tmp_fn, 'wb') as w:
                shutil.copyfileobj(r, w)
    except BaseException:
        # best-effort cleanup of the partial file, then propagate the error
        try:
            os.remove(tmp_fn)
        except OSError:
            pass
        raise
    finally:
        # release the streamed connection whether or not the copy succeeded
        resp.close()

    # atomically publish the completed file under its final name
    os.replace(tmp_fn, fn)
    return fn
    
def fetch_range(ex, start=day1, hours=24):
    """Fetch consecutive hourly datasets, beginning at a given datetime.

    One `fetch_one` call is scheduled per hour through the Executor's
    `map`, so the downloads are parallelized by the Executor.

    ex: Executor
    start: datetime of the first hour to fetch
    hours: integer number of consecutive hours to fetch

    Returns whatever `ex.map` returns for the scheduled calls.
    """
    timestamps = [start + timedelta(hours=offset) for offset in range(hours)]
    return ex.map(fetch_one, timestamps)


In [2]:
from concurrent.futures import ThreadPoolExecutor
# Thread pool used to run the downloads concurrently; no max_workers is
# passed, so the library's default worker count applies.
ex = ThreadPoolExecutor()

In [3]:
%%time

# Running sum of the decompressed sizes, in MB.
total = 0

# Fetch the default range and report each file's on-disk size.
for fn in fetch_range(ex):
    MB = os.path.getsize(fn) / (1024 * 1024)
    total += MB
    print("%6.1f MB %s" % (MB, fn))
print("%6.1f MB total" % total)

# Every .json file currently present in the data directory.
filenames = glob(os.path.join(data, '*.json'))

  36.7 MB ../data/data-2015-07-11-0.json
  34.1 MB ../data/data-2015-07-11-1.json
  30.3 MB ../data/data-2015-07-11-2.json
  28.9 MB ../data/data-2015-07-11-3.json
  27.7 MB ../data/data-2015-07-11-4.json
  25.0 MB ../data/data-2015-07-11-5.json
  30.5 MB ../data/data-2015-07-11-6.json
  28.8 MB ../data/data-2015-07-11-7.json
  30.3 MB ../data/data-2015-07-11-8.json
  29.6 MB ../data/data-2015-07-11-9.json
  27.8 MB ../data/data-2015-07-11-10.json
  29.5 MB ../data/data-2015-07-11-11.json
  31.7 MB ../data/data-2015-07-11-12.json
  37.2 MB ../data/data-2015-07-11-13.json
  39.6 MB ../data/data-2015-07-11-14.json
  42.8 MB ../data/data-2015-07-11-15.json
  42.5 MB ../data/data-2015-07-11-16.json
  40.6 MB ../data/data-2015-07-11-17.json
  37.9 MB ../data/data-2015-07-11-18.json
  40.3 MB ../data/data-2015-07-11-19.json
  40.3 MB ../data/data-2015-07-11-20.json
  39.8 MB ../data/data-2015-07-11-21.json
  37.2 MB ../data/data-2015-07-11-22.json
  30.3 MB ../data/data-2015-07-11-23.json
 8

In [4]:
import json
import pprint
# Preview up to the first 10 events of one downloaded file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON text is assumed to be UTF-8).
with open(filenames[0], encoding='utf-8') as f:
    # zip with range() stops after 10 lines, or earlier if the file is
    # shorter, instead of passing an empty string to json.loads.
    for _, line in zip(range(10), f):
        event = json.loads(line)
        event['payload'] = 'redacted' # payload is big
        pprint.pprint(event)

{'actor': {'avatar_url': 'https://avatars.githubusercontent.com/u/12275921?',
           'gravatar_id': '',
           'id': 12275921,
           'login': 'peterpih',
           'url': 'https://api.github.com/users/peterpih'},
 'created_at': '2015-07-11T12:00:00Z',
 'id': '2966020339',
 'payload': 'redacted',
 'public': True,
 'repo': {'id': 37070319,
          'name': 'peterpih/Miscellaneous',
          'url': 'https://api.github.com/repos/peterpih/Miscellaneous'},
 'type': 'PushEvent'}
{'actor': {'avatar_url': 'https://avatars.githubusercontent.com/u/2352034?',
           'gravatar_id': '',
           'id': 2352034,
           'login': 'henricavalcante',
           'url': 'https://api.github.com/users/henricavalcante'},
 'created_at': '2015-07-11T12:00:00Z',
 'id': '2966020340',
 'org': {'avatar_url': 'https://avatars.githubusercontent.com/u/10598228?',
         'gravatar_id': '',
         'id': 10598228,
         'login': 'maisquestoes',
         'url': 'https://api.github.com/orgs/