In [1]:
import boto3
import concurrent.futures

from path import Path

# Name of the S3 bucket the indexed files are downloaded from.
BUCKET_NAME = 'safe-ucosp-2017'
# Parent of the current working directory; the index file and the cache
# directory are both resolved relative to this path.
PROJECT_DIR = Path.getcwd().parent # Could assert some things here to check we got the right path

In [3]:
# Build the S3 handle from a named credentials profile.
session = boto3.Session(profile_name='default')  # Change to match the profile name in ~/.aws/credentials
s3 = session.resource('s3')
# `bucket` is the object every download call in this notebook goes through.
bucket = s3.Bucket(BUCKET_NAME)

In [4]:
# Location of the index file, which lists one S3 key per line.
index_file_path = PROJECT_DIR / 'file_index.txt'
assert index_file_path.exists(), 'Index File is missing'
index_file_path

Path('/home/bird/Dev/mozilla/sb2018/file_index.txt')

In [9]:
# Load every line of the index into memory (`index` is reused by later cells),
# then preview the first five entries.
with open(index_file_path, 'r') as index_file:
    index = list(index_file)

for file_name in index[:5]:
    print(file_name.rstrip())

1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json
1_000014b53a60c645e3ac9bde6bae020430c930b3cc5903677e0d5cb2.json
1_00003e3765a73da45db5265de2b22424e025d61380f7cf8080b378aa.json
1_00004636d8310609e710934f194bfb41a5f0ac7ed5e05c0fb9047e48.json
1_00004b8315fd1954f06dd80b85ebc61f7ab006785cd3cf37dd59f789.json


### Download all files

    # Original serial approach, kept for reference only: walk the index one
    # key at a time and download anything not already in the local cache.
    previously_downloaded = []
    freshly_downloaded = []
    with open(index_file_path, 'r') as index_file:
        contents = index_file.readlines()
        for line in contents:
            key = line.rstrip()  # strip trailing whitespace (incl. the newline) to get the S3 key
            cache_file_path = Path.joinpath(PROJECT_DIR, 'cache', key)
            if cache_file_path.exists():
                previously_downloaded.append(key)
            else:
                freshly_downloaded.append(key)
                bucket.download_file(Key=key, Filename=cache_file_path)
    
    # Took around 8 min to download 2,000 files; at that rate the ~2 million
    # files in the index would take several days.
    # Need to use threading to get this moving fast enough.

In [42]:
# Result accumulators shared by `download` and the thread-pool driver:
#   errored  - index lines whose download raised
#   existing - keys that were already present in the cache
#   new      - keys fetched from S3 during this run
errored, existing, new = [], [], []


def download(line):
    """Fetch one indexed file into the local cache unless it is already there.

    Args:
        line: A raw line from the index file; trailing whitespace is stripped
            to obtain the S3 key.

    Returns:
        True if the file was downloaded by this call, False if a file with
        that name already existed under PROJECT_DIR/cache.

    Side effects:
        Appends the key to `new` (after a successful download) or to
        `existing`. Any exception raised by the download propagates to the
        caller.
    """
    s3_key = line.rstrip()
    local_path = Path.joinpath(PROJECT_DIR, 'cache', s3_key)
    if not local_path.exists():
        bucket.download_file(Key=s3_key, Filename=local_path)
        new.append(s3_key)
        return True
    existing.append(s3_key)
    return False

# Size of the download thread pool (the value used for the original run).
MAX_WORKERS = 25

# Maps each failed index line to the exception it raised, so failures can be
# diagnosed afterwards instead of being silently discarded.
error_details = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    print("Building future_map")
    # One future per index line; the dict lets us recover the line for a
    # given future once it completes.
    future_map = {executor.submit(download, line): line for line in index}
    print("Finished building future_map")
    for future in concurrent.futures.as_completed(future_map):
        line = future_map[future]
        try:
            # result() re-raises whatever the worker raised.
            future.result()
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            errored.append(line)
            error_details[line] = exc

Building future_map
Finished building future_map


In [43]:
# Index lines whose download raised an exception during the threaded run.
errored

['1_0f69f683adf139491cd701484c178d646350fed7b182243ad9b6fea8.json\n',
 '1_0f6a062c9e4b1cedb2de540150224b701b742b8626732f0f6fc34919.json\n']

Downloading 2,059,733 files took ~8 hours.

Could try a higher `max_workers` value next time to see whether it speeds things up.

### Try re-downloading the errored files; they may have failed only because of a network problem

In [44]:
# Retry the first failed line; download() returns True when it fetched the file.
download(errored[0])

True

In [45]:
# Retry the second failed line; download() returns True when it fetched the file.
download(errored[1])

True

### Same number of files in index and on disk?

In [46]:
# Number of lines read from the index file.
len(index)

2059735

In [49]:
# Show only the summary line of `tree` for the cache directory (resolved relative to the working directory).
!tree ../cache | tail -n1

0 directories, 2059735 files


### Notes

I am not an expert at using concurrent.futures. In other notebooks I've also used the ProcessPoolExecutor. It may have been faster with ProcessPoolExecutor, but I didn't try anything different. It downloaded overnight, and that was good enough for my needs.

Two files failed to download, probably because my internet connection dropped briefly. I had no problems re-downloading them.
