In [2]:
import requests
import pandas as pd
import os
import xml.etree.ElementTree as ET
from datetime import datetime
import io

In [None]:
# Configuration
# Local folder that receives the downloaded files.
DATA_DIR = "data"
# Root of the public TfL cycling-data S3 bucket. A GET on this URL returns the
# XML object listing; appending an object key to it gives that object's URL.
BUCKET_URL = "https://s3-eu-west-1.amazonaws.com/cycling.data.tfl.gov.uk/"


In [12]:
# Create the download folder; exist_ok makes this a no-op on re-runs and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(DATA_DIR, exist_ok=True)

# 1. Get list of all files
print("Fetching file list...")
# A timeout keeps the cell from hanging forever if the bucket is unreachable.
response = requests.get(BUCKET_URL, timeout=30)
# Stop on an HTTP error here; otherwise the error body would be parsed below
# as if it were a listing and simply yield no <Contents> entries.
response.raise_for_status()
root = ET.fromstring(response.content)

# The listing is a single page; S3 sets <IsTruncated>true</IsTruncated> when
# more keys exist than were returned. Warn rather than silently miss files.
if root.findtext('{http://s3.amazonaws.com/doc/2006-03-01/}IsTruncated') == 'true':
    print("Warning: bucket listing is truncated; some keys are missing from this page.")


Fetching file list...


In [None]:
 
 # Debug aid: show the first 500 bytes of the raw listing to see its XML layout.
 print(response.content[:500])

b'<?xml version="1.0" encoding="UTF-8"?>\n<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Name>cycling.data.tfl.gov.uk</Name><Prefix></Prefix><Marker></Marker><MaxKeys>1000</MaxKeys><IsTruncated>false</IsTruncated><Contents><Key>ActiveTravelCountsProgramme/</Key><LastModified>2023-06-05T10:35:42.000Z</LastModified><ETag>&quot;d41d8cd98f00b204e9800998ecf8427e&quot;</ETag><Size>0</Size><StorageClass>STANDARD</StorageClass></Contents><Contents><Key>ActiveTravelCountsProgramme/0 Stra'


In [11]:
# Prefix -> URI map handed to ElementTree's find*/iterfind calls so that
# paths can be written as 's3:Tag' instead of '{uri}Tag'.
namespace = dict(
    s3='http://s3.amazonaws.com/doc/2006-03-01/',
)

In [7]:
# 2. Filter and Download
# Collect the direct <Contents> children of the listing root (one per object)
# and record how many there are.
contents = list(root.iterfind('s3:Contents', namespace))
total = len(contents)

In [8]:
# Inspection only: display the list of <Contents> elements collected above.
contents

[<Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129466570>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129466930>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129466bb0>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129466e30>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x1294670b0>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129467330>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x1294675b0>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129467830>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129467ab0>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129467d30>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x129467fb0>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}Contents' at 0x12946c270>,
 <Element '{http://s3.amazonaws.com/doc/2006-03-01/}

In [14]:
# 2. Filter and Download
# Pull the object key (the text of the <Key> child) out of every <Contents> entry.
contents = root.findall('s3:Contents', namespace)
all_keys = []
for entry in contents:
    key_element = entry.find('s3:Key', namespace)
    all_keys.append(key_element.text)

# Substrings a key must contain (compared against the lower-cased key).
KEYWORDS = ["journey", "data", "extract"]
# Years of interest, as strings: "2018" through "2023".
years = [str(year) for year in range(2018, 2024)]

In [19]:
# Inspection only: display every object key read from the bucket listing.
all_keys

['ActiveTravelCountsProgramme/',
 'ActiveTravelCountsProgramme/0 Strategic active travel counts - release note.pdf',
 'ActiveTravelCountsProgramme/0.5 Strategic cycling estimates - methodology note.pdf',
 'ActiveTravelCountsProgramme/1 Monitoring locations.csv',
 'ActiveTravelCountsProgramme/2 Availability matrix.xlsx',
 'ActiveTravelCountsProgramme/2014 Q1 (Jan-Mar)-Central.csv',
 'ActiveTravelCountsProgramme/2014 Q2 spring (Apr-Jun)-Central.csv',
 'ActiveTravelCountsProgramme/2014 Q3 (Jul-Sep)-Central.csv',
 'ActiveTravelCountsProgramme/2014 Q4 autumn (Oct-Dec)-Central.csv',
 'ActiveTravelCountsProgramme/2014 Q4 autumn (Oct-Dec)-Cycleways.csv',
 'ActiveTravelCountsProgramme/2015 Q1 (Jan-Mar)-Central.csv',
 'ActiveTravelCountsProgramme/2015 Q2 spring (Apr-Jun)-Central.csv',
 'ActiveTravelCountsProgramme/2015 Q2 spring (Apr-Jun)-Cycleways.csv',
 'ActiveTravelCountsProgramme/2015 Q2 spring (Apr-Jun)-Inner.csv',
 'ActiveTravelCountsProgramme/2015 Q2 spring (Apr-Jun)-Outer.csv',
 'ActiveT

In [16]:
# Select the keys to download. A key is kept only if, case-insensitively, it
#   * ends with '.csv',
#   * contains 'usage-stats',
#   * contains every entry of KEYWORDS, and
#   * contains at least one entry of years.
all_necessary_keys = []
for key in all_keys:
    key_lower = key.lower()
    if not key_lower.endswith('.csv'):
        continue
    if 'usage-stats' not in key_lower:
        continue
    if not all(word in key_lower for word in KEYWORDS):
        continue
    if not any(year in key_lower for year in years):
        continue
    all_necessary_keys.append(key)


In [20]:
# Inspection only: display the keys that passed the filter.
all_necessary_keys

['usage-stats/100JourneyDataExtract07Mar2018-13Mar2018.csv',
 'usage-stats/101JourneyDataExtract14Mar2018-20Mar2018.csv',
 'usage-stats/102JourneyDataExtract21Mar2018-27Mar2018.csv',
 'usage-stats/103JourneyDataExtract28Mar2018-03Apr2018.csv',
 'usage-stats/104JourneyDataExtract04Apr2018-10Apr2018.csv',
 'usage-stats/105JourneyDataExtract11Apr2018-17Apr2018.csv',
 'usage-stats/106JourneyDataExtract18Apr2018-24Apr2018.csv',
 'usage-stats/107JourneyDataExtract25Apr2018-01May2018.csv',
 'usage-stats/108JourneyDataExtract02May2018-08May2018.csv',
 'usage-stats/109JourneyDataExtract09May2018-15May2018.csv',
 'usage-stats/110JourneyDataExtract16May2018-22May2018.csv',
 'usage-stats/111JourneyDataExtract23May2018-29May2018.csv',
 'usage-stats/112JourneyDataExtract30May2018-05June2018.csv',
 'usage-stats/113JourneyDataExtract06June2018-12June2018.csv',
 'usage-stats/114JourneyDataExtract13June2018-19June2018.csv',
 'usage-stats/115JourneyDataExtract20June2018-26June2018.csv',
 'usage-stats/116

In [18]:
# Leftover scratch cell: enumerate() is lazy, so this displays only the
# enumerate object's repr, not the (index, key) pairs it would yield.
enumerate(all_necessary_keys)

<enumerate at 0x12a13e700>

In [21]:
# Download every selected object into DATA_DIR.
#
# * Keys contain '/', so they are flattened to a single file name with '_'.
# * A file that already exists at its final path is skipped, so re-running the
#   notebook does not fetch everything again. Delete a file to force a re-download
#   (e.g. one left truncated by an earlier, interrupted run of the old loop).
# * Data is streamed to '<name>.part' and renamed only after the body has been
#   fully written, so an interrupted run never leaves a truncated file at the
#   final path.
# * Non-200 responses are reported and collected in failed_keys instead of
#   being dropped silently.
failed_keys = []
for i, key in enumerate(all_necessary_keys):
    filename = key.replace('/', '_')
    filepath = os.path.join(DATA_DIR, filename)

    if os.path.exists(filepath):
        print(f"[{i+1}/{len(all_necessary_keys)}] Skipping {key} (already downloaded)")
        continue

    print(f"[{i+1}/{len(all_necessary_keys)}] Downloading {key}...")

    partial_path = filepath + ".part"
    try:
        # timeout bounds the connect and each read so one stalled transfer
        # cannot hang the whole loop.
        with requests.get(BUCKET_URL + key, stream=True, timeout=60) as r:
            if r.status_code != 200:
                print(f"    HTTP {r.status_code} for {key}; file not saved")
                failed_keys.append(key)
                continue
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the finished download under its final name.
        os.replace(partial_path, filepath)
    finally:
        # Remove a leftover partial file if the transfer was cut short.
        if os.path.exists(partial_path):
            os.remove(partial_path)

if failed_keys:
    print(f"{len(failed_keys)} of {len(all_necessary_keys)} downloads failed: {failed_keys}")

[1/299] Downloading usage-stats/100JourneyDataExtract07Mar2018-13Mar2018.csv...
[2/299] Downloading usage-stats/101JourneyDataExtract14Mar2018-20Mar2018.csv...
[3/299] Downloading usage-stats/102JourneyDataExtract21Mar2018-27Mar2018.csv...
[4/299] Downloading usage-stats/103JourneyDataExtract28Mar2018-03Apr2018.csv...
[5/299] Downloading usage-stats/104JourneyDataExtract04Apr2018-10Apr2018.csv...
[6/299] Downloading usage-stats/105JourneyDataExtract11Apr2018-17Apr2018.csv...
[7/299] Downloading usage-stats/106JourneyDataExtract18Apr2018-24Apr2018.csv...
[8/299] Downloading usage-stats/107JourneyDataExtract25Apr2018-01May2018.csv...
[9/299] Downloading usage-stats/108JourneyDataExtract02May2018-08May2018.csv...
[10/299] Downloading usage-stats/109JourneyDataExtract09May2018-15May2018.csv...
[11/299] Downloading usage-stats/110JourneyDataExtract16May2018-22May2018.csv...
[12/299] Downloading usage-stats/111JourneyDataExtract23May2018-29May2018.csv...
[13/299] Downloading usage-stats/112J