In [1]:
# Copyright (c) Microsoft Corporation. All rights reserved
# Licensed under the MIT License.
import pandas as pd

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# Read the CSV and collect the values of its "image_fn" column as a plain
# list, with every "http://" occurrence rewritten to "https://" so that the
# requests issued against these URLs go over TLS.
df = pd.read_csv("../data/naip_most_recent_100cm.csv")
urls = [link.replace("http://", "https://") for link in df["image_fn"].values]

In [4]:
# Build a shared `requests` session (`http`) whose requests are automatically
# retried on transient failures. Pattern adapted from:
# https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure
retry_kwargs = dict(
    total=3,  # give up after three retries per request
    status_forcelist=[429, 500, 502, 503, 504],  # status codes that trigger a retry
)
retryable_methods = ["HEAD", "GET", "OPTIONS"]
try:
    # urllib3 >= 1.26 calls this argument `allowed_methods`; the old
    # `method_whitelist` spelling was removed in urllib3 2.0 and raises
    # TypeError there.
    retry_strategy = Retry(allowed_methods=retryable_methods, **retry_kwargs)
except TypeError:
    # Older urllib3 (< 1.26) only understands the legacy keyword.
    retry_strategy = Retry(method_whitelist=retryable_methods, **retry_kwargs)

# Mount the same retrying adapter for both schemes so every request made
# through `http` shares the retry policy.
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

In [5]:
%%time
# Sum the size, in bytes, of every file in `urls` by issuing a HEAD request
# per URL and reading the Content-Length header (no file bodies are
# downloaded). The result is accumulated in `total_size`.

# (connect, read) timeouts in seconds. Without a timeout a single stalled
# connection would block the whole loop indefinitely.
REQUEST_TIMEOUT = (10, 60)

total_size = 0
for i, url in enumerate(urls):
    # Progress report every 1000 URLs.
    if i % 1000 == 0:
        print("%d/%d -- %0.2f%%"  % (i, len(urls), i/len(urls)*100))

    # The `with` block guarantees the response is closed even if the status
    # check or the header parsing below raises.
    with http.head(
        url,
        headers={'Connection':'close'},
        timeout=REQUEST_TIMEOUT,
    ) as response:
        # Fail loudly on 4xx/5xx instead of adding the Content-Length of an
        # error response to the total.
        response.raise_for_status()
        size = int(response.headers["Content-Length"])
    total_size += size

0/212354 -- 0.00%
1000/212354 -- 0.47%
2000/212354 -- 0.94%
3000/212354 -- 1.41%
4000/212354 -- 1.88%
5000/212354 -- 2.35%
6000/212354 -- 2.83%
7000/212354 -- 3.30%
8000/212354 -- 3.77%
9000/212354 -- 4.24%
10000/212354 -- 4.71%
11000/212354 -- 5.18%
12000/212354 -- 5.65%
13000/212354 -- 6.12%
14000/212354 -- 6.59%
15000/212354 -- 7.06%
16000/212354 -- 7.53%
17000/212354 -- 8.01%
18000/212354 -- 8.48%
19000/212354 -- 8.95%
20000/212354 -- 9.42%
21000/212354 -- 9.89%
22000/212354 -- 10.36%
23000/212354 -- 10.83%
24000/212354 -- 11.30%
25000/212354 -- 11.77%
26000/212354 -- 12.24%
27000/212354 -- 12.71%
28000/212354 -- 13.19%
29000/212354 -- 13.66%
30000/212354 -- 14.13%
31000/212354 -- 14.60%
32000/212354 -- 15.07%
33000/212354 -- 15.54%
34000/212354 -- 16.01%
35000/212354 -- 16.48%
36000/212354 -- 16.95%
37000/212354 -- 17.42%
38000/212354 -- 17.89%
39000/212354 -- 18.37%
40000/212354 -- 18.84%
41000/212354 -- 19.31%
42000/212354 -- 19.78%
43000/212354 -- 20.25%
44000/212354 -- 20.72%


In [6]:
# Report the accumulated size twice: as an exact byte count, and in decimal
# terabytes (1 TB = 1e12 bytes) rounded to two decimal places.
total_size_tb = total_size / 1e12
print("%d bytes" % total_size)
print("%0.2f TB" % total_size_tb)

41855747596045 bytes
41.86 TB
