In [0]:
# Import statements for connecting to google cloud. This should give you a link
# for authentication. Click on the link and sign in with your google account
# that is connected to the google bucket containing your data.
from google.colab import auth
from google.cloud import storage
auth.authenticate_user()

# First, we need to set our project. Replace the assignments below with the
# project ID and bucket name to access for your data.
project_id = 'humanitarian-tent'
bucket_name = 'tent-bucket'
!gcloud config set project {project_id}
client = storage.Client(project_id)
bucket = client.get_bucket(bucket_name)

# 0. Downloading Digital Globe Data

In [0]:
# Required import statements. Note that this colab can use any runtime!
import urllib2
import re
from google.cloud import storage
from google.cloud.storage import Blob
import time

## Extracting URLs to Download
Digital Globe does not make it easy to download content (even with their export options). Luckily, we can just get all the URLs we need to download and then use a standard request library in python to pull all the data down from their platform. Before we get started, however, we need to be able to first get the URLs, which also implicitly include an access token for a session that you will need to make consecutive requests.



1. Go to your library of files
2. If your files are in folders, you will have to expand all the folders (do this colab in sections if you have many folders)
3.   **On Chrome** Ctrl Click on one of the files you want to pull down and select 'Inspect Element.' This should give you a panel which has a bunch of HTML
4. Within the highlighted HTML should be something with a URL starting with something like 'https://services.digitalglobe.com/..../<your_tif_file.tif.''
5. Copy paste that URL for the first cell in the WGET section. We will continue on extracting everything in the later section



## Example WGET
Now that you have an example URL you want to pull down, let's try to run an example WGET request to pull the file off of Digital Globe and the upload it in to your Google Bucket.

In [0]:
# Example URL - you will have to replace this with the URL you got from your library above
url = "https://services.digitalglobe.com/earthservice/kmlaccess/library/request/205615/4842545/?connectId=9e9c4948-da83-4e7b-a558-0c8a71ac3bb2&DGTOKEN=2ea34dc72e4dd768d419242278259b41cadd3d5733d6f211885a3ea8dc3a7436&requestType=undefined&retrievePath=/content/com/library/9e9c4948-da83-4e7b-a558-0c8a71ac3bb2/103001006758EF00_20669/103001006758EF00_R1C1.tif"
file_name = url.split("/")[-1]

In [0]:
file_name

In [0]:
resp = urllib2.urlopen(url)

In [0]:
resp.info().type

In [0]:
blob = bucket.blob(file_name)

In [0]:
blob.upload_from_string(resp.read(), content_type=resp.info().type)

In [0]:
# Ensure we have all the expected data in this folder
!gsutil ls gs://{bucket_name}

## Scraped URLS
Now that we know we can scrape one file from a URL, let's try a whole bunch! Now instead of inspecting a single element and copying that single URL, we are instead going to Ctrl click and select the view page source option. Copy and paste all that HTML code in to a separate file. Then upload it to your drive (in the same folder as this colab).

In [21]:
# We called our HTML output digitalglobe.html but call it whatever you want and
# and replace the name and filepath for your file in your drive below
from google.colab import drive
drive.mount('/content/gdrive')

dg_url_pattern = r"https:\/\/services.digitalglobe.com\/\S+.tif"
# !cd /content/gdrive/My\ Drive/Colab\ Notebooks/ && ls
with open('/content/gdrive/My Drive/Colab Notebooks/digitalglobe.html', 'rb') as f:
  content = '\n'.join(f.readlines())
  matches = re.findall(dg_url_pattern, content)
matches = list(set(matches))
print("There are %d URLS from this source!" % len(matches))
print("Sample URL:")
print(matches[0])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
There are 121 URLS from this source!
Sample URL:
https://services.digitalglobe.com/earthservice/kmlaccess/library/request/205559/4841200/?connectId=9e9c4948-da83-4e7b-a558-0c8a71ac3bb2&amp;DGTOKEN=2ea34dc72e4dd768d419242278259b41cadd3d5733d6f211885a3ea8dc3a7436&amp;requestType=undefined&amp;retrievePath=/content/com/library/9e9c4948-da83-4e7b-a558-0c8a71ac3bb2/103001006EB13800_20669/103001006EB13800_BROWSE.tif


## Mass Download

In [0]:
def upload_file(bucket, token, url):
  """
  Pull down content from the given URL and upload it in to the bucket. Note that
  this will upload each file to <tile>/<filename> where filename is the existing
  URL extension filename and <tile> is the extracted tile where this file is
  taken from
  bucket: Google bucket object for uploading
  token: The Digital Globe token we extracted in the last section from the most
         recent URL
  url: The pre-collected Digital Globe URL for the file we are interested in
  """
  file_name = url.split("/")[-1]
  url = '%sDGTOKEN=%s&requestType%s' % (url.split('DGTOKEN=')[0], token, url.split('&requestType')[1])
  token = recent_url.split('DGTOKEN=')[1].split('&requestType')[0]
  directory = file_name.split("_")[0]
  resp = urllib2.urlopen(url)
  if not resp:
    print("response empty: %s" % url)
    return
  if resp.info().type != 'application/octet-stream':
    print("response has wrong type: %s" % url)
    return
  blob = bucket.blob('%s/%s' % (directory, file_name))
  blob.upload_from_string(resp.read(), content_type=resp.info().type)

In [0]:
# Retrieve a recent link with the current session token in it (note if you are
# in the same session you can use your existing url or the Digital Globe urls.
# Otherwise, repeat the instructions for getting a single URL at the start of
# the Example WGET section).
recent_url = "https://services.digitalglobe.com/earthservice/kmlaccess/library/request/205611/4842139/?connectId=9e9c4948-da83-4e7b-a558-0c8a71ac3bb2&DGTOKEN=73a95e654bd51395aa1fcf0be071ac7a3b2160d239c43776f04e087ef6634918&requestType=undefined&retrievePath=/content/com/library/9e9c4948-da83-4e7b-a558-0c8a71ac3bb2/1030010061C87300_20669/1030010061C87300_R8C2.tif"

In [0]:
# Parse access from recent url (has to be one whose session is currently running)
token = recent_url.split('DGTOKEN=')[1].split('&requestType')[0]

In [0]:
# Upload our sample urls we collected earlier
for i in range(39, 90):
  print("Processing url at index %d" % i)
  upload_file(bucket, token, sample_urls[i])
  time.sleep(1)

Processing url at index 39
Processing url at index 40
Processing url at index 41
Processing url at index 42
Processing url at index 43
Processing url at index 44
Processing url at index 45
Processing url at index 46
Processing url at index 47
Processing url at index 48
Processing url at index 49
Processing url at index 50
Processing url at index 51
Processing url at index 52
Processing url at index 53
Processing url at index 54
Processing url at index 55
Processing url at index 56
Processing url at index 57
Processing url at index 58
Processing url at index 59
Processing url at index 60
Processing url at index 61
Processing url at index 62
Processing url at index 63
Processing url at index 64
Processing url at index 65
Processing url at index 66
Processing url at index 67
Processing url at index 68
Processing url at index 69
Processing url at index 70
Processing url at index 71
Processing url at index 72
Processing url at index 73
Processing url at index 74
Processing url at index 75
P