Given a .csv file full of image URLs, look up each URL's archived copy on the
                 Wayback Machine and then download the image

Authors: Giovanni Castro, Josh Cheung

In [None]:
!pip install waybackpy

import csv
import os # to create the output directory
import shutil # to save it locally

# to parallelize downloads
import multiprocessing

import requests # to get image from the web
import waybackpy # interface with wayback machine API
from joblib import Parallel, delayed
from tqdm import tqdm

Collecting waybackpy
  Downloading https://files.pythonhosted.org/packages/0c/9f/6fa786f18a7a08b319ed1ddf0673f9335b1f50745865c764275e7316edb2/waybackpy-2.4.2-py3-none-any.whl
Installing collected packages: waybackpy
Successfully installed waybackpy-2.4.2


In [None]:
# mount shared drive
# (Colab-only step: the in_dir / out_dir paths used below live under /content/drive/)
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
#
# define parameters
#
num_images = 1500  # upper bound on how many URLs are read from the csv
### our cities:  london  new_delhi   new_york   san_francisco   tokyo
city = "san_francisco" # exactly as in the csv file name
in_dir = "/content/drive/Shared drives/Team_2_cs121S21/city perception database/imageURLs/"  # must end in "/"
# out_dir must end in "/" as well: download() appends the file name to it directly
out_dir = f"/content/drive/Shared drives/Team_2_cs121S21/code/data/streets_database/{num_images}/{city}/"
in_filename = f"{city}_URL.csv"

# full path of the csv file that holds the image URLs
path_to_read = in_dir + in_filename

In [None]:
#
# read in .csv file filled with URLs
#

# keep the first column of the first <num_images> non-empty rows
url_list = []
with open(path_to_read, newline="") as csvfile:
    for row in csv.reader(csvfile):
        # check the limit first so that num_images = 0 really reads nothing
        if len(url_list) >= num_images:
            break
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError
            continue
        url_list.append(row[0])

# show a count and a short preview instead of dumping every URL
print(f"read {len(url_list)} URLs from {path_to_read}")
print(url_list[:5])

['http://static.panoramio.com/photos/original/10002316.jpg', 'http://static.panoramio.com/photos/original/10002389.jpg', 'http://static.panoramio.com/photos/original/10011923.jpg', 'http://static.panoramio.com/photos/original/10013438.jpg', 'http://static.panoramio.com/photos/original/10013470.jpg', 'http://static.panoramio.com/photos/original/10014602.jpg', 'http://static.panoramio.com/photos/original/10014624.jpg', 'http://static.panoramio.com/photos/original/10014669.jpg', 'http://static.panoramio.com/photos/original/10014756.jpg', 'http://static.panoramio.com/photos/original/10017149.jpg', 'http://static.panoramio.com/photos/original/10017299.jpg', 'http://static.panoramio.com/photos/original/10018182.jpg', 'http://static.panoramio.com/photos/original/10018188.jpg', 'http://static.panoramio.com/photos/original/10027397.jpg', 'http://static.panoramio.com/photos/original/10027954.jpg', 'http://static.panoramio.com/photos/original/10028044.jpg', 'http://static.panoramio.com/photos/ori

In [None]:
def get_wayback(url):
  """
  Builds a waybackpy.Url for <url> (identifying as a desktop Chrome
  browser) and returns its string form, i.e. the wayback version of
  the input url.
  """
  browser_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36"
  archived = waybackpy.Url(url, browser_agent)
  return str(archived)

In [None]:
def download(url, save_dir, timeout=30):
  """
  downloads a single image from <url> and saves it in <save_dir>

  NOTE: <save_dir> must exist before calling this function and must end
        in "/", because the file name is appended to it directly.

  url      -- address of the image; the text after the last "/" is used as the file name
  save_dir -- existing directory (with trailing "/") the image is written into
  timeout  -- seconds to wait for the server before giving up (handed to requests.get)

  returns 0 if the image was saved,
          1111111 if the server answered with a status other than 200,
          2222222 if the request or the write raised an error (the image is skipped)
  """
  filename = url.split("/")[-1]
  filename = save_dir + filename

  try:
    # stream = True returns the body as a stream; the with-block closes the
    # response afterwards, and the timeout keeps a stalled server from
    # blocking a worker forever.
    with requests.get(url, stream = True, timeout = timeout) as r:
      # Check if the image was retrieved successfully
      if r.status_code != 200:
        return 1111111

      # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
      r.raw.decode_content = True

      # Open a local file with wb ( write binary ) permission.
      with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
      return 0
  except Exception as exc:
    # best effort: report why the image was skipped and keep going.
    # Exception (not a bare except) lets KeyboardInterrupt / SystemExit through.
    print(f"skipped {filename}: {exc!r}")
    return 2222222
        


In [None]:
# CPU count reported by the runtime; used below as n_jobs for both Parallel runs
num_cores = multiprocessing.cpu_count()
print(num_cores)

2


In [None]:
#
# look up the wayback url of every entry in url_list, num_cores at a time
#
url_list_tqdm = tqdm(url_list)  # wraps the list so progress is shown while it is consumed

lookup_jobs = (delayed(get_wayback)(url) for url in url_list_tqdm)
wayback_list = Parallel(n_jobs=num_cores)(lookup_jobs)

100%|██████████| 1500/1500 [10:10<00:00,  2.46it/s]


In [None]:
#
# access, download, and save images to out_dir
#

# download() needs the target directory to exist; out_dir changes with
# num_images and city, so create it here instead of relying on a manual step
# (otherwise every write fails and is only reported as "skipped")
os.makedirs(out_dir, exist_ok=True)

wayback_list_tqdm = tqdm(wayback_list)

# may take a little while after this block is run for images to appear in drive
save_files = Parallel(n_jobs=num_cores)(delayed(download)(url, out_dir) for url in wayback_list_tqdm)

 70%|██████▉   | 1048/1500 [31:52<12:38,  1.68s/it]

In [None]:
# summarize the status codes returned by download() instead of dumping one number per image:
#   0 = saved, 1111111 = response status was not 200, 2222222 = request or write raised an error
num_saved = sum(code == 0 for code in save_files)
num_bad_status = sum(code == 1111111 for code in save_files)
num_skipped = sum(code == 2222222 for code in save_files)
print(f"saved: {num_saved}   non-200 response: {num_bad_status}   skipped on error: {num_skipped}   total: {len(save_files)}")