# Retrieve Recaptures from Google Images

In [1]:
# Path to the VGDB-2016 metadata CSV; it is loaded with pandas further down.
INFO = '/mnt/files/datasets/vgdb_2016/vgdb_2016.csv'
# Sent as the `imgSize` search parameter when querying Google Images.
IMG_SIZE = 'XXLARGE'

# Sent as the `num` search parameter of each Google Images query.
MAX_RECAPTURES = 10

### Setup

In [2]:
# ! pip install google-cloud-vision

In [3]:
import re
import os
import shutil
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from PIL import Image, ImageFile
from google_images_search import GoogleImagesSearch

In [4]:
# Do not truncate cell contents (the URL columns are long) and show up to 200 rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

# Turn off Pillow's maximum-pixel-count guard: the metadata lists scans
# larger than 10000 px per side.
Image.MAX_IMAGE_PIXELS = None
# Allow Pillow to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

### Discovering Names of Paintings in VGDB_2016

In [5]:
# Load the painting metadata table and preview its first two rows.
info = pd.read_csv(INFO)
info.head(2)

Unnamed: 0,PageID,DescriptionURL,ImageURL,ImageSHA1,PixelHeight,PixelWidth,PaintingID,Artist,RealHeightInches,RealWidthInches,DensityHeight,DensityWidth,DensityRatio
0,149071,http://commons.wikimedia.org/wiki/File:Paul_C%C3%A9zanne_101.jpg,http://upload.wikimedia.org/wikipedia/commons/2/2d/Paul_C%C3%A9zanne_101.jpg,cdcdeed0d2667884b7362e6e1a0deb5d26d010f6,5700,3952,29.100.65,Paul C\xe9zanne,21.61417,15.23622,263.71585,259.38191,0.01671
1,151874,http://commons.wikimedia.org/wiki/File:Vincent_Willem_van_Gogh_044.jpg,http://upload.wikimedia.org/wikipedia/commons/7/75/Vincent_Willem_van_Gogh_044.jpg,fd61fa5a06507e0f2e919eac5828f5c4a2951226,5939,7560,Van Gogh catalogues|F=579|JH=1692,Vincent van Gogh,28.7,36.2,206.9338,208.83978,0.00921


In [6]:
import concurrent.futures
 
def bs_preprocess(html):
    """Strip layout whitespace from an HTML string before it is parsed.

    Steps, in order:
      1. remove whitespace at the start and end of every line,
      2. turn the remaining newlines into single spaces,
      3. remove whitespace immediately before a ``<``,
      4. remove whitespace immediately after a ``>``.

    Args:
        html: the HTML document as a ``str``.

    Returns:
        The same markup with the whitespace described above removed.
    """
    # All patterns are raw strings: '\s' inside a plain literal is an invalid
    # escape sequence, which newer Python versions warn about.
    html = re.sub(r'(^[\s]+)|([\s]+$)', '', html, flags=re.MULTILINE)
    # Newlines become spaces so words on adjacent lines are not glued together.
    html = html.replace('\n', ' ')
    html = re.sub(r'[\s]+<', '<', html)  # whitespace before a tag
    html = re.sub(r'>[\s]+', '>', html)  # whitespace after a tag
    return html

def postprocessing(name):
    """Clean a scraped painting title by applying three regex substitutions.

    The substitutions run in this order:
      1. parenthesised text, double quotes and anything from "label" or
         "title" onwards (case-insensitive) are replaced by a single space;
      2. a space is inserted between a word character and an uppercase
         letter that directly follows it;
      3. language names / guillemets, optionally followed by one whitespace
         character and a colon, are removed (case-insensitive).

    Whitespace left behind by the substitutions is not trimmed.
    """
    substitutions = (
        (r'(?i)\(.*\)|"|label.*|title.*', ' '),
        (r"(\w)([A-Z])", r"\1 \2"),
        (r'(?i)(English|Nederlands|Deutsch|Tahitian|French|Français|Italian|Dutch|«|»)\s?\:?', ''),
    )

    cleaned = name
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def retrieve_name(url, timeout=30):
    """Fetch a description page and return the painting name found on it.

    The page is downloaded, whitespace-normalised with ``bs_preprocess`` and
    parsed. The name is taken from the cell that follows the
    ``<td id="fileinfotpl_art_title">Title</td>`` cell when that exists;
    otherwise from the ``firstHeading`` element, with everything up to
    ``File:`` and the file extension stripped.

    Args:
        url: address of the description page.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The extracted name as a ``str`` (not yet cleaned by ``postprocessing``).

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    # Local import: only the last-resort fallback below needs it.
    from urllib.parse import unquote

    with requests.get(url, timeout=timeout) as r:
        doc = BeautifulSoup(bs_preprocess(r.text), 'html.parser')

    # `string=` is the current spelling of the filter formerly named `text=`.
    title = doc.find('td', id='fileinfotpl_art_title', string='Title')
    # find_next_sibling() returns the next *tag*, so a stray text node between
    # the two cells cannot be mistaken for the value cell.
    value_cell = title.find_next_sibling() if title is not None else None
    if value_cell is not None:
        return ' '.join(e.text for e in value_cell)

    heading = doc.find(id='firstHeading')
    if heading is not None:
        raw_name = heading.text
    else:
        # No heading on the page (e.g. an error page): derive the name from
        # the URL instead of failing with AttributeError on None.
        raw_name = unquote(url).replace('_', ' ')
    return os.path.splitext(raw_name.split('File:')[-1])[0]

In [7]:
# with concurrent.futures.ThreadPoolExecutor(max_workers=24) as executor:
#     info['PaintingName'] = list(executor.map(retrieve_name, info.DescriptionURL))
#     info['PaintingName'] = info.PaintingName.map(postprocessing)

In [8]:
# Preview the table. No PaintingName column is shown because the
# name-retrieval cell above is commented out.
info.head()

Unnamed: 0,PageID,DescriptionURL,ImageURL,ImageSHA1,PixelHeight,PixelWidth,PaintingID,Artist,RealHeightInches,RealWidthInches,DensityHeight,DensityWidth,DensityRatio
0,149071,http://commons.wikimedia.org/wiki/File:Paul_C%C3%A9zanne_101.jpg,http://upload.wikimedia.org/wikipedia/commons/2/2d/Paul_C%C3%A9zanne_101.jpg,cdcdeed0d2667884b7362e6e1a0deb5d26d010f6,5700,3952,29.100.65,Paul C\xe9zanne,21.61417,15.23622,263.71585,259.38191,0.01671
1,151874,http://commons.wikimedia.org/wiki/File:Vincent_Willem_van_Gogh_044.jpg,http://upload.wikimedia.org/wikipedia/commons/7/75/Vincent_Willem_van_Gogh_044.jpg,fd61fa5a06507e0f2e919eac5828f5c4a2951226,5939,7560,Van Gogh catalogues|F=579|JH=1692,Vincent van Gogh,28.7,36.2,206.9338,208.83978,0.00921
2,809945,http://commons.wikimedia.org/wiki/File:Whitehousenight.jpg,http://upload.wikimedia.org/wikipedia/commons/1/16/Whitehousenight.jpg,3b52b91662bdef61268b9ba459e04214913142ce,10528,12682,Van Gogh catalogues|F=766|JH=2031,Vincent van Gogh,28.7,36.2,366.82927,350.33149,0.04709
3,1595836,http://commons.wikimedia.org/wiki/File:Jean_Jacques_Henner_-_Madame_Uhring.jpg,http://upload.wikimedia.org/wikipedia/commons/e/e2/Jean_Jacques_Henner_-_Madame_Uhring.jpg,e4355ce986b8c04a8409ee52c68eefdb6bb90bb7,3212,2177,1963.10.33,Jean-Jacques Henner,10.7,7.4,300.18692,294.18919,0.02039
4,1869973,http://commons.wikimedia.org/wiki/File:Blake_jacobsladder.jpg,http://upload.wikimedia.org/wikipedia/commons/e/ea/Blake_jacobsladder.jpg,1c85a5c6399839d4e4e24f060c97807525d6269e,4536,3564,1949.11.12.2,William Blake,14.56693,11.49606,311.39027,310.01918,0.00442


In [14]:
# (shutil is already imported in the imports cell at the top.)
# Dataset root: the directory that contains the metadata CSV.
ROOT = os.path.dirname(INFO)

def resize_and_save(source, destination, max_size=600):
    """Rescale an image so its longest side equals ``max_size`` and save it.

    The aspect ratio is kept (up to integer truncation). Images smaller than
    ``max_size`` are scaled up, larger ones are scaled down.

    Args:
        source: path of the image to read.
        destination: path to write the resized image to; the output format is
            chosen by Pillow from this file name.
        max_size: target length, in pixels, of the longest side.
    """
    # The context manager closes the source file handle once the resized copy
    # has been produced.
    with Image.open(source) as im:
        ratio = max_size / max(im.size)
        # Clamp to 1 px so an extreme aspect ratio cannot yield a 0-sized side.
        new_size = [max(1, int(s * ratio)) for s in im.size]
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = im.resize(new_size, Image.LANCZOS)
    resized.save(destination)

# Create a downsized copy of every <subset>/<label> folder under ROOT/small.
split_dirs = [(subset, label)
              for subset in ('train', 'valid', 'test')
              for label in ('vg', 'nvg')]

for subset, label in split_dirs:
    path = os.path.join(ROOT, subset, label)
    dest = os.path.join(ROOT, 'small', subset, label)

    # A destination folder that already exists is treated as done and is
    # left untouched.
    if not os.path.exists(dest):
        os.makedirs(dest)
        for f in os.listdir(path):
            resize_and_save(os.path.join(path, f), os.path.join(dest, f))

## Retrieving Images

In [10]:
def _print_download_progress(url, progress):
    """Progress callback: print the URL when progress is 1, then each multiple of 25."""
    if progress == 1:
        print(f'\n{url}', end=' ')
    elif progress % 25 == 0:
        print(f'{progress}%', end=' ')


# Credentials are read from the environment; os.environ.get yields None for
# a variable that is not set.
gis = GoogleImagesSearch(
    developer_key=os.environ.get('GCS_DEVELOPER_KEY'),
    custom_search_cx=os.environ.get('GCS_CX'),
    progressbar_fn=_print_download_progress)

In [11]:
# Show the first two metadata rows again for reference.
info.head(2)

Unnamed: 0,PageID,DescriptionURL,ImageURL,ImageSHA1,PixelHeight,PixelWidth,PaintingID,Artist,RealHeightInches,RealWidthInches,DensityHeight,DensityWidth,DensityRatio
0,149071,http://commons.wikimedia.org/wiki/File:Paul_C%C3%A9zanne_101.jpg,http://upload.wikimedia.org/wikipedia/commons/2/2d/Paul_C%C3%A9zanne_101.jpg,cdcdeed0d2667884b7362e6e1a0deb5d26d010f6,5700,3952,29.100.65,Paul C\xe9zanne,21.61417,15.23622,263.71585,259.38191,0.01671
1,151874,http://commons.wikimedia.org/wiki/File:Vincent_Willem_van_Gogh_044.jpg,http://upload.wikimedia.org/wikipedia/commons/7/75/Vincent_Willem_van_Gogh_044.jpg,fd61fa5a06507e0f2e919eac5828f5c4a2951226,5939,7560,Van Gogh catalogues|F=579|JH=1692,Vincent van Gogh,28.7,36.2,206.9338,208.83978,0.00921


In [16]:
# Directory under which the per-painting recapture folders are created.
ROOT_RECAP = '/mnt/files/datasets/vgdb_2016/recaptures/'

# Collects one record per downloaded recapture; it stays empty while the
# download code further down is commented out.
recaptures_info = list()

def download_recaptures(query, directory, custom_name):
    """Run an image search through the shared ``gis`` client and list the results.

    Args:
        query: search text, sent as the ``q`` parameter.
        directory: passed to the client as ``path_to_dir``.
        custom_name: passed to the client as ``custom_image_name``.

    Returns:
        A list of ``(url, path)`` tuples, one per item in ``gis.results()``.
    """
    search_params = {
        'q': query,
        'num': MAX_RECAPTURES,
        'safe': 'medium',
        'imgSize': IMG_SIZE,
    }
    gis.search(search_params=search_params,
               custom_image_name=custom_name,
               path_to_dir=directory)

    downloaded = []
    for image in gis.results():
        downloaded.append((image.url, image.path))
    return downloaded

# For every downsized painting, create ROOT_RECAP/<subset>/<label>/<page_id>/
# and place a copy of the painting there as '0.original_vgdb2016.png'.
for subset in ('train', 'valid', 'test'):
    print(f'### Processing subset {subset}')

    for label in ('vg', 'nvg'):
        path_original = os.path.join(ROOT, 'small', subset, label)
        path_recap = os.path.join(ROOT_RECAP, subset, label)

        filenames = os.listdir(path_original)
        filenames.sort()
        for f in filenames:
            # The page id is whatever follows the last underscore of the
            # file name without its extension.
            stem = os.path.splitext(f)[0]
            page_id = stem.rsplit('_', 1)[-1]
            dest = os.path.join(path_recap, page_id)

            # A folder holding more than one entry is considered finished.
            already_done = os.path.exists(dest) and len(os.listdir(dest)) > 1
            if not already_done:
                os.makedirs(dest, exist_ok=True)

                shutil.copy(os.path.join(path_original, f),
                            os.path.join(dest, '0.original_vgdb2016.png'))

                # Disabled download step. Re-enabling it requires the
                # PaintingName column, which only exists after the commented
                # name-retrieval cell has been run.
                # p_info = info[info.PageID == int(page_id)].iloc[0]
                # query = f'{p_info.PaintingName} {p_info.Artist}'
                # i_recaps = download_recaptures(query, dest, page_id)
                # recaptures_info += [[p_info.PageID, *e] for e in i_recaps]

        # Disabled pause after each label. The original comment read
        # `system.sleep(60)`; no `system` module is imported, so the call
        # would have to be `time.sleep(60)`.
        # time.sleep(60)
    print('\n')

### Processing subset train


### Processing subset valid


### Processing subset test


