In [3]:
import flickrapi
import requests
import os
import config
import logging

In [4]:
# Shared Flickr API client, built from the key/secret in the local `config` module.
# It is created with format='parsed-json'; the functions in this notebook index its
# responses with string keys (e.g. response['group']['id']).
flickr = flickrapi.FlickrAPI(config.api_key, config.api_secret, format='parsed-json')

In [5]:
# Make sure the log directory exists before attaching a file handler to it;
# logging.basicConfig(filename=...) opens the file immediately and raises
# FileNotFoundError on a fresh checkout where 'logs/' has not been created yet.
os.makedirs('logs', exist_ok=True)

# Send INFO-and-above records to logs/collection_log.log, timestamped per line.
logging.basicConfig(filename='logs/collection_log.log',
                    level=logging.INFO,
                    format='%(asctime)s %(levelname)s:%(message)s')

In [6]:
def get_photo_sizes(photo_id):
    """Fetch the available sizes of a photo from the Flickr API.

    Args:
        photo_id: Flickr photo ID passed to ``flickr.photos.getSizes``.

    Returns:
        The value of ``response['sizes']['size']`` on success, or ``None`` if
        the API call raised ``flickrapi.exceptions.FlickrError``.
    """
    try:
        sizes = flickr.photos.getSizes(photo_id=photo_id)
        return sizes['sizes']['size']
    except flickrapi.exceptions.FlickrError as e:
        # Every API error results in the photo being skipped, so every one is
        # logged; previously only code 500 left a trace in the log file.
        if getattr(e, 'code', None) == 500:
            logging.error(f"Error 500 encountered for photo ID {photo_id}. Skipping this photo.")
        else:
            logging.error(f"Flickr API error for photo ID {photo_id}: {e}. Skipping this photo.")
        return None

In [5]:
# photo_id = '53364746727'
# sizes = get_photo_sizes(photo_id)

# if sizes:
#     print(f"Available sizes for photo ID {photo_id}:")
#     for size in sizes:
#         print(f"- Label: {size['label']}, Width: {size['width']}, Height: {size['height']}, Source: {size['source']}")
# else:
#     print("No sizes available or error occurred.")

In [7]:
def download_image(url, target_directory, filename, timeout=30):
    """Download ``url`` and save it as ``filename`` inside ``target_directory``.

    The target directory is created if it does not exist. HTTP and network
    failures are logged instead of raised so that a batch run can move on to
    the next photo.

    Args:
        url: URL of the image to fetch.
        target_directory: Directory the file is written into.
        filename: Name of the file to create inside ``target_directory``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        ``True`` if the response had status 200 and was written to disk,
        ``False`` if the request failed or returned another status.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written (not caught here, as in the original behaviour).
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_directory, exist_ok=True)
    filepath = os.path.join(target_directory, filename)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {url}: {e}. Skipping this photo.")
        return False

    if response.status_code != 200:
        logging.warning(f"Failed to download: {url}")
        return False

    with open(filepath, 'wb') as file:
        file.write(response.content)
    return True

In [8]:
# Text file holding one photo ID per line; it is the default path used by
# save_id() (append) and load_ids() (read) to remember which photos were handled.
downloaded_ids_file_path = 'logs/downloaded_ids.txt'

In [9]:
def save_id(photo_id, file_path=downloaded_ids_file_path):
    """Append ``photo_id`` as a new line at the end of ``file_path``.

    Args:
        photo_id: Photo ID string to record.
        file_path: File to append to; defaults to ``downloaded_ids_file_path``.
    """
    record = photo_id + '\n'
    with open(file_path, 'a') as ids_file:
        ids_file.write(record)

In [10]:
def load_ids(file_path=downloaded_ids_file_path):
    """Read the recorded photo IDs from ``file_path``.

    Args:
        file_path: File to read, one ID per line; defaults to
            ``downloaded_ids_file_path``.

    Returns:
        A set containing each line of the file, or an empty set when the
        file does not exist.
    """
    try:
        with open(file_path, 'r') as ids_file:
            contents = ids_file.read()
    except FileNotFoundError:
        # Nothing has been recorded yet.
        return set()
    return set(contents.splitlines())

In [11]:
def collect_group_images(group_id, target_directory, size_label, max_images):
    """Download images from a Flickr group pool until ``max_images`` are saved.

    Pages through ``flickr.groups.pools.getPhotos`` and, for each photo not
    already listed by ``load_ids()``, looks up the URL whose size label equals
    ``size_label`` and downloads it as ``<photo_id>.jpg``. A photo is recorded
    with ``save_id()`` and counted only once its file is present on disk, so a
    failed download is retried on a later run instead of being skipped forever.

    Args:
        group_id: Flickr group ID whose pool is read.
        target_directory: Directory the image files are written into.
        size_label: Size label to download (compared against each size's
            ``'label'`` entry, e.g. ``'Large 1600'``).
        max_images: Number of newly downloaded images after which to stop.

    Returns:
        None. Progress is printed every 1000 downloaded files.
    """
    downloaded_ids = load_ids()
    per_page = 500
    page = 1
    image_count = 0

    while image_count < max_images:
        photos = flickr.groups.pools.getPhotos(group_id=group_id, per_page=per_page, page=page)
        photo_list = photos['photos']['photo']

        if not photo_list:
            break

        for photo in photo_list:
            # Re-check the limit per photo so a run does not overshoot
            # max_images by up to a whole page.
            if image_count >= max_images:
                break

            photo_id = photo['id']
            if photo_id in downloaded_ids:
                continue

            sizes = get_photo_sizes(photo_id)
            if not sizes:
                continue

            size_url = next((s['source'] for s in sizes if s['label'] == size_label), None)
            if not size_url:
                continue

            filename = photo_id + '.jpg'
            download_image(size_url, target_directory, filename)

            # download_image logs failures rather than raising, so confirm the
            # file is actually on disk before marking the photo as done.
            if not os.path.isfile(os.path.join(target_directory, filename)):
                continue

            save_id(photo_id)
            # Keep the in-memory set current so a photo that shows up again on
            # a later page is not downloaded twice in the same run.
            downloaded_ids.add(photo_id)
            image_count += 1

            if image_count % 1000 == 0:
                print(f"Copied {image_count} files to {target_directory}")

        # Stop once the last page reported by the API has been processed,
        # rather than relying only on an empty page to end the loop.
        total_pages = photos['photos'].get('pages')
        if total_pages is not None and page >= int(total_pages):
            break

        page += 1

In [12]:
def get_group_id(url):
    """Look up a Flickr group by its URL and return its ID.

    Args:
        url: URL of the group page, passed to ``flickr.urls.lookupGroup``.

    Returns:
        The ``'id'`` entry of the ``'group'`` object in the API response.
    """
    group_info = flickr.urls.lookupGroup(url=url)['group']
    return group_info['id']

In [11]:
# group_id_film = get_group_id('https://www.flickr.com/groups/filmdatabase/')

In [12]:
# collect_group_images(group_id_film, config.target_directory_film, 'Large 1600', 110200)

In [13]:
# Resolve the 'digitalp' group URL to the group ID used by the collection call.
group_id_digital = get_group_id('https://www.flickr.com/groups/digitalp/')

In [14]:
# Collect 'Large 1600' images from the group's pool into
# config.target_directory_digital, with max_images set to 30000.
collect_group_images(group_id_digital, config.target_directory_digital, 'Large 1600', 30000)

Copied 1000 files to /Volumes/Elemental/DSI/film-detective-data/images/digital_photography_1600
Copied 2000 files to /Volumes/Elemental/DSI/film-detective-data/images/digital_photography_1600
