In [28]:

import json
import logging
import os
import time
from io import BytesIO
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from PIL import Image
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
from requests.exceptions import RequestException

In [19]:
# Create list of styles to use:

top_styles = [
    "Impressionism",
    "Realism",
    "Romanticism",
    "Expressionism",
    "Post-Impressionism",
    "Baroque",
    "Art Nouveau (Modern)",
    "Surrealism",
    "Symbolism",
    "Abstract Expressionism",
    "Neoclassicism",
    "Rococo",
    "Cubism",
    "Northern Renaissance",
    "Academicism",
    "Pop Art",
    "Mannerism (Late Renaissance)",
    "Minimalism",
    "Conceptual Art",
    "Abstract Art",
    "Art Informel",
    "Early Renaissance",
    "Ukiyo-e",
    "Neo-Expressionism",
    "High Renaissance",
    "Color Field Painting",
    "Orientalism",
    "Lyrical Abstraction",
    "Fauvism",
    "Contemporary",
    "Op Art",
    "Neo-Impressionism",
    "Art-Deco",
]

In [2]:
# Load and examine the csv of all artwork pieces from the AIC
all_df = pd.read_csv("all_artworks_aic.csv", low_memory=False)

In [3]:
all_df['artwork_type_title'].value_counts()

artwork_type_title
Print                      45468
Photograph                 25346
Drawing and Watercolor     14697
Textile                     9825
Painting                    3916
Vessel                      3578
Architectural Drawing       3435
Book                        3243
Coin                        2783
Costume and Accessories     2518
Glass                       2364
Sculpture                   2314
Ceramics                    2055
Decorative Arts             1468
Metalwork                   1292
Graphic Design              1134
Design                       771
Arms                         637
Furniture                    587
Religious/Ritual Object      481
Armor                        430
Architectural fragment       286
Archives (groupings)         210
Mixed Media                  187
Model                        176
Coverings and Hangings       171
non-art                      163
Film, Video, New Media       163
Mask                          82
Miniature room          

In [4]:
all_df['style_titles'].value_counts()

style_titles
[]                                                                                           95963
['Japanese (culture or style)']                                                               7766
['21st Century']                                                                              4391
['19th century']                                                                              3997
['20th Century']                                                                              3113
                                                                                             ...  
['maninka', 'Arts of Africa', 'african Art', 'Northern Africa and the Sahel']                    1
['hausa', 'Arts of Africa', 'african Art', 'Northern Africa and the Sahel']                      1
['shona', 'Arts of Africa', 'african Art', 'Eastern and Southern Africa']                        1
['gwembe tonga', 'Arts of Africa', 'african Art', 'tonga', 'Eastern and Southern Africa']       

In [5]:
all_df['style_title'].value_counts()

style_title
Japanese (culture or style)    7864
21st Century                   4531
19th century                   4042
20th Century                   3154
Chinese (culture or style)     1859
                               ... 
native north american             1
Abstraction                       1
eskimo                            1
Neogothique                       1
heriz                             1
Name: count, Length: 610, dtype: int64

In [15]:
full_list = all_df['style_title'].dropna().to_list()
full_list

['Surrealism',
 'South Asian',
 'South Asian',
 'South Asian',
 'Himalayan',
 'Impressionism',
 'Japanese (culture or style)',
 '21st Century',
 'Japanese (culture or style)',
 '20th Century',
 'nineteenth century',
 'nineteenth century',
 'imperial (roman)',
 '18th Century',
 '18th Century',
 '21st Century',
 '21st Century',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 'New Bauhaus (Institute of Design)',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 '19th century',
 'Pictorialism',
 '19th century',
 '19th century',
 '19th century',
 'Impressionism',
 'Modernism',
 '21st Century',
 '19th century',
 '21st Century',
 '17th Century',
 'asante',
 'ayma

In [13]:
# Combine single style_title and multiple style_titles
all_styles = set() 

# Add styles from style_title column (single style per artwork)
all_styles.update(all_df['style_title'].dropna().unique())

# Add styles from style_titles column (multiple styles per artwork)
for style_list in all_df['style_titles'].dropna():
    if isinstance(style_list, list):  # Ensure it's a list
        all_styles.update(style_list)  # Add all styles from the list

# Convert back to list to preserve order if needed
unique_styles = list(all_styles)

print(f"Total number of unique styles: {len(unique_styles)}")


Total number of unique styles: 610


In [20]:
def check_top_styles(style_titles, top_styles):
    # Convert all style_titles to lowercase for case-insensitive comparison
    style_titles_lower = [title.lower() for title in style_titles]
    
    # Check each top style
    not_found = []
    found = {}
    
    for style in top_styles:
        style_lower = style.lower()
        matches = []
        
        for i, title_lower in enumerate(style_titles_lower):
            if style_lower == title_lower:
                matches.append(style_titles[i])
            elif (style_lower == "art-deco" and "art deco" in title_lower) or \
                 (style_lower == "mannerism (late renaissance)" and ("mannerism" in title_lower or "late renaissance" in title_lower)) or \
                 (style_lower == "romanticism" and "romantic" in title_lower) or \
                 (style_lower == "art nouveau (modern)" and title_lower in ["modern art", "modern", "modernist", "modernism"]) or \
                 (style_lower == "abstract art" and "abstract" in title_lower) or \
                 (style_lower == "neo-impressionism" and ("neo impressionism" in title_lower or "neo-impressionism" in title_lower)):
                matches.append(style_titles[i])

        
        if matches:
            found[style] = matches
        else:
            not_found.append(style)
    
    # Print results
    if not_found:
        print("The following styles were not found in the df:")
        for style in not_found:
            print(f"- {style}")
    else:
        print("All top styles were found in the df.")
    
    print("\nMatches found:")
    for style, matches in found.items():
        print(f"{style}:")
        for match in matches:
            print(f"  - {match}")
    
    # Print summary
    print(f"\nFound {len(found)} out of {len(top_styles)} top styles.")
    
    return found

# Usage
found_styles = check_top_styles(unique_styles, top_styles)

The following styles were not found in the df:
- Northern Renaissance
- Academicism
- Art Informel
- Ukiyo-e
- Neo-Expressionism
- High Renaissance
- Color Field Painting
- Orientalism
- Lyrical Abstraction
- Neo-Impressionism

Matches found:
Impressionism:
  - Impressionism
Realism:
  - Realism
Romanticism:
  - romantic
  - neo-romantic
Expressionism:
  - Expressionism
Post-Impressionism:
  - Post-Impressionism
Baroque:
  - Baroque
Art Nouveau (Modern):
  - Modernism
  - Modern Art
Surrealism:
  - Surrealism
Symbolism:
  - Symbolism
Abstract Expressionism:
  - Abstract Expressionism
Neoclassicism:
  - Neoclassicism
Rococo:
  - Rococo
Cubism:
  - Cubism
Pop Art:
  - Pop Art
Mannerism (Late Renaissance):
  - Mannerism
Minimalism:
  - Minimalism
Conceptual Art:
  - Conceptual Art
Abstract Art:
  - Abstract Expressionism
  - Abstract
  - Abstraction
Early Renaissance:
  - early renaissance
Fauvism:
  - Fauvism
Contemporary:
  - contemporary
Op Art:
  - Op Art
Art-Deco:
  - Art Deco

Found

Only only finding 23 of 33 styles, I decided to examine the results if I did search the API instead of
using the data dump.

In [17]:
def get_all_style_titles():
    base_url = "https://api.artic.edu/api/v1/category-terms"
    all_titles = set() 
    page = 1
    limit = 100  # Maximum allowed by the API

    while True:
        params = {
            "fields": "id,title",
            "limit": limit,
            "page": page,
            "query[term][category]": "Style",
            "sort": "title"
        }
        
        response = requests.get(base_url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            new_titles = [item['title'] for item in data['data']]
            all_titles.update(new_titles)
            
            # Check if we've reached the last page
            if len(new_titles) < limit:
                break
            
            page += 1
        else:
            print(f"Error on page {page}: {response.status_code}")
            break

    return sorted(list(all_titles))

# Fetch all style titles
style_titles = get_all_style_titles()

if style_titles:
    print(f"\nTotal styles retrieved: {len(style_titles)}")
else:
    print("Failed to retrieve style titles.")


Total styles retrieved: 9585


Match the style from the top_styles list to the style terms used by the API

In [21]:

found_styles_api = check_top_styles(style_titles, top_styles)

The following styles were not found in the df:
- Academicism
- Color Field Painting
- Orientalism
- Neo-Impressionism

Matches found:
Impressionism:
  - Impressionism
Realism:
  - Realism
Romanticism:
  - neo-romantic
  - romantic
Expressionism:
  - Expressionism
Post-Impressionism:
  - Post-Impressionism
Baroque:
  - Baroque
Art Nouveau (Modern):
  - Modern Art
  - Modernism
  - modern
  - modern art
  - modernism
  - modernist
Surrealism:
  - Surrealism
  - surrealism
Symbolism:
  - Symbolism
  - symbolism
Abstract Expressionism:
  - Abstract Expressionism
Neoclassicism:
  - Neoclassicism
Rococo:
  - Rococo
Cubism:
  - Cubism
Northern Renaissance:
  - northern renaissance
Pop Art:
  - Pop Art
  - Pop art
Mannerism (Late Renaissance):
  - Mannerism
  - late renaissance
  - mannerism
Minimalism:
  - Minimalism
  - minimalism
Conceptual Art:
  - Conceptual Art
Abstract Art:
  - Abstract
  - Abstract Expressionism
  - Abstraction
  - abstract
  - abstract figures
  - abstract forms
  - a

Now I'm matching 29 our of 33. I'll create a list to search the API since we're getting more matches that way. Then 
I can examine and filter the results.

In [22]:
# Convert found styles to style_search_terms list
style_search_terms = []
for matches in found_styles_api.values():
    style_search_terms.extend(matches)

# Remove duplicates while preserving order
style_search_terms = list(dict.fromkeys(style_search_terms))

print("Style search terms:")
for term in style_search_terms:
    print(f"- {term}")
print(f"\nTotal unique search terms: {len(style_search_terms)}")

Style search terms:
- Impressionism
- Realism
- neo-romantic
- romantic
- Expressionism
- Post-Impressionism
- Baroque
- Modern Art
- Modernism
- modern
- modern art
- modernism
- modernist
- Surrealism
- surrealism
- Symbolism
- symbolism
- Abstract Expressionism
- Neoclassicism
- Rococo
- Cubism
- northern renaissance
- Pop Art
- Pop art
- Mannerism
- late renaissance
- mannerism
- Minimalism
- minimalism
- Conceptual Art
- Abstract
- Abstraction
- abstract
- abstract figures
- abstract forms
- abstract imagist
- abstract impressionist
- abstract motifs
- abstract objects
- abstract patterns
- abstract shapes
- abstraction
- eccentric abstraction
- lyrical abstraction
- Art Informel
- early renaissance
- ukiyo-e
- Neo-Expressionism
- high renaissance
- Fauvism
- contemporary
- Op Art
- Art Deco

Total unique search terms: 53


Test our search terms by getting the first entry from each style

In [23]:
BASE_URL = "https://api.artic.edu/api/v1"

def get_artwork_by_style(style_term, limit=1):
    endpoint = f"{BASE_URL}/artworks/search"
    params = {
        "q": style_term,
        "fields": "id,title,style_title,image_id,artwork_type_title",
        "limit": limit
    }
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        data = response.json()
        return data.get('data', [])
    else:
        print(f"Error fetching data for {style_term}: {response.status_code}")
        return []

def create_test_dataframe(style_search_terms):
    all_artworks = []
    for term in style_search_terms:
        artworks = get_artwork_by_style(term)
        if artworks:
            artwork = artworks[0]
            artwork['search_term'] = term
            all_artworks.append(artwork)
    
    df = pd.DataFrame(all_artworks)
    if '_score' in df.columns:
        df = df.drop('_score', axis=1)
    return df

# Assuming style_search_terms is already defined
api_test_df = create_test_dataframe(style_search_terms)

print(f"Number of entries in test_df: {len(test_df)}")

Number of entries in test_df: 53


In [24]:
print(api_test_df.info())
api_test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  53 non-null     object
 1   style_title         35 non-null     object
 2   id                  53 non-null     int64 
 3   image_id            53 non-null     object
 4   title               53 non-null     object
 5   search_term         53 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.6+ KB


Notice mixed media types were pulled. Searching with the API for just paintings along with the style_terms was causing errors, 
so I decided to pull all and will filter the dataframe afterwards. Also filling the missing 'style_title' values with the 'search_term'.

Now I search for every piece of artwork matching our style terms, adhering to the API's rate limiting.

In [29]:


BASE_URL = "https://api.artic.edu/api/v1"
RATE_LIMIT = 60  # requests per minute
MAX_RECORDS = 10000  # maximum records we can fetch for a single search query
CHECKPOINT_FILE = 'search_checkpoint.json'

def load_checkpoint():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

def save_checkpoint(checkpoint):
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f)

def get_artworks_by_style(style_term, limit=100, start_page=1):
    all_artworks = []
    page = start_page
    total_pages = None

    while (total_pages is None or page <= total_pages) and len(all_artworks) < MAX_RECORDS:
        try:
            endpoint = f"{BASE_URL}/artworks/search"
            params = {
                "q": style_term,
                "fields": "id,title,style_title,image_id,artwork_type_title",
                "limit": limit,
                "page": page
            }
            response = requests.get(endpoint, params=params)
            response.raise_for_status()
            
            data = response.json()
            artworks = data.get('data', [])
            all_artworks.extend(artworks)
            
            if total_pages is None:
                total_pages = data.get('pagination', {}).get('total_pages', 1)
                print(f"Total pages for {style_term}: {total_pages}")
            
            print(f"Fetched page {page} of {total_pages} for {style_term}")
            page += 1
            
            # Throttle to respect rate limit
            time.sleep(60 / RATE_LIMIT)
        
        except RequestException as e:
            print(f"Error fetching data for {style_term} on page {page}: {e}")
            break  
        
        if len(all_artworks) >= MAX_RECORDS:
            print(f"Reached {MAX_RECORDS} record limit for {style_term}")
            break
    
    return all_artworks, page

def create_full_dataframe(style_search_terms):
    all_artworks = []
    checkpoint = load_checkpoint()

    for term in style_search_terms:
        start_page = checkpoint.get(term, {}).get('next_page', 1)
        total_retrieved = checkpoint.get(term, {}).get('total_retrieved', 0)
        
        print(f"Fetching artworks for {term} starting from page {start_page}")
        artworks, next_page = get_artworks_by_style(term, start_page=start_page)
        
        for artwork in artworks:
            artwork['search_term'] = term
        all_artworks.extend(artworks)
        
        total_retrieved += len(artworks)
        checkpoint[term] = {'next_page': next_page, 'total_retrieved': total_retrieved}
        save_checkpoint(checkpoint)
        
        print(f"Total artworks retrieved for {term}: {total_retrieved}")
        
        if total_retrieved >= MAX_RECORDS:
            print(f"Reached {MAX_RECORDS} record limit for {term}. You can resume later.")
    
    df = pd.DataFrame(all_artworks)
    if '_score' in df.columns:
        df = df.drop('_score', axis=1)
    return df

# Usage
full_df = create_full_dataframe(style_search_terms)

print(f"Total number of artworks retrieved: {len(full_df)}")


Fetching artworks for Impressionism starting from page 5
Total pages for Impressionism: 4
Fetched page 5 of 4 for Impressionism
Total artworks retrieved for Impressionism: 316
Fetching artworks for Realism starting from page 1
Total pages for Realism: 3
Fetched page 1 of 3 for Realism
Fetched page 2 of 3 for Realism
Fetched page 3 of 3 for Realism
Total artworks retrieved for Realism: 275
Fetching artworks for neo-romantic starting from page 1
Total pages for neo-romantic: 24
Fetched page 1 of 24 for neo-romantic
Fetched page 2 of 24 for neo-romantic
Fetched page 3 of 24 for neo-romantic
Fetched page 4 of 24 for neo-romantic
Fetched page 5 of 24 for neo-romantic
Fetched page 6 of 24 for neo-romantic
Fetched page 7 of 24 for neo-romantic
Fetched page 8 of 24 for neo-romantic
Fetched page 9 of 24 for neo-romantic
Fetched page 10 of 24 for neo-romantic
Error fetching data for neo-romantic on page 11: 403 Client Error: Forbidden for url: https://api.artic.edu/api/v1/artworks/search?q=neo-r

In [30]:
full_df.head(10)

Unnamed: 0,artwork_type_title,style_title,id,image_id,title,search_term
0,Painting,suprematist,207293,12baff03-af57-0410-df53-740c56575732,Painterly Realism of a Football Player—Color M...,Realism
1,Painting,Realism,27138,e1f6ef79-7216-4f19-ac78-a307a2fb4f32,A Friendly Warning,Realism
2,Painting,Realism,121629,e6dd6199-c245-d1b4-a997-ce89eafb48ca,Life Study (Study of an Egyptian Girl),Realism
3,Sculpture,Realism,191185,f2f929b9-995a-9ed3-ef39-2fe81b35a64e,The Puritan,Realism
4,Painting,Realism,111377,3cea045a-92d6-36cf-1508-2c99ea740218,For Sunday's Dinner,Realism
5,Painting,Realism,181777,6754947f-4ccf-b3ff-41ee-45b738534769,The Irish Question,Realism
6,Painting,Impressionism,4749,074c1b1b-fe92-01ce-9e62-cc99369142dd,Mrs. George Swinton (Elizabeth Ebsworth),Realism
7,Print,Realism,192634,c4824acc-1240-8f7b-abf9-f1b32435990c,Glory to the Soviet Youth!,Realism
8,Painting,Realism,81564,8a647e1c-d778-db71-6ba4-6456b800d6b9,"Husking Bee, Island of Nantucket",Realism
9,Print,Realism,192586,3110a7a4-a6df-dca4-1f42-e3290d07e4af,Königsberg Is Taken!,Realism


In [31]:
full_df['artwork_type_title'].value_counts()

artwork_type_title
Painting                   8844
Print                      4675
Vessel                     4187
Sculpture                  3701
Photograph                 2722
Drawing and Watercolor     1719
Textile                    1672
Decorative Arts             820
Costume and Accessories     817
Ceramics                    793
Coin                        710
Religious/Ritual Object     461
Mixed Media                 453
Graphic Design              426
Furniture                   352
Architectural Drawing       316
Metalwork                   313
Glass                       201
Book                        192
Mask                        191
Architectural fragment      156
Design                      147
Installation                120
Film, Video, New Media      119
Armor                        65
Coverings and Hangings       64
Basketry                     63
Miniature room               61
Funerary Object              58
Arms                         58
Model                

In [32]:
# Now filter for only paintings or prints
filtered_full_df = full_df[(full_df['artwork_type_title'] == 'Painting') | (full_df['artwork_type_title'] == 'Print')]
filtered_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13519 entries, 0 to 34622
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  13519 non-null  object
 1   style_title         6397 non-null   object
 2   id                  13519 non-null  int64 
 3   image_id            13341 non-null  object
 4   title               13519 non-null  object
 5   search_term         13519 non-null  object
dtypes: int64(1), object(5)
memory usage: 739.3+ KB


In [33]:
filtered_full_df['artwork_type_title'].value_counts()

artwork_type_title
Painting    8844
Print       4675
Name: count, dtype: int64

In [34]:
filtered_full_df.isna().sum()

artwork_type_title       0
style_title           7122
id                       0
image_id               178
title                    0
search_term              0
dtype: int64

In [35]:
filtered_full_df['style_title'] = filtered_full_df['style_title'].fillna(filtered_full_df['search_term'])
filtered_full_df = filtered_full_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_full_df['style_title'] = filtered_full_df['style_title'].fillna(filtered_full_df['search_term'])


In [36]:
filtered_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13341 entries, 0 to 34622
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  13341 non-null  object
 1   style_title         13341 non-null  object
 2   id                  13341 non-null  int64 
 3   image_id            13341 non-null  object
 4   title               13341 non-null  object
 5   search_term         13341 non-null  object
dtypes: int64(1), object(5)
memory usage: 729.6+ KB


Download the images we want.

In [37]:


BASE_URL = "https://www.artic.edu/iiif/2/"
RATE_LIMIT = 60  # requests per minute
IMAGE_QUALITY = "full"
IMAGE_FORMAT = "jpg"
OUTPUT_DIR = 'downloaded_images_aic'
MAX_RETRIES = 2  # Retry up to 2 times if the content is empty or there are errors
RETRY_DELAY = 2  # Delay (in seconds) between retries
REQUEST_DELAY = 1  # Delay (in seconds) between each request

# Set up logging
logging.basicConfig(filename='image_download.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')

def download_image(image_id, output_path):
    if not image_id:
        return False, "No image ID provided"
    
    url = urljoin(BASE_URL, f"{image_id}/full/{IMAGE_QUALITY}/0/default.{IMAGE_FORMAT}")
    for attempt in range(MAX_RETRIES + 1):  # Attempt download + retries
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            
            # Log content length and headers
            content_length = len(response.content)
            logging.info(f"Attempt {attempt + 1} - Content length: {content_length}, Headers: {response.headers}")
            
            # Check for non-empty content
            if content_length > 0:
                with open(output_path, 'wb') as f:
                    f.write(response.content)
                
                # Check that the file was written and is not empty
                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                    return True, "Success"
                else:
                    logging.warning(f"Downloaded file {output_path} is empty.")
            else:
                logging.warning(f"Attempt {attempt + 1} - Empty content for {image_id}")
        
        except requests.exceptions.HTTPError as e:
            return False, f"HTTP error: {e}"
        except requests.exceptions.ConnectionError as e:
            return False, f"Connection error: {e}"
        except requests.exceptions.Timeout as e:
            return False, f"Timeout error: {e}"
        except requests.exceptions.RequestException as e:
            return False, f"Error downloading: {e}"

        # Wait before retrying if not the last attempt
        if attempt < MAX_RETRIES:
            logging.info(f"Retrying in {RETRY_DELAY} seconds (attempt {attempt + 2}) for {image_id}")
            time.sleep(RETRY_DELAY)

    return False, "Failed after maximum retries"

def download_all_images(df):
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    file_names = []
    total_images = len(df)
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
    
    for index, row in df.iterrows():
        image_id = row['image_id']
        if pd.isna(image_id) or not image_id:
            file_names.append(None)
            skipped_downloads += 1
            logging.warning(f"Skipped image at index {index}: No valid image_id")
            continue

        file_name = f"{image_id}.{IMAGE_FORMAT}"
        output_path = os.path.join(OUTPUT_DIR, file_name)

        success, message = download_image(image_id, output_path)
        if success:
            file_names.append(file_name)
            successful_downloads += 1
            logging.info(f"Downloaded: {file_name}")
        else:
            file_names.append(None)
            failed_downloads += 1
            logging.warning(f"Failed to download {image_id}: {message}")

        # Print progress every 100 images
        if (index + 1) % 100 == 0:
            print(f"Progress: {index + 1}/{total_images} images processed")
            print(f"Successful downloads: {successful_downloads}")
            print(f"Failed downloads: {failed_downloads}")
            print(f"Skipped downloads: {skipped_downloads}")
            logging.info(f"Progress: {index + 1}/{total_images} images processed. "
                         f"Successful: {successful_downloads}, Failed: {failed_downloads}, Skipped: {skipped_downloads}")

        # Throttle to respect rate limit and add a 1-second delay between requests
        time.sleep(1)

    return file_names, successful_downloads, failed_downloads, skipped_downloads

# Usage
file_names, successful, failed, skipped = download_all_images(filtered_full_df)
filtered_full_df['file_name'] = file_names

print(f"Image download complete. Total successful downloads: {successful}, "
      f"Total failed: {failed}, Total skipped: {skipped}")
logging.info(f"Process completed. Successful downloads: {successful}, "
             f"Failed downloads: {failed}, Skipped downloads: {skipped}")


Progress: 100/13341 images processed
Successful downloads: 81
Failed downloads: 0
Skipped downloads: 0
Progress: 200/13341 images processed
Successful downloads: 167
Failed downloads: 3
Skipped downloads: 0
Progress: 800/13341 images processed
Successful downloads: 260
Failed downloads: 5
Skipped downloads: 0
Progress: 1600/13341 images processed
Successful downloads: 342
Failed downloads: 7
Skipped downloads: 0
Progress: 2300/13341 images processed
Successful downloads: 458
Failed downloads: 7
Skipped downloads: 0
Progress: 2400/13341 images processed
Successful downloads: 535
Failed downloads: 7
Skipped downloads: 0
Progress: 2600/13341 images processed
Successful downloads: 692
Failed downloads: 7
Skipped downloads: 0
Progress: 2800/13341 images processed
Successful downloads: 802
Failed downloads: 8
Skipped downloads: 0
Progress: 3000/13341 images processed
Successful downloads: 908
Failed downloads: 10
Skipped downloads: 0
Progress: 3100/13341 images processed
Successful downloads

In [40]:
filtered_full_df.head()

Unnamed: 0,artwork_type_title,style_title,id,image_id,title,search_term,file_name
0,Painting,suprematist,207293,12baff03-af57-0410-df53-740c56575732,Painterly Realism of a Football Player—Color M...,Realism,12baff03-af57-0410-df53-740c56575732.jpg
1,Painting,Realism,27138,e1f6ef79-7216-4f19-ac78-a307a2fb4f32,A Friendly Warning,Realism,e1f6ef79-7216-4f19-ac78-a307a2fb4f32.jpg
2,Painting,Realism,121629,e6dd6199-c245-d1b4-a997-ce89eafb48ca,Life Study (Study of an Egyptian Girl),Realism,e6dd6199-c245-d1b4-a997-ce89eafb48ca.jpg
4,Painting,Realism,111377,3cea045a-92d6-36cf-1508-2c99ea740218,For Sunday's Dinner,Realism,3cea045a-92d6-36cf-1508-2c99ea740218.jpg
5,Painting,Realism,181777,6754947f-4ccf-b3ff-41ee-45b738534769,The Irish Question,Realism,6754947f-4ccf-b3ff-41ee-45b738534769.jpg


Check dataframe against actual files in the directory

In [42]:
# Get path to check files
image_dir = 'downloaded_images_aic'

# First remove rows where file_name is None
filtered_full_df = filtered_full_df.dropna(subset=['file_name'])

# Now create mask of existing files
exists_mask = filtered_full_df['file_name'].apply(lambda x: os.path.exists(os.path.join(image_dir, x)))

# Filter DataFrame to keep only rows where files exist
filtered_full_df = filtered_full_df[exists_mask]

In [44]:
filtered_full_df['artwork_type_title'].value_counts()

artwork_type_title
Painting    8626
Print       4570
Name: count, dtype: int64

In [46]:
# Get count of duplicates
duplicate_count = filtered_full_df['file_name'].duplicated().sum()

if duplicate_count > 0:
    # Show the duplicate entries
    duplicates = filtered_full_df[filtered_full_df['file_name'].duplicated(keep=False)]
    print(f"Found {duplicate_count} duplicate entries")
else:
    print("No duplicate filenames found")

Found 9074 duplicate entries


Whoops, probably should have checked for duplicates before downloading.

In [47]:
filtered_full_df = filtered_full_df.drop_duplicates(subset=['file_name'], keep='first')

In [49]:
filtered_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4122 entries, 0 to 33435
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  4122 non-null   object
 1   style_title         4122 non-null   object
 2   id                  4122 non-null   int64 
 3   image_id            4122 non-null   object
 4   title               4122 non-null   object
 5   search_term         4122 non-null   object
 6   file_name           4122 non-null   object
dtypes: int64(1), object(6)
memory usage: 257.6+ KB


In [51]:
filtered_full_df['style_title'].value_counts()

style_title
Japanese (culture or style)    682
Pop Art                        472
Modern Art                     398
Mannerism                      314
Cubism                         275
                              ... 
13th century                     1
american colonial                1
de Stijl                         1
lakota                           1
suprematist                      1
Name: count, Length: 86, dtype: int64

In [52]:
filtered_full_df['search_term'].value_counts()

search_term
Modern Art                617
ukiyo-e                   536
Pop Art                   476
Mannerism                 382
Post-Impressionism        304
Cubism                    260
Realism                   195
early renaissance         123
neo-romantic              122
late renaissance          118
Baroque                   106
abstract figures           98
northern renaissance       88
Abstract Expressionism     83
Art Informel               59
Surrealism                 56
Conceptual Art             54
Modernism                  52
Rococo                     45
abstract patterns          44
high renaissance           43
mannerism                  42
Symbolism                  35
abstract objects           33
Fauvism                    22
abstract motifs            22
contemporary               19
abstract shapes            18
abstract imagist           15
abstract forms             11
lyrical abstraction        11
Neoclassicism               7
Abstract                    

In [53]:
filtered_full_df.to_csv("aic_images.csv", index=False)