In [4]:

import os
import time
import json
from io import BytesIO
from tqdm import tqdm

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv

from PIL import Image
from ratelimit import limits, sleep_and_retry
from requests.exceptions import RequestException
from urllib.parse import urljoin




Accessing the Art Institute of Chicago's data dump

In [2]:
# Path JSONL file 
file_path = 'allArtworks.jsonl'

# Read the JSONL file
with open(file_path, 'r') as file:
    data = [json.loads(line) for line in file]

# Create a pandas DataFrame
df = pd.DataFrame(data)

print(df.info())

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132769 entries, 0 to 132768
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   id                     132769 non-null  int64 
 1   title                  132768 non-null  object
 2   main_reference_number  132769 non-null  object
 3   department_title       126212 non-null  object
 4   artist_title           114074 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.1+ MB
None


Unnamed: 0,id,title,main_reference_number,department_title,artist_title
0,4,Priest and Boy,1880.1,Prints and Drawings,Lawrence Carmichael Earle
1,9,"Interior of St. Mark's, Venice",1887.232,Arts of the Americas,David Dalhoff Neal
2,11,Self-Portrait,1887.234,Arts of the Americas,Walter Shirlaw
3,16,The Fall of the Giants,1887.249,Prints and Drawings,Salvator Rosa
4,19,"View of Ponte Lugano on the Anio, from Views o...",1887.252,Prints and Drawings,Giovanni Battista Piranesi


In [3]:
df['department_title'].value_counts()

department_title
Prints and Drawings                                  51217
Photography and Media                                23783
Arts of Asia                                         16375
Textiles                                             11547
Architecture and Design                               5913
Applied Arts of Europe                                5507
Arts of the Americas                                  4246
Arts of the Ancient Mediterranean and Byzantium       2195
Contemporary Art                                      1721
Arts of Africa                                        1493
Painting and Sculpture of Europe                      1382
Modern Art                                             463
AIC Archives                                           242
Ryerson and Burnham Libraries Special Collections      124
Modern and Contemporary Art                              4
Name: count, dtype: int64

In [5]:


# Set the path to the directory containing JSON files
json_dir = r"C:\Users\16148\Downloads\artic-api-data\json\artworks"

# Initialize an empty list to store the data from each JSON file
data_list = []

# Iterate through all JSON files in the directory
for filename in tqdm(os.listdir(json_dir), desc="Loading files"):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                # Load the JSON data
                json_data = json.load(file)
                # Append the data to our list
                data_list.append(json_data)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file: {filename}")

# Create a DataFrame from the list of JSON data
all_df = pd.DataFrame(data_list)

# Display the first few rows and basic information about the DataFrame
all_df.info()

Loading files: 100%|██████████| 132769/132769 [36:02<00:00, 61.41it/s] 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132769 entries, 0 to 132768
Data columns (total 99 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   id                             132769 non-null  int64  
 1   api_model                      132769 non-null  object 
 2   api_link                       132769 non-null  object 
 3   is_boosted                     132769 non-null  bool   
 4   title                          132768 non-null  object 
 5   alt_titles                     1800 non-null    object 
 6   thumbnail                      118916 non-null  object 
 7   main_reference_number          132769 non-null  object 
 8   has_not_been_viewed_much       132769 non-null  bool   
 9   boost_rank                     20 non-null      float64
 10  date_start                     127671 non-null  float64
 11  date_end                       127671 non-null  float64
 12  date_display                  

In [10]:
all_df['artwork_type_title'].value_counts()

artwork_type_title
Print                      45468
Photograph                 25346
Drawing and Watercolor     14697
Textile                     9825
Painting                    3916
Vessel                      3578
Architectural Drawing       3435
Book                        3243
Coin                        2783
Costume and Accessories     2518
Glass                       2364
Sculpture                   2314
Ceramics                    2055
Decorative Arts             1468
Metalwork                   1292
Graphic Design              1134
Design                       771
Arms                         637
Furniture                    587
Religious/Ritual Object      481
Armor                        430
Architectural fragment       286
Archives (groupings)         210
Mixed Media                  187
Model                        176
Coverings and Hangings       171
non-art                      163
Film, Video, New Media       163
Mask                          82
Miniature room          

In [28]:
all_df['style_titles'].value_counts()

style_titles
[]                                                                                 95963
[Japanese (culture or style)]                                                       7766
[21st Century]                                                                      4391
[19th century]                                                                      3997
[20th Century]                                                                      3113
                                                                                   ...  
[maninka, Arts of Africa, african Art, Northern Africa and the Sahel]                  1
[hausa, Arts of Africa, african Art, Northern Africa and the Sahel]                    1
[shona, Arts of Africa, african Art, Eastern and Southern Africa]                      1
[gwembe tonga, Arts of Africa, african Art, tonga, Eastern and Southern Africa]        1
[late period (egyptian), saite period, twenty-sixth dynasty]                           1
Name: co

In [36]:
# Combine single style_title and multiple style_titles
all_styles = []

# Add styles from style_title column (single style per artwork)
all_styles.extend(all_df['style_title'].dropna().unique())

# Add styles from style_titles column (multiple styles per artwork)
all_styles.extend([style for sublist in all_df['style_titles'] if sublist for style in sublist if style])

# Remove duplicates while preserving order
unique_styles = list(dict.fromkeys(all_styles))

print(f"Total number of unique styles: {len(unique_styles)}")


Total number of unique styles: 878


In [37]:
from collections import Counter

# Count occurrences of each style
style_counts = Counter(all_styles)

# Sort the styles by count in descending order
sorted_styles = sorted(style_counts.items(), key=lambda x: x[1], reverse=True)

print("Unique styles for Prints and Paintings, sorted by count:")
for style, count in sorted_styles:
    print(f"{style}: {count}")

print(f"\nTotal number of unique styles for Prints and Paintings: {len(sorted_styles)}")

Unique styles for Prints and Paintings, sorted by count:
Japanese (culture or style): 7867
21st Century: 4864
19th century: 4836
20th Century: 3955
Chinese (culture or style): 2291
Modernism: 1737
Arts of the Americas: 1498
americas: 1332
Pre-Columbian: 1181
andes: 1150
andean: 1146
egyptian: 901
roman period (egyptian): 851
greco-roman (egyptian): 845
asian: 782
East Asian: 670
ancient: 665
Pop Art: 614
Arts of Africa: 588
early intermediate period: 560
south american: 494
african Art: 486
Japanism: 467
nineteenth century: 443
contemporary: 428
nazca: 420
european: 402
Cubism: 392
avant-garde: 389
moche: 365
Pictorialism: 356
Arts and Crafts Movement: 347
bauhaus: 331
qing: 329
Coastal West Africa: 311
South Asian: 285
new kingdom: 271
mesoamerican: 271
18th Century: 264
chinese export: 233
Himalayan: 228
Impressionism: 227
edo (japanese period): 225
mochica: 221
17th Century: 217
Folk Art: 215
late intermediate period: 215
Art Deco: 213
third intermediate period: 213
native american:

In [None]:
# Create list of styles to use:

top_styles = [
    "Modernism",
    "Pop Art",
    "contemporary",
    "Cubism",
    "avant-garde",
    "bauhaus",
    "Impressionism",
    "Art Deco",
    "Folk Art",
    "meiji",
    "Realism",
    "Surrealism",
    "Renaissance",
    "Post-Impressionism",
    "Art Nouveau",
    "Rococo",
    "postmodern",
    "Conceptual Art",
    "Baroque",
    





]

In [5]:

def get_all_style_titles():
    base_url = "https://api.artic.edu/api/v1/category-terms"
    all_titles = []
    page = 1
    limit = 100  # Maximum allowed by the API

    while True:
        params = {
            "fields": "id,title",
            "limit": limit,
            "page": page,
            "query[term][category]": "Style",
            "sort": "title"
        }
        
        response = requests.get(base_url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            new_titles = [item['title'] for item in data['data']]
            all_titles.extend(new_titles)
            
            # Check if we've reached the last page
            if len(new_titles) < limit:
                break
            
            page += 1
        else:
            print(f"Error on page {page}: {response.status_code}")
            break

    return all_titles

# Fetch all style titles
style_titles = get_all_style_titles()

if style_titles:
    print(f"\nTotal styles retrieved: {len(style_titles)}")
else:
    print("Failed to retrieve style titles.")


Total styles retrieved: 10101


Match the style from the top_styles list to the style terms used by the API

In [19]:
def check_top_styles(style_titles, top_styles):
    # Convert all style_titles to lowercase for case-insensitive comparison
    style_titles_lower = [title.lower() for title in style_titles]
    
    # Check each top style
    not_found = []
    found = {}
    
    for style in top_styles:
        style_lower = style.lower()
        matches = []
        
        for i, title_lower in enumerate(style_titles_lower):
            if style_lower == title_lower:
                matches.append(style_titles[i])
            elif (style_lower == "art-deco" and "art deco" in title_lower) or \
                 (style_lower == "mannerism (late renaissance)" and ("mannerism" in title_lower or "late renaissance" in title_lower)) or \
                 (style_lower == "romanticism" and "romantic" in title_lower) or \
                 (style_lower == "art nouveau (modern)" and title_lower in ["modern art", "modern", "modernist", "modernism"]):
                matches.append(style_titles[i])
        
        if matches:
            found[style] = matches
        else:
            not_found.append(style)
    
    # Print results
    if not_found:
        print("The following styles were not found in the API results:")
        for style in not_found:
            print(f"- {style}")
    else:
        print("All top styles were found in the API results.")
    
    print("\nMatches found:")
    for style, matches in found.items():
        print(f"{style}:")
        for match in matches:
            print(f"  - {match}")
    
    # Print summary
    print(f"\nFound {len(found)} out of {len(top_styles)} top styles.")
    
    return found

# Usage
#found_styles = check_top_styles(style_titles, top_styles)

Create a list to use for search

In [15]:
# Convert found styles to style_search_terms list
style_search_terms = []
for matches in found_styles.values():
    style_search_terms.extend(matches)

# Remove duplicates while preserving order
style_search_terms = list(dict.fromkeys(style_search_terms))

print("Style search terms:")
for term in style_search_terms:
    print(f"- {term}")
print(f"\nTotal unique search terms: {len(style_search_terms)}")

Style search terms:
- Impressionism
- Realism
- neo-romantic
- romantic
- Expressionism
- Post-Impressionism
- Baroque
- modern art
- modernism
- Modern Art
- modern
- modernist
- Modernism
- surrealism
- Surrealism
- symbolism
- Symbolism
- Abstract Expressionism
- Neoclassicism
- Rococo
- Cubism
- northern renaissance
- Pop art
- Pop Art
- mannerism
- late renaissance
- Mannerism
- minimalism
- Minimalism
- Conceptual Art
- Art Informel
- early renaissance
- ukiyo-e
- Neo-Expressionism
- high renaissance
- lyrical abstraction
- Fauvism
- contemporary
- Op Art
- Art Deco
- postmodern

Total unique search terms: 41


Test our search terms by getting the first entry from each style

In [28]:
BASE_URL = "https://api.artic.edu/api/v1"

def get_artwork_by_style(style_term, limit=1):
    endpoint = f"{BASE_URL}/artworks/search"
    params = {
        "q": style_term,
        "fields": "id,title,style_title,image_id,artwork_type_title",
        "limit": limit
    }
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        data = response.json()
        return data.get('data', [])
    else:
        print(f"Error fetching data for {style_term}: {response.status_code}")
        return []

def create_test_dataframe(style_search_terms):
    all_artworks = []
    for term in style_search_terms:
        artworks = get_artwork_by_style(term)
        if artworks:
            artwork = artworks[0]
            artwork['search_term'] = term
            all_artworks.append(artwork)
    
    df = pd.DataFrame(all_artworks)
    if '_score' in df.columns:
        df = df.drop('_score', axis=1)
    return df

# Assuming style_search_terms is already defined
test_df = create_test_dataframe(style_search_terms)

print(f"Number of entries in test_df: {len(test_df)}")

Number of entries in test_df: 41


In [29]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  41 non-null     object
 1   style_title         32 non-null     object
 2   id                  41 non-null     int64 
 3   image_id            41 non-null     object
 4   title               41 non-null     object
 5   search_term         41 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.0+ KB


In [30]:
test_df

Unnamed: 0,artwork_type_title,style_title,id,image_id,title,search_term
0,Painting,Impressionism,64729,ef96e79b-f481-8114-0804-4bd39c101983,"Early Morning, Tarpon Springs",Impressionism
1,Painting,suprematist,207293,12baff03-af57-0410-df53-740c56575732,Painterly Realism of a Football Player—Color M...,Realism
2,Sculpture,late assyrian,64936,c78e7b95-0652-a2dd-05f6-f9b1d68ff24d,Relief Showing the Head of a Winged Genius,neo-romantic
3,Mixed Media,,99775,19e4effc-bcab-2b1e-29f1-8325877111f3,Homage to the Romantic Ballet,romantic
4,Costume and Accessories,Expressionism,61859,8ec2ad7b-c47d-d189-5646-6c5b960b875c,Scarf,Expressionism
5,Sculpture,Post-Impressionism,719,d0a72b06-f1ee-4464-d58c-5e39bc0534b3,A Burgher of Calais (Jean d'Aire),Post-Impressionism
6,Sculpture,Baroque,73413,d19683c2-2ef4-e586-7add-c1008b6a8fb2,Bust of a Youth (Saint John the Baptist?),Baroque
7,Painting,,109529,567b2141-2556-c195-01fe-b47c6d1a63dc,Reminiscence of a Cathedral,modern art
8,Sculpture,,152961,9036870e-9ec4-19ab-3d73-3cbf79c1af8b,"""Untitled"" (Portrait of Ross in L.A.)",modernism
9,Painting,,109529,567b2141-2556-c195-01fe-b47c6d1a63dc,Reminiscence of a Cathedral,Modern Art


Notice mixed media types were pulled. Searching with the API for just paintings along with the style_terms was causing errors, 
so I decided to pull all and will filter the dataframe afterwards. We'll also fill the missing values with the 'search_term' after some checking.

Now we search for every piece of artwork matching our style terms. 

In [32]:


BASE_URL = "https://api.artic.edu/api/v1"
RATE_LIMIT = 60  # requests per minute
MAX_RECORDS = 10000  # maximum records we can fetch for a single search query
CHECKPOINT_FILE = 'search_checkpoint.json'

def load_checkpoint():
    try:
        with open(CHECKPOINT_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

def save_checkpoint(checkpoint):
    with open(CHECKPOINT_FILE, 'w') as f:
        json.dump(checkpoint, f)

def get_artworks_by_style(style_term, limit=100, start_page=1):
    all_artworks = []
    page = start_page
    total_pages = None

    while (total_pages is None or page <= total_pages) and len(all_artworks) < MAX_RECORDS:
        try:
            endpoint = f"{BASE_URL}/artworks/search"
            params = {
                "q": style_term,
                "fields": "id,title,style_title,image_id,artwork_type_title",
                "limit": limit,
                "page": page
            }
            response = requests.get(endpoint, params=params)
            response.raise_for_status()
            
            data = response.json()
            artworks = data.get('data', [])
            all_artworks.extend(artworks)
            
            if total_pages is None:
                total_pages = data.get('pagination', {}).get('total_pages', 1)
                print(f"Total pages for {style_term}: {total_pages}")
            
            print(f"Fetched page {page} of {total_pages} for {style_term}")
            page += 1
            
            # Throttle to respect rate limit
            time.sleep(60 / RATE_LIMIT)
        
        except RequestException as e:
            print(f"Error fetching data for {style_term} on page {page}: {e}")
            break  
        
        if len(all_artworks) >= MAX_RECORDS:
            print(f"Reached {MAX_RECORDS} record limit for {style_term}")
            break
    
    return all_artworks, page

def create_full_dataframe(style_search_terms):
    all_artworks = []
    checkpoint = load_checkpoint()

    for term in style_search_terms:
        start_page = checkpoint.get(term, {}).get('next_page', 1)
        total_retrieved = checkpoint.get(term, {}).get('total_retrieved', 0)
        
        print(f"Fetching artworks for {term} starting from page {start_page}")
        artworks, next_page = get_artworks_by_style(term, start_page=start_page)
        
        for artwork in artworks:
            artwork['search_term'] = term
        all_artworks.extend(artworks)
        
        total_retrieved += len(artworks)
        checkpoint[term] = {'next_page': next_page, 'total_retrieved': total_retrieved}
        save_checkpoint(checkpoint)
        
        print(f"Total artworks retrieved for {term}: {total_retrieved}")
        
        if total_retrieved >= MAX_RECORDS:
            print(f"Reached {MAX_RECORDS} record limit for {term}. You can resume later.")
    
    df = pd.DataFrame(all_artworks)
    if '_score' in df.columns:
        df = df.drop('_score', axis=1)
    return df

# Usage
full_df = create_full_dataframe(style_search_terms)

print(f"Total number of artworks retrieved: {len(full_df)}")



Fetching artworks for Impressionism starting from page 1
Total pages for Impressionism: 4
Fetched page 1 of 4 for Impressionism
Fetched page 2 of 4 for Impressionism
Fetched page 3 of 4 for Impressionism
Fetched page 4 of 4 for Impressionism
Total artworks retrieved for Impressionism: 316
Fetching artworks for Realism starting from page 1
Total pages for Realism: 3
Fetched page 1 of 3 for Realism
Fetched page 2 of 3 for Realism
Fetched page 3 of 3 for Realism
Total artworks retrieved for Realism: 275
Fetching artworks for neo-romantic starting from page 1
Total pages for neo-romantic: 24
Fetched page 1 of 24 for neo-romantic
Fetched page 2 of 24 for neo-romantic
Fetched page 3 of 24 for neo-romantic
Fetched page 4 of 24 for neo-romantic
Fetched page 5 of 24 for neo-romantic
Fetched page 6 of 24 for neo-romantic
Fetched page 7 of 24 for neo-romantic
Fetched page 8 of 24 for neo-romantic
Fetched page 9 of 24 for neo-romantic
Fetched page 10 of 24 for neo-romantic
Error fetching data for 

In [33]:
full_df.head(10)

Unnamed: 0,artwork_type_title,style_title,id,image_id,title,search_term
0,Painting,Impressionism,64729,ef96e79b-f481-8114-0804-4bd39c101983,"Early Morning, Tarpon Springs",Impressionism
1,Sculpture,Post-Impressionism,719,d0a72b06-f1ee-4464-d58c-5e39bc0534b3,A Burgher of Calais (Jean d'Aire),Impressionism
2,Painting,Impressionism,138,e72305c9-1a1c-8a36-7450-582619366338,Flower Girl in Holland,Impressionism
3,Painting,Impressionism,70065,2e796bd8-4e0b-f55a-7c69-75a70a3e97d7,Afterglow,Impressionism
4,Painting,Impressionism,90899,9604cbbd-722b-8de3-e7cc-4a80be648d79,Lady in Green and Gray,Impressionism
5,Painting,Pointillism,27992,2d484387-2509-5e8e-2c43-22f9981972eb,A Sunday on La Grande Jatte — 1884,Impressionism
6,Painting,Post-Impressionism,13763,045a1779-36a4-282f-49b8-b8fb007f8254,Snow-Crowned Hills,Impressionism
7,Painting,Impressionism,111326,6f92b799-de7c-7603-3644-3934563703f2,"The Hippodrome, London",Impressionism
8,Painting,Impressionism,71573,a67c4473-57a4-9807-a94e-1136d3daf876,A Holiday,Impressionism
9,Print,Post-Impressionism,17229,3b374643-5328-3e00-c02b-5ab56e5ae8f8,The Scream,Impressionism


In [36]:
full_df['artwork_type_title'].value_counts()

artwork_type_title
Painting                   6739
Print                      4240
Sculpture                  2392
Photograph                 2227
Vessel                     1641
Drawing and Watercolor     1227
Textile                     737
Coin                        698
Decorative Arts             669
Costume and Accessories     461
Ceramics                    408
Architectural Drawing       318
Furniture                   315
Metalwork                   278
Mixed Media                 238
Book                        171
Mask                        171
Graphic Design              156
Glass                       153
Design                      140
Film, Video, New Media      117
Religious/Ritual Object     110
Architectural fragment      100
Installation                 96
Armor                        66
Miniature room               62
Arms                         61
Funerary Object              53
Archives (groupings)         43
Model                        41
Basketry             

Now filter for only paintings

In [34]:
filtered_full_df = full_df[full_df['artwork_type_title'] == 'Painting']
filtered_full_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6739 entries, 0 to 24125
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   artwork_type_title  6739 non-null   object
 1   style_title         3550 non-null   object
 2   id                  6739 non-null   int64 
 3   image_id            6679 non-null   object
 4   title               6739 non-null   object
 5   search_term         6739 non-null   object
dtypes: int64(1), object(5)
memory usage: 368.5+ KB


In [39]:
filtered_full_df.isna().sum()

artwork_type_title       0
style_title           3189
id                       0
image_id                60
title                    0
search_term              0
dtype: int64

Download the images we want.

In [42]:
import logging
import os
import time
from urllib.parse import urljoin
import requests
import pandas as pd

BASE_URL = "https://www.artic.edu/iiif/2/"
RATE_LIMIT = 60  # requests per minute
IMAGE_QUALITY = "full"
IMAGE_FORMAT = "jpg"
OUTPUT_DIR = 'downloaded_images'
MAX_RETRIES = 2  # Retry up to 2 times if the content is empty or there are errors
RETRY_DELAY = 2  # Delay (in seconds) between retries
REQUEST_DELAY = 1  # Delay (in seconds) between each request

# Set up logging
logging.basicConfig(filename='image_download.log', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')

def download_image(image_id, output_path):
    if not image_id:
        return False, "No image ID provided"
    
    url = urljoin(BASE_URL, f"{image_id}/full/{IMAGE_QUALITY}/0/default.{IMAGE_FORMAT}")
    for attempt in range(MAX_RETRIES + 1):  # Attempt download + retries
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            
            # Log content length and headers
            content_length = len(response.content)
            logging.info(f"Attempt {attempt + 1} - Content length: {content_length}, Headers: {response.headers}")
            
            # Check for non-empty content
            if content_length > 0:
                with open(output_path, 'wb') as f:
                    f.write(response.content)
                
                # Check that the file was written and is not empty
                if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                    return True, "Success"
                else:
                    logging.warning(f"Downloaded file {output_path} is empty.")
            else:
                logging.warning(f"Attempt {attempt + 1} - Empty content for {image_id}")
        
        except requests.exceptions.HTTPError as e:
            return False, f"HTTP error: {e}"
        except requests.exceptions.ConnectionError as e:
            return False, f"Connection error: {e}"
        except requests.exceptions.Timeout as e:
            return False, f"Timeout error: {e}"
        except requests.exceptions.RequestException as e:
            return False, f"Error downloading: {e}"

        # Wait before retrying if not the last attempt
        if attempt < MAX_RETRIES:
            logging.info(f"Retrying in {RETRY_DELAY} seconds (attempt {attempt + 2}) for {image_id}")
            time.sleep(RETRY_DELAY)

    return False, "Failed after maximum retries"

def download_all_images(df):
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    file_names = []
    total_images = len(df)
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
    
    for index, row in df.iterrows():
        image_id = row['image_id']
        if pd.isna(image_id) or not image_id:
            file_names.append(None)
            skipped_downloads += 1
            logging.warning(f"Skipped image at index {index}: No valid image_id")
            continue

        file_name = f"{image_id}.{IMAGE_FORMAT}"
        output_path = os.path.join(OUTPUT_DIR, file_name)

        success, message = download_image(image_id, output_path)
        if success:
            file_names.append(file_name)
            successful_downloads += 1
            logging.info(f"Downloaded: {file_name}")
        else:
            file_names.append(None)
            failed_downloads += 1
            logging.warning(f"Failed to download {image_id}: {message}")

        # Print progress every 100 images
        if (index + 1) % 100 == 0:
            print(f"Progress: {index + 1}/{total_images} images processed")
            print(f"Successful downloads: {successful_downloads}")
            print(f"Failed downloads: {failed_downloads}")
            print(f"Skipped downloads: {skipped_downloads}")
            logging.info(f"Progress: {index + 1}/{total_images} images processed. "
                         f"Successful: {successful_downloads}, Failed: {failed_downloads}, Skipped: {skipped_downloads}")

        # Throttle to respect rate limit and add a 1-second delay between requests
        time.sleep(1)

    return file_names, successful_downloads, failed_downloads, skipped_downloads

# Assuming filtered_full_df is your DataFrame
file_names, successful, failed, skipped = download_all_images(filtered_full_df)
filtered_full_df['file_name'] = file_names

print(f"Image download complete. Total successful downloads: {successful}, "
      f"Total failed: {failed}, Total skipped: {skipped}")
logging.info(f"Process completed. Successful downloads: {successful}, "
             f"Failed downloads: {failed}, Skipped downloads: {skipped}")


Progress: 100/6739 images processed
Successful downloads: 79
Failed downloads: 0
Skipped downloads: 0
Progress: 200/6739 images processed
Successful downloads: 155
Failed downloads: 0
Skipped downloads: 0
Progress: 300/6739 images processed
Successful downloads: 250
Failed downloads: 0
Skipped downloads: 0
Progress: 600/6739 images processed
Successful downloads: 337
Failed downloads: 0
Skipped downloads: 0
Progress: 2000/6739 images processed
Successful downloads: 406
Failed downloads: 0
Skipped downloads: 1
Progress: 2200/6739 images processed
Successful downloads: 424
Failed downloads: 0
Skipped downloads: 2
Progress: 2700/6739 images processed
Successful downloads: 511
Failed downloads: 0
Skipped downloads: 2
Progress: 2800/6739 images processed
Successful downloads: 584
Failed downloads: 0
Skipped downloads: 2
Progress: 2900/6739 images processed
Successful downloads: 656
Failed downloads: 0
Skipped downloads: 2
Progress: 3100/6739 images processed
Successful downloads: 728
Failed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_full_df['file_name'] = file_names


In [43]:
filtered_full_df.head()

Unnamed: 0,artwork_type_title,style_title,id,image_id,title,search_term,file_name
0,Painting,Impressionism,64729,ef96e79b-f481-8114-0804-4bd39c101983,"Early Morning, Tarpon Springs",Impressionism,ef96e79b-f481-8114-0804-4bd39c101983.jpg
2,Painting,Impressionism,138,e72305c9-1a1c-8a36-7450-582619366338,Flower Girl in Holland,Impressionism,e72305c9-1a1c-8a36-7450-582619366338.jpg
3,Painting,Impressionism,70065,2e796bd8-4e0b-f55a-7c69-75a70a3e97d7,Afterglow,Impressionism,2e796bd8-4e0b-f55a-7c69-75a70a3e97d7.jpg
4,Painting,Impressionism,90899,9604cbbd-722b-8de3-e7cc-4a80be648d79,Lady in Green and Gray,Impressionism,9604cbbd-722b-8de3-e7cc-4a80be648d79.jpg
5,Painting,Pointillism,27992,2d484387-2509-5e8e-2c43-22f9981972eb,A Sunday on La Grande Jatte — 1884,Impressionism,2d484387-2509-5e8e-2c43-22f9981972eb.jpg


In [44]:
filtered_full_df.to_csv("images_dataframe.csv", index=False)