In [2]:
import os
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# Folder that holds everything fetched from Wikidata
wikidata_dir = os.path.join(os.getcwd(), 'wikidata')
os.makedirs(wikidata_dir, exist_ok=True)

# Paintings that have an image and an Iconclass code starting with '11H'
query = """
SELECT ?painting ?image ?iconclass WHERE {
  ?painting wdt:P31 wd:Q3305213;        # instance of painting
           wdt:P1257 ?iconclass.        # has an Iconclass code
  ?painting wdt:P18 ?image.             # image filename if available
  FILTER(strstarts(?iconclass, '11H'))
}
"""

# Configure the Wikidata endpoint: JSON results over a GET request
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setRequestMethod('GET')
sparql.setReturnFormat(JSON)
sparql.setQuery(query)

# Run the query and pull out the result bindings
results = sparql.query().convert()
data = results['results']['bindings']

# Flatten every binding to its plain values, one dictionary per result row
data_list = [
  {field: item[field]['value'] for field in ('painting', 'image', 'iconclass')}
  for item in data
]

# One row per painting: drop_duplicates keeps the first row seen for each painting URI
df = pd.DataFrame(data_list).drop_duplicates(subset='painting')

# Persist the table; fields are quoted with a single quote instead of the default double quote
paintings_csv = os.path.join(wikidata_dir, 'paintings.csv')
df.to_csv(paintings_csv, index=False, quotechar="'")
print(f"Saved {len(df)} paintings to 'paintings.csv'")



Saved 2180 paintings to 'paintings.csv'


In [31]:
# Output folder for the derived dataset files; created on demand
wikidata_data_dir = os.path.join(os.getcwd(), 'wikidata-data')
os.makedirs(wikidata_data_dir, exist_ok=True)

# Load the query results, using the same single-quote quotechar they were written with
df = pd.read_csv(os.path.join(wikidata_dir, 'paintings.csv'), quotechar="'")

# Cut each iconclass code right after its first ')' (e.g. '11H(PAUL)11' -> '11H(PAUL)');
# codes that contain no ')' do not match the pattern and become NaN
df['iconclass'] = df['iconclass'].str.extract(r'([^\)]+\))', expand=False)

# The ten most frequent truncated codes (value_counts ignores NaN)
iconclass_counts = df['iconclass'].value_counts().head(10)
print(iconclass_counts)

# Keep only paintings in those ten classes, then keep one row per image URL
in_top_classes = df['iconclass'].isin(iconclass_counts.index)
df_filtered = df.loc[in_top_classes].drop_duplicates(subset='image')

wikidata_csv = os.path.join(wikidata_dir, 'wikidata.csv')
df_filtered.to_csv(wikidata_csv, index=False, quotechar="'")
print(f"Saved {len(df_filtered)} paintings to 'wikidata.csv'")

# Store the class counts next to the other derived files
pre_classes_csv = os.path.join(wikidata_data_dir, 'pre_classes.csv')
iconclass_counts.to_csv(pre_classes_csv, header=True)
print("Saved top 10 iconclass to 'pre_classes.csv'")

iconclass
11HH(MARY MAGDALENE)     177
11H(JOHN THE BAPTIST)    131
11H(JEROME)               78
11HH(CATHERINE)           76
11H(PETER)                68
11H(JOHN)                 51
11H(FRANCIS)              40
11H(ANTONY ABBOT)         38
11H(JOSEPH)               35
11H(PAUL)                 31
Name: count, dtype: int64
Saved 724 paintings to 'wikidata.csv'
Saved top 10 iconclass to 'pre_classes.csv'


In [35]:
import requests
from tqdm import tqdm
import json
from PIL import Image
import io
import numpy as np

# Create the directory to save images if it doesn't exist
jpeg_images_dir = os.path.join(wikidata_dir, 'JPEGImages')
os.makedirs(jpeg_images_dir, exist_ok=True)

# wikidata.csv is written with quotechar="'", so it has to be parsed with the
# same quote character; with the default ('"') any field the writer quoted
# would keep its quotes or be split on embedded commas.
images_df = pd.read_csv(os.path.join(wikidata_dir, 'wikidata.csv'), quotechar="'")

# Records (painting, image, iconclass) of the images that were downloaded successfully
image_data = []

# Size every saved image is resized to - 512x512 is a good balance for most models
# This size works well for the 512 model and the 384 models can downscale during processing
target_size = (512, 512)

# Enhanced function to download and resize an image from a URL
def download_image(url, save_path, max_pixels=178956970, target_size=(512, 512), timeout=30):
  """
  Download an image, convert it to RGB, resize it and save it as a JPEG.

  Args:
    url: URL of the image to download
    save_path: Path where the JPEG will be written
    max_pixels: Maximum number of pixels allowed (width × height); a larger
      image is first shrunk to about this many pixels, keeping its aspect ratio
    target_size: Optional (width, height) tuple; when truthy the image is
      resized to exactly this size (the aspect ratio is not preserved)
    timeout: Seconds to wait on the server before giving up, so that a single
      stalled request cannot block the whole download run

  Returns:
    True if the image was downloaded and saved, False otherwise. Failures are
    printed, never raised.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

  try:
    # The context manager releases the connection on every exit path
    with requests.get(url, headers=headers, timeout=timeout) as response:
      if response.status_code != 200:
        print(f"Failed to download {url}: HTTP status code {response.status_code}")
        return False

      # Keep the encoded bytes in memory; named `payload` so it cannot be
      # confused with the module-level `image_data` list
      payload = io.BytesIO(response.content)

    with Image.open(payload) as source:
      # Convert to RGB if needed (handles PNG, RGBA, etc.) - JPEG cannot store alpha
      img = source if source.mode == 'RGB' else source.convert('RGB')

      width, height = img.size
      num_pixels = width * height

      # Shrink oversized images while keeping their aspect ratio.
      # NOTE: Pillow runs its own decompression-bomb check when the file is
      # opened; the "exceeds limit of 178956970 pixels" failures logged for
      # very large images come from that check, so such images never get here.
      if num_pixels > max_pixels:
        ratio = width / height
        if ratio > 1:
          new_width = int(np.sqrt(max_pixels * ratio))
          new_height = int(new_width / ratio)
        else:
          new_height = int(np.sqrt(max_pixels / ratio))
          new_width = int(new_height * ratio)

        img = img.resize((new_width, new_height), Image.LANCZOS)

      # If a target size is specified, resize to that size
      if target_size:
        img = img.resize(target_size, Image.LANCZOS)

      try:
        img.save(save_path, 'JPEG', quality=95)
      except Exception:
        # Do not leave a truncated file behind: later steps treat any file
        # that exists in the image directory as a valid download
        if os.path.exists(save_path):
          os.remove(save_path)
        raise
      return True

  except Exception as e:
    # Best effort: report the failure and let the caller move on to the next image
    print(f"Failed to download {url}: {e}")
    return False

# Download every listed painting; each file is named after the last segment of the painting URI
json_path = os.path.join(wikidata_dir, 'wikidata.json')

for idx, row in tqdm(images_df.iterrows(), total=len(images_df)):
  # iconclass_counts comes from an earlier cell; `in` on a Series tests its index labels
  if row['iconclass'] not in iconclass_counts:
    continue

  filename = row['painting'].split('/')[-1] + '.jpg'
  save_path = os.path.join(jpeg_images_dir, filename)

  # max_pixels is the limit Pillow reports in its decompression-bomb error
  success = download_image(
    row['image'],
    save_path,
    max_pixels=178956970,
    target_size=target_size
  )

  # Remember only the images that were actually written to disk
  if success:
    image_data.append({key: row[key] for key in ('painting', 'image', 'iconclass')})

  # Checkpoint the collected records whenever the row number is a multiple of 50
  if (idx + 1) % 50 == 0:
    with open(json_path, 'w') as f:
      json.dump(image_data, f)

# Final write so that records collected after the last checkpoint are saved too
with open(json_path, 'w') as f:
  json.dump(image_data, f)

print("Image download complete.")

 21%|██        | 149/724 [02:19<33:14,  3.47s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/Conversion%20of%20Paul%20%28Bruegel%29.jpg: Image size (1078805910 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


 40%|███▉      | 289/724 [05:35<1:02:13,  8.58s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/Hm%2054-oberrheinischer%20meister%20um%20141020-das%20paradiesgartlein-1410.png: Image size (346536264 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


 57%|█████▋    | 414/724 [08:26<14:55,  2.89s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/El%20Descendimiento%2C%20by%20Rogier%20van%20der%20Weyden%2C%20from%20Prado%20in%20Google%20Earth.jpg: Image size (698310000 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


 77%|███████▋  | 558/724 [11:25<09:12,  3.33s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/Giovanni%20Bellini%20-%20Saint%20Francis%20in%20the%20Desert%20-%20Google%20Art%20Project.jpg: Image size (789570000 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


 80%|████████  | 580/724 [11:53<03:02,  1.27s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/Ambrosius%20Benson%20-%20Hieronymus%20-%20MMB.0043%20-%20Museum%20Mayer%20van%20den%20Bergh.tiff: can't concat tuple to bytes


 80%|████████  | 581/724 [11:55<03:44,  1.57s/it]

Failed to download http://commons.wikimedia.org/wiki/Special:FilePath/Atelier%20van%20Joachim%20Patinir%20-%20Rotslandschap%20met%20Hieronymus%20-%20MMB.0030%20-%20Museum%20Mayer%20van%20den%20Bergh.tiff: can't concat tuple to bytes


100%|██████████| 724/724 [14:52<00:00,  1.23s/it]

Image download complete.





In [4]:
import os
import json
import pandas as pd

wikidata_dir = os.path.join(os.getcwd(), 'wikidata')
wikidata_data_dir = os.path.join(os.getcwd(), 'wikidata-data')
jpeg_images_dir = os.path.join(wikidata_dir, 'JPEGImages')

# Top iconclass codes stored in pre_classes.csv (printed for reference only)
top_classes_df = pd.read_csv(os.path.join(wikidata_data_dir, 'pre_classes.csv'))
top_classes = top_classes_df['iconclass'].to_list()
print(top_classes)

# Read the filtered painting list (wikidata.csv, single-quote quotechar)
image_data = pd.read_csv(os.path.join(wikidata_dir, 'wikidata.csv'), quotechar="'")

# Pair each painting's id (its URI without the entity prefix) with its iconclass
entity_prefix = 'http://www.wikidata.org/entity/'
labelled_ids = (
  (uri.replace(entity_prefix, ''), label)
  for uri, label in zip(image_data['painting'], image_data['iconclass'])
)

# Keep only the paintings whose JPEG is present in the JPEGImages directory
ground_truth = [
  {'item': image_filename, 'class': label}
  for image_filename, label in labelled_ids
  if os.path.exists(os.path.join(jpeg_images_dir, f'{image_filename}.jpg'))
]
test_images = [entry['item'] for entry in ground_truth]

# Write the test images to 2_test.txt, one id per line
with open(os.path.join(wikidata_data_dir, '2_test.txt'), 'w') as f:
  f.writelines(f"{image}\n" for image in test_images)

# Write the ground truth data to 2_ground_truth.json
with open(os.path.join(wikidata_data_dir, '2_ground_truth.json'), 'w') as f:
  json.dump(ground_truth, f)

print("Files 2_test.txt and 2_ground_truth.json have been created.")

['11HH(MARY MAGDALENE)', '11H(JOHN THE BAPTIST)', '11H(JEROME)', '11HH(CATHERINE)', '11H(PETER)', '11H(JOHN)', '11H(FRANCIS)', '11H(ANTONY ABBOT)', '11H(JOSEPH)', '11H(PAUL)']
Files 2_test.txt and 2_ground_truth.json have been created.
