In [15]:
import os
import requests
import zipfile
from tqdm import tqdm

# Define the folder and URL
folder_name = 'ICONCLASS'
url = 'https://iconclass.org/testset/779ba2ca9e977c58d818e3823a676973.zip'
zip_file_name = 'iconclass.zip'

# Check if the folder exists
if not os.path.exists(folder_name):
  # Download the zip file with progress bar
  response = requests.get(url, stream=True)
  total_size = int(response.headers.get('content-length', 0))
  block_size = 1024  # 1 Kibibyte
  t = tqdm(total=total_size, unit='iB', unit_scale=True)
  with open(zip_file_name, 'wb') as file:
    for data in response.iter_content(block_size):
      t.update(len(data))
      file.write(data)
  t.close()
  
  # Extract the zip file
  with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(folder_name)
  
  # Remove the zip file
  os.remove(zip_file_name)

print(f"'{folder_name}' folder is ready.")

'ICONCLASS' folder is ready.


In [16]:
import os
import shutil

# Define the new folder name
jpeg_folder_name = os.path.join(folder_name, 'JPEGImages')

# Create the new folder if it doesn't exist
if not os.path.exists(jpeg_folder_name):
  os.makedirs(jpeg_folder_name)

# Move all .jpg images to the new folder
for file_name in os.listdir(folder_name):
  if file_name.endswith('.jpg'):
    shutil.move(os.path.join(folder_name, file_name), os.path.join(jpeg_folder_name, file_name))

print(f"All .jpg images have been moved to '{jpeg_folder_name}' folder.")

All .jpg images have been moved to 'ICONCLASS/JPEGImages' folder.


In [17]:
import pandas as pd
import os
import json

source_path = os.path.join(folder_name, 'data.json')

# Load the JSON file
with open(source_path, 'r') as file:
  data = json.load(file)

# Count the number of images for each value
image_counts = {}
for codes in data.values():
  for code in codes:
    image_counts[code] = image_counts.get(code, 0) + 1

# Convert to DataFrame and sort in descending order
image_counts_df = pd.DataFrame(list(image_counts.items()), columns=['ID', 'ImageCount'])
image_counts_df = image_counts_df.sort_values(by='ImageCount', ascending=False)
image_counts_df = image_counts_df.head(10)
image_counts_df.reset_index(drop=True, inplace=True)

print(image_counts_df)

                ID  ImageCount
0  61B:31A2212(+1)        7423
1        61B:31D14        7391
2         61B(+52)        5214
3          31A2422        5195
4          31A2421        5169
5         61B(+53)        4628
6             49M3        4431
7       31D15:61BB        4334
8           31A235        3758
9              25I        3392


In [18]:
import pandas as pd
import os
import json

new_folder_name = 'ICONCLASS-data'

# Load the JSON file
with open(source_path, 'r') as file:
  data = json.load(file)

# Filter for saints (11H, 11HH, and 11F codes) - images with exactly one saint
filtered_keys = {k: v for k, v in data.items() if any(code.startswith('11H(') or code.startswith('11HH(') or code.startswith('11F(') for code in v)}

# Keep only images with exactly one saint code (can have other non-saint codes)
single_saint_keys = {}
for k, v in filtered_keys.items():
    saint_codes = [code for code in v if code.startswith('11H(') or code.startswith('11HH(') or code.startswith('11F(')]
    if len(saint_codes) == 1:  # Exactly one saint code (other codes allowed)
        single_saint_keys[k] = saint_codes

print(f'Total images with saint codes: {len(filtered_keys)}')
print(f'Images with exactly one saint: {len(single_saint_keys)}')
print(f'Images with multiple saints: {len(filtered_keys) - len(single_saint_keys)}')

# Now count images per class using only single-saint images
image_counts = {}
for codes in single_saint_keys.values():
  for code in codes:
    image_counts[code] = image_counts.get(code, 0) + 1

# Convert to DataFrame and sort in descending order
image_counts_df = pd.DataFrame(list(image_counts.items()), columns=['ID', 'ImageCount'])
image_counts_df = image_counts_df.sort_values(by='ImageCount', ascending=False)

# Remove characters after the closing round bracket and collapse IDs
image_counts_df['ID'] = image_counts_df['ID'].str.extract(r'(^.*?\))')[0]
image_counts_df = image_counts_df.groupby('ID', as_index=False).sum()
image_counts_df = image_counts_df.sort_values(by='ImageCount', ascending=False)

# Keep only top 10 classes
image_counts_df = image_counts_df.drop(image_counts_df.index[10:])
image_counts_df.reset_index(drop=True, inplace=True)

print(f"\nNumber of classes: {len(image_counts_df)}")
print("Single-saint images per class:")
print(image_counts_df)

# Store image_counts_df in ICONCLASS-data/pre_classes.csv
if not os.path.exists(new_folder_name):
    os.makedirs(new_folder_name)
image_counts_df.to_csv(os.path.join(new_folder_name, 'pre_classes.csv'), index=False)

# Filter the keys to keep only those that belong to the top 10 classes
final_filtered_keys = {}
for k, v in single_saint_keys.items():
    # Extract base saint ID from the code
    saint_code = v[0]
    base_saint_id = saint_code.split(')')[0] + ')'
    if base_saint_id in image_counts_df['ID'].values:
        final_filtered_keys[k] = base_saint_id

print(f'\nFinal dataset contains {len(final_filtered_keys)} single-saint images')

# Define the path for the ground truth JSON file
ground_truth = [{'item': k.replace('.jpg', ''), 'class': v} for k, v in final_filtered_keys.items()]
ground_truth_path = os.path.join(new_folder_name, '2_ground_truth.json')
with open(ground_truth_path, 'w') as file:
  json.dump(ground_truth, file, indent=2)
print(f"Ground truth data has been saved to '{ground_truth_path}'.")

# Store test file
test_keys_path = os.path.join(new_folder_name, '2_test.txt')
with open(test_keys_path, 'w') as file:
  for key in final_filtered_keys.keys():
    file.write(f"{key.replace('.jpg', '')}\n")
print(f"Filtered keys have been saved to '{test_keys_path}'.")

Total images with saint codes: 2903
Images with exactly one saint: 2495
Images with multiple saints: 408

Number of classes: 10
Single-saint images per class:
                     ID  ImageCount
0             11H(PAUL)         129
1  11HH(MARY MAGDALENE)         127
2           11H(JEROME)         126
3             11H(JOHN)          95
4     11H(ANTONY ABBOT)          83
5            11H(PETER)          72
6          11H(MATTHEW)          70
7       11HH(CATHERINE)          63
8             11H(LUKE)          51
9          11H(FRANCIS)          46

Final dataset contains 863 single-saint images
Ground truth data has been saved to 'ICONCLASS-data/2_ground_truth.json'.
Filtered keys have been saved to 'ICONCLASS-data/2_test.txt'.


In [19]:
# Analysis of Ground Truth File - Count images per class
import pandas as pd
import json
import os

# Load the ground truth file
ground_truth_path = os.path.join('ICONCLASS-data', '2_ground_truth.json')
with open(ground_truth_path, 'r') as file:
    ground_truth_data = json.load(file)

print(f"Total images in ground truth: {len(ground_truth_data)}")

# Count images per class
class_counts = {}
for entry in ground_truth_data:
    class_name = entry['class']
    class_counts[class_name] = class_counts.get(class_name, 0) + 1

# Convert to DataFrame and sort by count
class_counts_df = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Number_of_Images'])
class_counts_df = class_counts_df.sort_values(by='Number_of_Images', ascending=False)
class_counts_df.reset_index(drop=True, inplace=True)

print("\nNumber of images per class in ground truth:")
print(class_counts_df)

# Summary statistics
print(f"\nSummary:")
print(f"Total classes: {len(class_counts_df)}")
print(f"Average images per class: {class_counts_df['Number_of_Images'].mean():.1f}")
print(f"Min images per class: {class_counts_df['Number_of_Images'].min()}")
print(f"Max images per class: {class_counts_df['Number_of_Images'].max()}")
print(f"Total images: {class_counts_df['Number_of_Images'].sum()}")

Total images in ground truth: 863

Number of images per class in ground truth:
                  Class  Number_of_Images
0             11H(PAUL)               129
1           11H(JEROME)               127
2  11HH(MARY MAGDALENE)               127
3             11H(JOHN)                95
4     11H(ANTONY ABBOT)                83
5            11H(PETER)                72
6          11H(MATTHEW)                70
7       11HH(CATHERINE)                63
8             11H(LUKE)                51
9          11H(FRANCIS)                46

Summary:
Total classes: 10
Average images per class: 86.3
Min images per class: 46
Max images per class: 129
Total images: 863
