In [1]:
import os
import requests
import zipfile
from tqdm import tqdm

# Define the folder and URL
folder_name = 'ICONCLASS'
url = 'https://iconclass.org/testset/779ba2ca9e977c58d818e3823a676973.zip'
zip_file_name = 'iconclass.zip'

# Download and unpack the archive only when the dataset folder is missing,
# so re-running this cell does not download again.
if not os.path.exists(folder_name):
  try:
    # Stream the download. The timeout keeps a stalled connection from
    # blocking the kernel forever.
    with requests.get(url, stream=True, timeout=60) as response:
      # Fail on HTTP errors instead of saving an error page as the zip file
      response.raise_for_status()
      # A missing content-length header yields 0, i.e. a bar without a total
      total_size = int(response.headers.get('content-length', 0))
      block_size = 1024  # 1 Kibibyte
      with open(zip_file_name, 'wb') as file:
        with tqdm(total=total_size, unit='iB', unit_scale=True) as t:
          for data in response.iter_content(block_size):
            t.update(len(data))
            file.write(data)

    # Extract the zip file
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
      zip_ref.extractall(folder_name)
  finally:
    # Remove the zip file, also when the download or the extraction failed,
    # so no partial archive is left behind
    if os.path.exists(zip_file_name):
      os.remove(zip_file_name)

print(f"'{folder_name}' folder is ready.")

'ICONCLASS' folder is ready.


In [2]:
import os
import shutil

# Destination sub-folder that will hold the JPEG files of the dataset
jpeg_folder_name = os.path.join(folder_name, 'JPEGImages')

# Make sure the destination exists before anything is moved into it
if not os.path.exists(jpeg_folder_name):
  os.makedirs(jpeg_folder_name)

# Relocate every top-level entry whose name ends in '.jpg' (case-sensitive match)
jpg_names = [entry for entry in os.listdir(folder_name) if entry.endswith('.jpg')]
for jpg_name in jpg_names:
  source_file = os.path.join(folder_name, jpg_name)
  target_file = os.path.join(jpeg_folder_name, jpg_name)
  shutil.move(source_file, target_file)

print(f"All .jpg images have been moved to '{jpeg_folder_name}' folder.")

All .jpg images have been moved to 'ICONCLASS/JPEGImages' folder.


In [3]:
import pandas as pd
import os
import json

source_path = os.path.join(folder_name, 'data.json')

# Load the JSON file. JSON text is UTF-8, so the encoding is given explicitly
# instead of relying on the platform's default locale encoding.
with open(source_path, 'r', encoding='utf-8') as file:
  data = json.load(file)

# Count how often each code occurs across all entries of `data`
# (a code is counted once per occurrence in an entry's list)
image_counts = {}
for codes in data.values():
  for code in codes:
    image_counts[code] = image_counts.get(code, 0) + 1

# Build the frequency table and keep the 10 most frequent codes.
# The chain returns a new frame at each step, so no frame is mutated in place.
image_counts_df = (
  pd.DataFrame(list(image_counts.items()), columns=['ID', 'ImageCount'])
  .sort_values(by='ImageCount', ascending=False)
  .head(10)
  .reset_index(drop=True)
)

print(image_counts_df)

                ID  ImageCount
0  61B:31A2212(+1)        7423
1        61B:31D14        7391
2         61B(+52)        5214
3          31A2422        5195
4          31A2421        5169
5         61B(+53)        4628
6             49M3        4431
7       31D15:61BB        4334
8           31A235        3758
9              25I        3392


In [4]:
import pandas as pd
import os
import json

new_folder_name = 'ICONCLASS-data'

# Make sure the output folder exists; every file written below goes into it
os.makedirs(new_folder_name, exist_ok=True)

# Load the JSON file (JSON text is UTF-8; do not rely on the locale default)
with open(source_path, 'r', encoding='utf-8') as file:
  data = json.load(file)

# Get top 10 male and female saints with the most images.
# Saint codes are recognised by one of these prefixes.
saint_prefixes = ('11H(', '11HH(')

# Keep only the images that carry at least one saint code
filtered_keys = {k: v for k, v in data.items() if any(code.startswith(saint_prefixes) for code in v)}

# Count the number of occurrences of each saint code
image_counts = {}
for codes in filtered_keys.values():
  for code in codes:
    if code.startswith(saint_prefixes):
      image_counts[code] = image_counts.get(code, 0) + 1

image_counts_df = pd.DataFrame(list(image_counts.items()), columns=['ID', 'ImageCount'])

# Remove characters after the closing round bracket and collapse IDs:
# all codes sharing the same '...(NAME)' head are summed into one row
image_counts_df['ID'] = image_counts_df['ID'].str.extract(r'(^.*?\))')[0]
image_counts_df = image_counts_df.groupby('ID', as_index=False).sum()
image_counts_df = image_counts_df.sort_values(by='ImageCount', ascending=False)

# Keep only the top 10 classes
image_counts_df = image_counts_df.head(10).reset_index(drop=True)

print(f"Number of classes: {len(image_counts_df)}")
print(image_counts_df)

# Store image_counts_df in ICONCLASS-data/pre_classes.csv
image_counts_df.to_csv(os.path.join(new_folder_name, 'pre_classes.csv'), index=False)

# Restrict every image to the codes that are one of the selected class IDs and
# drop the images left without any such code. The IDs are put in a set once so
# the membership test is not recomputed from the DataFrame for every code.
# NOTE(review): only codes exactly equal to a collapsed ID match here; codes
# with extra characters after the closing bracket were counted in ImageCount
# above but are not kept as labels. Confirm this is intended.
top_class_ids = set(image_counts_df['ID'])
filtered_keys = {
  k: [code for code in v if code in top_class_ids]
  for k, v in filtered_keys.items()
  if any(code in top_class_ids for code in v)
}

# Check how many images have multiple labels
multi_label_count = sum(1 for v in filtered_keys.values() if len(v) > 1)
print(f'The number of images with multiple labels is {multi_label_count}')

# Remove multi-label images from filtered_keys
filtered_keys = {k: v for k, v in filtered_keys.items() if len(v) == 1}

# Build and store the ground truth. Every remaining image has exactly one
# label at this point, so the class is always the single list element.
ground_truth = [{'item': k.replace('.jpg', ''), 'class': v[0]} for k, v in filtered_keys.items()]
ground_truth_path = os.path.join(new_folder_name, '2_ground_truth.json')
with open(ground_truth_path, 'w', encoding='utf-8') as file:
  json.dump(ground_truth, file, indent=2)
print(f"Ground truth data has been saved to '{ground_truth_path}'.")

# Store test file: one image identifier (file name without '.jpg') per line
test_keys_path = os.path.join(new_folder_name, '2_test.txt')
with open(test_keys_path, 'w', encoding='utf-8') as file:
  for key in filtered_keys:
    file.write(f"{key.replace('.jpg', '')}\n")
print(f"Filtered keys have been saved to '{test_keys_path}'.")

print(f'The ground truth data contains {len(filtered_keys)} images')

Number of classes: 10
                     ID  ImageCount
0             11H(PAUL)         178
1           11H(JEROME)         158
2  11HH(MARY MAGDALENE)         153
3             11H(JOHN)         132
4            11H(PETER)         128
5       11HH(CATHERINE)         116
6     11H(ANTONY ABBOT)         109
7          11H(MATTHEW)          94
8          11H(FRANCIS)          78
9             11H(MARK)          73
The number of images with multiple labels is 70
Ground truth data has been saved to 'ICONCLASS-data/2_ground_truth.json'.
Filtered keys have been saved to 'ICONCLASS-data/2_test.txt'.
The ground truth data contains 570 images
