In [1]:
import os
import requests
from tqdm import tqdm

# Fetch the ArtDL archive from Zenodo unless a complete copy is already on disk.
download_url = "https://zenodo.org/record/6473001/files/ArtDL.zip"
local_file_name = "ArtDL.zip"
# Stream into a side file first: an interrupted transfer must never leave a
# truncated 'ArtDL.zip' behind, because the existence check below would then
# treat it as a finished download on the next run.
partial_file_name = local_file_name + ".part"

if os.path.exists(local_file_name):
  print(f"The file '{local_file_name}' already exists. Skipping download.")

else:
  print(f"Downloading the dataset from {download_url}...")

  # The timeout bounds connection setup and every read so the cell cannot hang forever.
  with requests.get(download_url, stream=True, timeout=60) as response:
    response.raise_for_status()

    # Take the size from the GET response itself. requests.get follows redirects,
    # whereas requests.head does not by default, so a separate HEAD request can
    # report no content-length and leave the progress bar without a total.
    # A missing header becomes None, which tqdm treats as "total unknown".
    file_size = int(response.headers.get("content-length", 0)) or None

    try:
      with tqdm(total=file_size, unit="B", unit_scale=True, desc=local_file_name) as pbar:
        with open(partial_file_name, "wb") as file:
          for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
            pbar.update(len(chunk))  # Update progress bar
    except BaseException:
      # Also covers a kernel interrupt: discard the incomplete file, then re-raise.
      if os.path.exists(partial_file_name):
        os.remove(partial_file_name)
      raise

  # Only a fully written file is moved to its final name.
  os.replace(partial_file_name, local_file_name)

  print(f"Dataset downloaded and saved as '{local_file_name}'")

Downloading the dataset from https://zenodo.org/record/6473001/files/ArtDL.zip...


ArtDL.zip: 3.60GB [03:00, 19.9MB/s]                 

Dataset downloaded and saved as 'ArtDL.zip'





In [1]:
import zipfile
import os

zip_file = "ArtDL.zip"
extract_dir = "dataset"

# Unpack the downloaded archive into extract_dir, or ask for the download
# to be run first when the archive is not on disk.
if os.path.exists(zip_file):
    print(f"Extracting '{zip_file}' to '{extract_dir}'...")
    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(path=extract_dir)
    print(f"Extraction complete. Files are saved in '{extract_dir}'")
else:
    print(f"The file '{zip_file}' does not exist. Please download it first.")

Extracting 'ArtDL.zip' to 'dataset'...
Extraction complete. Files are saved in 'dataset'


In [2]:
import pandas as pd
import tabulate

csv_file_path = 'dataset/ArtDL/ArtDL.csv'

# Each entry pairs a column name used in the CSV with the readable class name
# shown in the tables produced below.
classes = [
  ("11H(ANTONY OF PADUA)", "ANTHONY OF PADUA"),
  ("11H(JOHN THE BAPTIST)", "JOHN THE BAPTIST"),
  ("11H(PAUL)", "PAUL"),
  ("11H(FRANCIS)", "FRANCIS OF ASSISI"),
  ("11HH(MARY MAGDALENE)", "MARY MAGDALENE"),
  ("11H(JEROME)", "JEROME"),
  ("11H(DOMINIC)", "SAINT DOMINIC"),
  ("11F(MARY)", "VIRGIN MARY"),
  ("11H(PETER)", "PETER"),
  ("11H(SEBASTIAN)", "SAINT SEBASTIAN")
]

# Persist the pairs, one "column,name" line per class, for later cells to reload
with open('classes.txt', 'w') as file:
  file.writelines(f"{pair[0]},{pair[1]}\n" for pair in classes)

def organize_df(df):
  """Rename class columns to their readable names and fix the row/column order.

  `df` must contain a "set" column and one column per first element of the
  module-level `classes` pairs. The result is indexed by "set", keeps only the
  renamed class columns in sorted order, and lists the rows as train, val, test.
  """
  readable_names = {entry[0]: entry[1] for entry in classes}
  renamed = df.rename(columns=readable_names)
  by_split = renamed.set_index("set")
  ordered_columns = sorted(readable_names.values())
  return by_split[ordered_columns].loc[["train", "val", "test"]]

df = pd.read_csv(csv_file_path)

columns_to_keep = [cls[0] for cls in classes]
# .copy() detaches the selection from the frame it was taken from, so the
# column assignments further down modify an independent object.
df = df[columns_to_keep + ['item', 'set']].copy()

# Plain per-split sums of every class column (despite the variable name,
# nothing is normalized at this point).
df_normalized = df.drop(columns=['item']).groupby('set').sum().reset_index()
df_normalized = organize_df(df_normalized)

print("Table of classes:")
print(tabulate.tabulate(df_normalized, headers='keys', tablefmt='pretty'))
df_normalized.to_csv('1_ArtDL_classes.csv')
df[columns_to_keep] = df[columns_to_keep].astype(float)

# Weight each row by its own label total: every class value is divided by the
# sum of the row's class columns, so a row with a positive total contributes
# exactly 1 spread across its classes.
labels_per_item = df[columns_to_keep].sum(axis=1)
# Rows whose total is not positive are divided by 1, i.e. left unchanged.
df[columns_to_keep] = df[columns_to_keep].div(labels_per_item.where(labels_per_item > 0, 1), axis=0)

df = df.drop(columns=['item']).groupby('set').sum().reset_index()

# Round before casting: the sums are floats built from fractions such as 1/3,
# and a bare int cast truncates, so a total that lands a hair below a whole
# number because of floating-point error would lose a full count.
df[columns_to_keep] = df[columns_to_keep].round().astype(int)

df = organize_df(df)
print("Table of classes with weights:")
print(tabulate.tabulate(df, headers='keys', tablefmt='pretty'))
df.to_csv('1_ArtDL_classes_weighted.csv')

Table of classes:
+-------+------------------+-------------------+--------+------------------+----------------+------+-------+---------------+-----------------+-------------+
|  set  | ANTHONY OF PADUA | FRANCIS OF ASSISI | JEROME | JOHN THE BAPTIST | MARY MAGDALENE | PAUL | PETER | SAINT DOMINIC | SAINT SEBASTIAN | VIRGIN MARY |
+-------+------------------+-------------------+--------+------------------+----------------+------+-------+---------------+-----------------+-------------+
| train |       170        |       1220        |  1285  |       1497       |      1949      | 754  | 1471  |      387      |       628       |    15566    |
|  val  |        22        |        144        |  151   |       154        |      235       |  91  |  176  |      47       |       74        |    1920     |
| test  |        22        |        142        |  154   |       159        |      238       |  94  |  178  |      47       |       75        |    1913     |
+-------+------------------+------------

In [3]:
import os
import requests
import pandas as pd

# Download the test.txt file
test_txt_url = "https://raw.githubusercontent.com/iFede94/ArtDL/refs/heads/main/sets/test.txt"
test_txt_file = "2_test.txt"

response = requests.get(test_txt_url, timeout=30)
# Stop on an HTTP error instead of saving the error page as the list of test items.
response.raise_for_status()
with open(test_txt_file, 'wb') as file:
  file.write(response.content)

with open(test_txt_file, 'r') as file:
  test_items = file.read().splitlines()

csv_file_path = 'dataset/ArtDL/ArtDL.csv'
df = pd.read_csv(csv_file_path)

# Check that every test item is listed in the CSV. The item column is turned
# into a set once so each lookup does not rescan the whole frame.
known_items = set(df['item'])
missing_items = [item for item in test_items if item not in known_items]

if missing_items:
  print("Missing items:")
  for missing in missing_items:
    print(missing)
else:
  print("All items in test.txt exist in ArtDL.csv")


# Check that every test item has a matching .jpg in the image folder
image_dir = "dataset/ArtDL/JPEGImages"
missing_files = []

for item in test_items:
  file_name = f"{item}.jpg"
  if not os.path.exists(os.path.join(image_dir, file_name)):
    missing_files.append(file_name)

if missing_files:
  print("Missing image files:")
  for missing in missing_files:
    print(missing)
else:
  print("All image files exist in JPEGImages folder")

# Report the size of the test split regardless of the outcome of the checks above.
num_rows = len(test_items)
print(f"Number of rows in test.txt: {num_rows}")

All items in test.txt exist in ArtDL.csv
All image files exist in JPEGImages folder
Number of rows in test.txt: 1864


In [5]:
import os
import json
import pandas as pd

# Reload the (column, readable name) pairs written to classes.txt.
# Blank lines are skipped so they cannot turn into an empty column name.
with open('classes.txt', 'r') as file:
  classes = [tuple(line.strip().split(',')) for line in file if line.strip()]

image_dir = "dataset/ArtDL/JPEGImages"
json_file_path = "2_ground_truth.json"
csv_file_path = "dataset/ArtDL/ArtDL.csv"
test_file = "2_test.txt"

df = pd.read_csv(csv_file_path)

# Index the rows by item once instead of filtering the whole frame for every
# test item. drop_duplicates keeps the first row per item, which is the row a
# first-match lookup would pick.
rows_by_item = df.drop_duplicates(subset='item').set_index('item')

ground_truth_data = []

with open(test_file, 'r') as file:
  test_items = file.read().splitlines()

# Process each image in the test file
for item in test_items:
  if item not in rows_by_item.index:
    print(f"Warning: No matching row found in CSV for item '{item}'. Skipping...")
    continue

  row = rows_by_item.loc[item]

  # Record the first class, in classes.txt order, whose column is 1 for this item
  for cls in classes:
    if row[cls[0]] == 1:
      ground_truth_data.append({
        "item": item,
        "class": cls[0]
      })
      break
  else:
    # No listed class is set: say so instead of dropping the item silently.
    print(f"Warning: Item '{item}' has none of the listed classes set to 1. Skipping...")

with open(json_file_path, 'w') as json_file:
  json.dump(ground_truth_data, json_file, indent=4)

print(f"Ground truth data has been saved to {json_file_path}")

Ground truth data has been saved to 2_ground_truth.json
