In [20]:
import requests
import zipfile
import io
import os
from tqdm import tqdm

# Download the Portrait Painting dataset archive and unpack it into
# `dataset_path`, unless that folder already exists.
dataset_name = "Portrait_Painting"
dataset_path = os.path.join(os.getcwd(), dataset_name)

# URL of the dataset
url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/289kxpnp57-1.zip"

# The archive is large (the progress bar reports ~898M), so it is streamed to
# a file on disk instead of being held in memory. Extraction goes to a staging
# folder that is renamed into place only on success, so an interrupted run is
# never mistaken for a finished download by the existence check below.
archive_path = dataset_path + ".zip.part"
staging_path = dataset_path + ".extracting"

if os.path.exists(dataset_path):
  print("The dataset is already downloaded.")

else:
  print("Downloading the dataset...")

  try:
    # (connect timeout, read timeout) in seconds; without it a stalled
    # connection would hang the cell forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
      download_ok = response.status_code == 200
      if download_ok:
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024 * 1024  # 1 MiB chunks keep the loop overhead low
        with open(archive_path, 'wb') as archive_file:
          # total=None makes tqdm show a plain counter when the size is unknown
          with tqdm(total=total_size or None, unit='iB', unit_scale=True) as t:
            for chunk in response.iter_content(block_size):
              archive_file.write(chunk)
              t.update(len(chunk))

    if download_ok:
      os.makedirs(staging_path, exist_ok=True)
      with zipfile.ZipFile(archive_path) as z:
        z.extractall(staging_path)
      # Publish the dataset only once every member has been extracted.
      os.rename(staging_path, dataset_path)
      print("Download and extraction complete.")
    else:
      print("Failed to download the dataset.")
  finally:
    # The archive is only an intermediate artefact; remove it even on failure.
    if os.path.exists(archive_path):
      os.remove(archive_path)

Downloading the dataset...


100%|██████████| 898M/898M [00:30<00:00, 29.3MiB/s] 


Download and extraction complete.


In [21]:
import pandas as pd
import os

# Build `df`, a table mapping every image (file name without extension) to
# its art movement (the name of the sub-folder of `orgImg` it lives in), and
# persist it as `art_movements.csv` inside the dataset folder.
org_img_path = os.path.join(dataset_path, 'orgImg')
dataset_data_path = os.path.join(os.getcwd(), f'{dataset_name}-data')
os.makedirs(dataset_data_path, exist_ok=True)

# Define the CSV file path
csv_file_path = os.path.join(dataset_path, 'art_movements.csv')

# List to store the data
data = []

# Traverse the directories and files (sorted so the row order is reproducible;
# os.listdir returns entries in arbitrary order)
for movement in sorted(os.listdir(org_img_path)):
  movement_path = os.path.join(org_img_path, movement)
  if os.path.isdir(movement_path):
    for filename in sorted(os.listdir(movement_path)):
      if filename.endswith('.jpg'):
        data.append([os.path.splitext(filename)[0], movement])

if data or not os.path.exists(csv_file_path):
  # Create a DataFrame from the data and write it to the CSV file
  df = pd.DataFrame(data, columns=['filename', 'art movement'])
  df.to_csv(csv_file_path, index=False)
  print(f"CSV file created at {csv_file_path}")
else:
  # No images were found under orgImg (e.g. they have already been moved out
  # by a later cell). Rewriting the CSV now would replace the manifest with an
  # empty table, so reload the existing one instead. dtype=str and
  # keep_default_na=False keep the values as the plain strings that were written.
  df = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False)
  print(f"No images found in {org_img_path}; reusing existing CSV at {csv_file_path}")

CSV file created at /home/ubuntu/gspinaci/LLM-test/dataset/Portrait_Painting/art_movements.csv


In [22]:
import shutil

# Flatten the dataset: move every .jpg from the per-movement sub-folders of
# `org_img_path` into a single `JPEGimages` folder.

# Define the destination directory
jpeg_images_path = os.path.join(dataset_path, 'JPEGimages')
os.makedirs(jpeg_images_path, exist_ok=True)

# Source paths that were left in place because the destination name was taken
skipped_paths = []

# Traverse the directories and move the images
for movement in os.listdir(org_img_path):
  movement_path = os.path.join(org_img_path, movement)
  if not os.path.isdir(movement_path):
    continue
  for filename in os.listdir(movement_path):
    if not filename.endswith('.jpg'):
      continue
    src_path = os.path.join(movement_path, filename)
    dst_path = os.path.join(jpeg_images_path, filename)
    if os.path.exists(dst_path):
      # shutil.move may silently overwrite an existing file; two movements
      # sharing a file name would therefore lose an image. Keep the source
      # where it is and report it instead.
      skipped_paths.append(src_path)
      continue
    shutil.move(src_path, dst_path)

if skipped_paths:
  print(f"{len(skipped_paths)} image(s) were NOT moved because a file with the same name already exists in {jpeg_images_path}:")
  for skipped_path in skipped_paths:
    print(f"  {skipped_path}")
else:
  print(f"All images have been moved to {jpeg_images_path}")

All images have been moved to /home/ubuntu/gspinaci/LLM-test/dataset/Portrait_Painting/JPEGimages


In [27]:
# Tally how many images belong to each art movement, plus the grand total.
movement_labels = df.loc[:, 'art movement']
art_movement_counts = movement_labels.value_counts()
total_images = art_movement_counts.sum()

# Show the per-movement breakdown first, then the overall figure.
print(art_movement_counts)
print(f"\nTotal number of images: {total_images}")

art movement
impress            200
postImpress        170
highRenaiss        162
NorthernRenaiss    147
Ukiyo              127
rococo             116
Name: count, dtype: int64

Total number of images: 922
