In [4]:
import os
import pandas as pd

## Utility functions for traversing image data

In [5]:
def get_image_paths(root_path):
    """Recursively collect the path of every file located under ``root_path``.

    No extension filtering is applied: despite the name, every regular file
    encountered by ``os.walk`` is returned, not only images.

    Args:
        root_path: Directory to traverse. With ``os.walk``'s default error
            handling, a missing or unreadable directory yields no entries, so
            the result is an empty list rather than an exception.

    Returns:
        list[str]: One ``os.path.join(dirpath, filename)`` entry per file,
        sorted lexicographically so the result does not depend on the order in
        which the filesystem happens to list directory entries.
    """
    # os.walk yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the resulting index reproducible across machines and runs.
    return sorted(
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root_path)
        for filename in filenames
    )


def create_image_dataframe(root_path):
    """Index the files found under ``root_path`` as a pandas DataFrame.

    Args:
        root_path: Directory handed to ``get_image_paths`` for traversal.

    Returns:
        pandas.DataFrame: One row per discovered file with two columns:
        ``image_name`` (the file's base name) and ``image_path`` (its
        absolute path). The frame is empty when no files are found.
    """
    names = []
    absolute_paths = []
    # Single pass over the discovered files, filling both columns in lockstep.
    for found_path in get_image_paths(root_path):
        names.append(os.path.basename(found_path))
        absolute_paths.append(os.path.abspath(found_path))

    return pd.DataFrame({'image_name': names, 'image_path': absolute_paths})

Empty DataFrame
Columns: [image_name, image_path]
Index: []


## Index the data and write per-split metadata files

In [14]:
# Root of the ILSVRC CLS-LOC image tree, and the directory that receives one
# pickled file index per dataset split.
data_dir_path = "../data/ILSVRC/Data/CLS-LOC/"
metadata_output_dir_path = "../data/ILSVRC/Metadata"

# DataFrame.to_pickle does not create missing parent directories, so make sure
# the output directory exists before the first write (no-op if already there).
os.makedirs(metadata_output_dir_path, exist_ok=True)

for dataset in ["train", "val", "test"]:
    print(f"Processing {dataset} data..")
    dataset_root_path = os.path.join(data_dir_path, dataset)
    metadata_output_path = os.path.join(metadata_output_dir_path, f"{dataset}.pkl")

    # One row per file found under the split directory (image_name, image_path).
    metadata_df = create_image_dataframe(dataset_root_path)
    print(metadata_output_path)
    metadata_df.to_pickle(metadata_output_path)


Processing train data..
../data/ILSVRC/Metadata/train.pkl
Processing val data..
../data/ILSVRC/Metadata/val.pkl
Processing test data..
../data/ILSVRC/Metadata/test.pkl


## Count number of images

In [13]:
# Reload the pickled index of each split and report how many rows it contains.
# Depends on `metadata_output_dir_path` having been set by the indexing cell.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
for dataset in ("train", "val", "test"):
    metadata_path = os.path.join(
        metadata_output_dir_path,
        f"{dataset}.pkl",
    )
    metadata_df = pd.read_pickle(metadata_path)

    print(f"Number of images in {dataset} is {len(metadata_df)}")

Number of images in train is 1281167
Number of images in val is 50000
Number of images in test is 100000
