In [2]:
from glob import glob

f_path = "/media/logan/m.2/datasets/image/imagenet-dataset/2010/devkit-1.0/data/meta.mat"
tar_files = glob("/media/logan/m.2/datasets/image/imagenet-dataset/2010/train/*.tar")

In [3]:
! pip install scipy



In [4]:
import os
import scipy.io

mat_data = scipy.io.loadmat(f_path)
metadata_map = {}
tar_file_map = {}

for metadata in mat_data['synsets']:
    metadata_map[metadata[0][1][0]] = {"metadata":metadata, "label":metadata[0][2][0].split(",")[0]}

for tar_file in tar_files:
    tar_file_map[os.path.basename(tar_file).split(".")[0]] = tar_file
    
for key in tar_file_map.keys():
    if key in  metadata_map.keys():
        metadata_map[key]['tar_file'] = tar_file_map[key]

In [5]:
metadata_map['n01484850']

{'metadata': array([(array([[488]], dtype=uint16), array(['n01484850'], dtype='<U9'), array(['great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias'],
              dtype='<U83'), array(['large aggressive shark widespread in warm seas; known to attack humans'],
              dtype='<U70'), array([[0]], dtype=uint8), array([], shape=(1, 0), dtype=uint8), array([[0]], dtype=uint8), array([[1242]], dtype=uint16))                 ],
       dtype=[('ILSVRC2010_ID', 'O'), ('WNID', 'O'), ('words', 'O'), ('gloss', 'O'), ('num_children', 'O'), ('children', 'O'), ('wordnet_height', 'O'), ('num_train_images', 'O')]),
 'label': 'great white shark'}

In [6]:
import os
import tarfile

def untar_into_directory(tar_file, target_directory):
    """
    Untar a tar file into a target directory.

    Args:
        tar_file (str): Path to the tar file.
        target_directory (str): Path to the target directory where the contents will be extracted.
    
    Returns:
        None
    """
    # Create the target directory if it doesn't exist
    os.makedirs(target_directory, exist_ok=True)

    # Open the tar file for reading
    with tarfile.open(tar_file, 'r') as tar:
        # Extract all the contents into the target directory
        tar.extractall(path=target_directory)


In [7]:
from tqdm.contrib.concurrent import process_map

root_path = "/media/logan/m.2/datasets/image/imagenet-dataset/2010/train"

def create_directory_for_tar_file(metadata):
    if 'tar_file' in metadata.keys():
        sub_folder = os.path.join(root_path, metadata['label'])
        metadata['sub_folder'] = sub_folder
        untar_into_directory(metadata['tar_file'], sub_folder)
    return metadata

updated_map = process_map(create_directory_for_tar_file, metadata_map.values(), max_workers=8, chunksize=4)


  0%|          | 0/1676 [00:00<?, ?it/s]

100%|██████████| 1676/1676 [00:00<00:00, 6711.69it/s]


In [8]:
updated_map[0]

{'metadata': array([(array([[1]], dtype=uint8), array(['n07711080'], dtype='<U9'), array(['french fries, french-fried potatoes, fries, chips'], dtype='<U49'), array(['strips of potato fried in deep fat'], dtype='<U34'), array([[0]], dtype=uint8), array([], shape=(1, 0), dtype=uint8), array([[0]], dtype=uint8), array([[1487]], dtype=uint16))],
       dtype=[('ILSVRC2010_ID', 'O'), ('WNID', 'O'), ('words', 'O'), ('gloss', 'O'), ('num_children', 'O'), ('children', 'O'), ('wordnet_height', 'O'), ('num_train_images', 'O')]),
 'label': 'french fries'}

In [9]:
# now this is making the metadata file for the dataset

images = glob("/media/logan/m.2/datasets/image/imagenet-dataset/2010/train/*/*.JPEG")

len(images)

1261406

In [10]:
image_label_paris = []

for image in images:
    image_label_paris.append((image, os.path.basename(os.path.dirname(image))))

In [11]:
image_label_paris[0]

('/media/logan/m.2/datasets/image/imagenet-dataset/2010/train/bean/n13136316_16239.JPEG',
 'bean')

In [15]:
from PIL import Image

def check_image(image_label):
    image, label = image_label
    try:
        img = Image.open(image)
        if img.mode not in ['RGB', 'L']:  # 'L' for grayscale, 'RGB' for standard color
            return None
        return image_label
    except:
        return None



from tqdm.contrib.concurrent import process_map
from tqdm import tqdm


new_pairs = []
for pair in tqdm(image_label_paris):
    new_pair  = check_image(pair)
    if new_pair is not None:
        new_pairs.append(new_pair)

100%|██████████| 1261406/1261406 [01:11<00:00, 17738.58it/s]


In [16]:
len(new_pairs), len(image_label_paris)

(1261390, 1261406)

In [17]:
import pandas as pd

df = pd.DataFrame(new_pairs, columns=['image', 'label'])

In [18]:
df

Unnamed: 0,image,label
0,/media/logan/m.2/datasets/image/imagenet-datas...,bean
1,/media/logan/m.2/datasets/image/imagenet-datas...,bean
2,/media/logan/m.2/datasets/image/imagenet-datas...,bean
3,/media/logan/m.2/datasets/image/imagenet-datas...,bean
4,/media/logan/m.2/datasets/image/imagenet-datas...,bean
...,...,...
1261385,/media/logan/m.2/datasets/image/imagenet-datas...,seashore
1261386,/media/logan/m.2/datasets/image/imagenet-datas...,seashore
1261387,/media/logan/m.2/datasets/image/imagenet-datas...,seashore
1261388,/media/logan/m.2/datasets/image/imagenet-datas...,seashore


In [19]:
df_train = df.sample(frac=0.8, random_state=42)

df_eval = df.drop(df_train.index)
# df_eval = df_eval.sample(frac=0.5, random_state=42)

In [20]:
len(df_train), len(df_eval)

(1009112, 252278)

In [32]:
df_train.to_csv("/media/logan/m.2/datasets/image/imagenet-dataset/2010/train.csv", index=False, sep="|")
df_eval.to_csv("/media/logan/m.2/datasets/image/imagenet-dataset/2010/eval.csv", index=False, sep="|")