In [1]:
from glob import glob

# Root of the local ILSVRC2012 download. Both paths below are derived from it so the
# location only has to be changed in one place.
IMAGENET_ROOT = "/media/logan/m.2/datasets/image/imagenet-dataset/2012"

# Devkit metadata file (.mat) that is loaded to read the 'synsets' table.
f_path = IMAGENET_ROOT + "/ILSVRC2012_devkit_t12/data/meta.mat"
# One tar archive per training class. glob() makes no ordering guarantee, so sort to
# keep the list identical from run to run.
tar_files = sorted(glob(IMAGENET_ROOT + "/train/*.tar"))

In [2]:
len(tar_files)

1000

In [None]:
! pip install scipy

In [3]:
import os
import scipy.io

# Load the devkit metadata and index its 'synsets' rows by synset ID
# (e.g. 'n01484850'); the label is the first comma-separated name of the row.
mat_data = scipy.io.loadmat(f_path)
metadata_map = {
    row[0][1][0]: {"label": row[0][2][0].split(",")[0]}
    for row in mat_data['synsets']
}

# Index the training archives by file name up to the first dot.
tar_file_map = {
    os.path.basename(archive).split(".")[0]: archive
    for archive in tar_files
}

# Attach the archive path to every synset that has one; archives whose name is not
# a known synset ID are ignored.
for synset_id, archive in tar_file_map.items():
    if synset_id in metadata_map:
        metadata_map[synset_id]['tar_file'] = archive

In [4]:
metadata_map['n01484850']

{'label': 'great white shark',
 'tar_file': '/media/logan/m.2/datasets/image/imagenet-dataset/2012/train/n01484850.tar'}

In [5]:
import os
import tarfile

def untar_into_directory(tar_file, target_directory):
    """
    Untar a tar file into a target directory, refusing members that would escape it.

    Args:
        tar_file (str): Path to the tar file.
        target_directory (str): Path to the target directory where the contents will be
            extracted. It is created (including parents) if it does not exist.

    Returns:
        None

    Raises:
        tarfile.TarError: If the archive cannot be read, or if a member would be
            written outside ``target_directory``.
    """
    # Create the target directory if it doesn't exist
    os.makedirs(target_directory, exist_ok=True)

    # Open the tar file for reading
    with tarfile.open(tar_file, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters are available: 'data' rejects absolute paths,
            # '..' traversal and other unsafe members, and avoids the deprecation
            # warning that an unfiltered extractall() emits on newer Pythons.
            tar.extractall(path=target_directory, filter='data')
        else:
            # Older interpreter without extraction filters: check every member's
            # resolved destination before anything is written.
            root = os.path.realpath(target_directory)
            for member in tar.getmembers():
                destination = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, destination]) != root:
                    raise tarfile.TarError(
                        "refusing to extract %r outside of %r" % (member.name, target_directory)
                    )
            tar.extractall(path=target_directory)


In [6]:
import os
os.path.basename(metadata_map['n01484850']['tar_file']).split(".")[0]

'n01484850'

In [7]:
from tqdm.contrib.concurrent import process_map

root_path = "/media/logan/m.2/datasets/image/imagenet-dataset/2012/train"

def create_directory_for_tar_file(metadata):
    """Extract one entry's archive into a same-named folder under ``root_path``.

    Args:
        metadata (dict): A value of ``metadata_map``; only its optional
            'tar_file' path is used.

    Returns:
        str or None: The folder the archive was extracted into, or None when
        the entry has no 'tar_file'.
    """
    if 'tar_file' not in metadata:
        return None

    archive = metadata['tar_file']
    # The folder is named after the archive file name up to its first dot.
    folder_name = os.path.basename(archive).split(".")[0]
    destination = os.path.join(root_path, folder_name)
    untar_into_directory(archive, destination)
    return destination

# Extract all archives in parallel. Results come back in the iteration order of
# metadata_map.values(); entries with no 'tar_file' key produce None, because
# create_directory_for_tar_file returns nothing for them.
updated_map = process_map(create_directory_for_tar_file, metadata_map.values(), max_workers=8, chunksize=4)


  0%|          | 0/1860 [00:00<?, ?it/s]

In [14]:
updated_map[0], metadata_map['n01443537']

('/media/logan/m.2/datasets/image/imagenet-dataset/2012/train/n02119789',
 {'label': 'goldfish',
  'tar_file': '/media/logan/m.2/datasets/image/imagenet-dataset/2012/train/n01443537.tar'})

In [10]:
# Build the metadata file for the dataset: start by collecting every .JPEG that
# sits exactly one directory level below train/.

images = glob("/media/logan/m.2/datasets/image/imagenet-dataset/2012/train/*/*.JPEG")

len(images)

1281167

In [12]:
# Pair every image path with the name of the directory that contains it; that
# directory name is used as the label.
image_label_paris = [
    (path, os.path.basename(os.path.dirname(path))) for path in images
]

In [13]:
image_label_paris[0]

('/media/logan/m.2/datasets/image/imagenet-dataset/2012/train/n03930630/n03930630_24612.JPEG',
 'n03930630')

In [15]:
from PIL import Image

def check_image(image_label):
    """Return ``image_label`` if its image passes verification, else None.

    Args:
        image_label (tuple): ``(image_path, label)`` pair. Only the path is
            inspected; the pair is returned unchanged on success.

    Returns:
        tuple or None: The input pair when the file opens, passes
        ``Image.verify()`` and has mode 'RGB' or 'L'; otherwise None.
    """
    image, _ = image_label
    try:
        # The context manager releases the underlying file on every path,
        # including when verify() raises.
        with Image.open(image) as img:
            img.verify()
            if img.mode not in ('RGB', 'L'):  # 'L' for grayscale, 'RGB' for standard color
                return None
    except Exception:
        # Best-effort filter: any failure to open or verify marks the image as bad.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long scan can still be stopped.
        return None
    return image_label



from tqdm.contrib.concurrent import process_map
from tqdm import tqdm


# Run every (image, label) pair through check_image and keep only the pairs it
# returns; tqdm shows progress as the pairs are consumed.
new_pairs = [
    checked
    for checked in map(check_image, tqdm(image_label_paris))
    if checked is not None
]

100%|██████████| 1281167/1281167 [03:29<00:00, 6128.32it/s] 


In [16]:
len(new_pairs), len(image_label_paris)

(1281144, 1281167)

In [None]:
new_pairs[0]

In [17]:
# Map each folder name that still has at least one valid image to its label.
folder_label_map = {
    folder: metadata_map[folder]['label'] for _, folder in new_pairs
}

In [18]:
folder_label_map['n01443537']

'goldfish'

In [19]:
import json
# Persist the folder-name -> label mapping as JSON in the current working directory.
with open('folder_label_map.json', 'w') as fp:
    json.dump(folder_label_map, fp)

In [20]:
import pandas as pd

df = pd.DataFrame(new_pairs, columns=['image', 'label'])

In [21]:
df

Unnamed: 0,image,label
0,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
1,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
2,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
3,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
4,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
...,...,...
1281139,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
1281140,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
1281141,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
1281142,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941


In [22]:
# Write the (image, label) table as a pipe-delimited file without the index column.
df.to_csv('train.csv', index=False, sep="|")

In [4]:
from glob import glob

# Collect every .JPEG that sits exactly one directory level below val/.
valid_images = glob("/media/logan/m.2/datasets/image/imagenet-dataset/2012/val/*/*.JPEG")

len(valid_images)

50000

In [5]:
from PIL import Image

def check_image(image):
    """Return ``image`` if the file passes verification, else None.

    Note: this definition takes a bare path and replaces any earlier
    ``check_image`` defined in the same kernel.

    Args:
        image (str): Path to the image file.

    Returns:
        str or None: The input path when the file opens, passes
        ``Image.verify()`` and has mode 'RGB' or 'L'; otherwise None.
    """
    try:
        # The context manager releases the underlying file on every path,
        # including when verify() raises.
        with Image.open(image) as img:
            img.verify()
            if img.mode not in ('RGB', 'L'):  # 'L' for grayscale, 'RGB' for standard color
                return None
    except Exception:
        # Best-effort filter: any failure to open or verify marks the image as bad.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long scan can still be stopped.
        return None
    return image



from tqdm.contrib.concurrent import process_map
from tqdm import tqdm


# Run every validation image path through check_image and keep only the paths it
# returns; tqdm shows progress as the paths are consumed.
new_pairs = [
    checked
    for checked in map(check_image, tqdm(valid_images))
    if checked is not None
]

100%|██████████| 50000/50000 [00:04<00:00, 11304.38it/s]


In [6]:
len(new_pairs)

49999

In [9]:
import os
import pandas as pd
# Pair each validated image path with the name of the directory that contains it.
image_label_pairs_val = [
    (path, os.path.basename(os.path.dirname(path))) for path in new_pairs
]
    
len(image_label_pairs_val)

df_val = pd.DataFrame(image_label_pairs_val, columns=['image', 'label'])

df_val

Unnamed: 0,image,label
0,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
1,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
2,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
3,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
4,/media/logan/m.2/datasets/image/imagenet-datas...,n03930630
...,...,...
49994,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
49995,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
49996,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941
49997,/media/logan/m.2/datasets/image/imagenet-datas...,n03207941


In [10]:
# Write the validation (image, label) table as a pipe-delimited file without the index column.
df_val.to_csv('eval.csv', index=False, sep="|")

In [None]:
s