In [1]:
import os
from datasets import load_dataset
from dotenv import load_dotenv
from collections import defaultdict
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import pandas as pd

In [2]:
load_dotenv("../../Show-o/.env")

True

In [3]:
TOKEN = os.getenv("HUGGINGFACE_KEY")

In [5]:
imagenet = load_dataset("imagenet-1k", token=TOKEN, split="train")

Loading dataset shards:   0%|          | 0/257 [00:00<?, ?it/s]

In [6]:
def process_batch(start, end):
    batch = imagenet[start:end]
    return Counter(batch["label"])

# Use ThreadPoolExecutor for parallel processing
def compute_label_distribution(dataset, num_workers=16, chunk_size=1000):
    label_counts = Counter()
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for i in tqdm(range(0, len(dataset), chunk_size), desc="Submitting jobs"):
            futures.append(executor.submit(process_batch, i, i + chunk_size))

        # Aggregate results as they complete
        for future in tqdm(futures, desc="Aggregating results"):
            label_counts.update(future.result())

    return label_counts

# Compute label distribution
label_distribution = compute_label_distribution(imagenet)

# Print the distribution
print(label_distribution)

Submitting jobs: 100%|██████████| 1282/1282 [00:01<00:00, 1275.76it/s]
Aggregating results: 100%|██████████| 1282/1282 [12:13<00:00,  1.75it/s]


Counter({726: 1300, 917: 1300, 13: 1300, 939: 1300, 6: 1300, 983: 1300, 655: 1300, 579: 1300, 702: 1300, 845: 1300, 69: 1300, 822: 1300, 575: 1300, 752: 1300, 219: 1300, 192: 1300, 191: 1300, 292: 1300, 848: 1300, 108: 1300, 372: 1300, 765: 1300, 473: 1300, 525: 1300, 639: 1300, 99: 1300, 127: 1300, 76: 1300, 905: 1300, 30: 1300, 634: 1300, 907: 1300, 979: 1300, 718: 1300, 154: 1300, 293: 1300, 9: 1300, 922: 1300, 130: 1300, 33: 1300, 968: 1300, 719: 1300, 840: 1300, 139: 1300, 198: 1300, 236: 1300, 304: 1300, 547: 1300, 215: 1300, 853: 1300, 805: 1300, 28: 1300, 104: 1300, 67: 1300, 311: 1300, 429: 1300, 941: 1300, 950: 1300, 603: 1300, 971: 1300, 486: 1300, 504: 1300, 497: 1300, 670: 1300, 459: 1300, 559: 1300, 829: 1300, 888: 1300, 773: 1300, 784: 1300, 274: 1300, 146: 1300, 245: 1300, 761: 1300, 256: 1300, 326: 1300, 264: 1300, 827: 1300, 690: 1300, 973: 1300, 91: 1300, 615: 1300, 301: 1300, 361: 1300, 614: 1300, 572: 1300, 92: 1300, 303: 1300, 799: 1300, 362: 1300, 222: 1300, 371:

In [8]:
df = pd.DataFrame(label_distribution.items(), columns=["label", "count"])

# Save to CSV
df.to_csv("imagenet_label_distribution.csv", index=False)

In [9]:
label_distribution = defaultdict(int)

for image_label in tqdm(imagenet):
    label_distribution[image_label["label"]] += 1

100%|██████████| 1281167/1281167 [31:20<00:00, 681.34it/s] 


In [10]:
label_distribution

defaultdict(int,
            {726: 1300,
             917: 1300,
             13: 1300,
             939: 1300,
             6: 1300,
             983: 1300,
             655: 1300,
             579: 1300,
             702: 1300,
             845: 1300,
             69: 1300,
             822: 1300,
             575: 1300,
             906: 1213,
             752: 1300,
             219: 1300,
             192: 1300,
             191: 1300,
             292: 1300,
             848: 1300,
             108: 1300,
             372: 1300,
             765: 1300,
             473: 1300,
             525: 1300,
             639: 1300,
             686: 1120,
             99: 1300,
             127: 1300,
             76: 1300,
             905: 1300,
             550: 1136,
             30: 1300,
             634: 1300,
             907: 1300,
             979: 1300,
             718: 1300,
             154: 1300,
             914: 1206,
             293: 1300,
             9: 1300,
        

In [12]:
max(label_distribution.items(), key= lambda x: x[1])

(726, 1300)

In [7]:
import csv
import json
import requests

# Download ImageNet class index mapping
url = "https://storage.googleapis.com/download.tensorflow.org/data/imagenet_class_index.json"
imagenet_classes = requests.get(url).json()

# Function to convert label index to text and replace underscores with spaces
def imagenet_label_to_text(label_index):
    text_label = imagenet_classes.get(str(label_index), ["Unknown", "Unknown"])[1]
    return text_label.replace("_", " ")

# Read the input CSV and create a new CSV with formatted text labels and count
input_file = "imagenet_label_distribution.csv"
output_file = "output.csv"

with open(input_file, mode="r") as infile, open(output_file, mode="w", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = ["text_label", "count"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for row in reader:
        text_label = imagenet_label_to_text(row["label"])
        writer.writerow({"text_label": text_label, "count": row["count"]})

print(f"New CSV file '{output_file}' created successfully!")


New CSV file 'output.csv' created successfully!
