In [1]:
import pandas as pd
from huggingface_hub import HfApi, list_models, list_datasets
import json
from datetime import datetime
from typing import Any
import os
import time
# Data was collected on Feb 15th, 2025

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed huggingface_hub version (shown as this cell's output).
import huggingface_hub
huggingface_hub.__version__

'0.28.1'

In [3]:
# Hub API client shared by the model and dataset collection loops.
hf_api = HfApi()

In [4]:
# Task names queried on the Hub and grouped under "nlp" in the output folders.
nlp = [
    'text-classification',
    'token-classification',
    'table-question-answering',
    'question-answering',
    'zero-shot-classification',
    'translation',
    'summarization',
    'feature-extraction',
    'text-generation',
    'text2text-generation',
    'fill-mask',
    'sentence-similarity',
    # Currently disabled tasks:
    # 'table-to-text',
    # 'multiple-choice',
    # 'text-retrieval'
]

In [5]:
# Task names queried on the Hub and grouped under "cv" in the output folders.
cv = [
    'depth-estimation',
    'image-classification',
    'object-detection',
    'image-segmentation',
    'text-to-image',
    'image-to-text',
    'image-to-image',
    'image-to-video',
    'unconditional-image-generation',
    'video-classification',
    'text-to-video',
    'zero-shot-image-classification',
    'mask-generation',
    'zero-shot-object-detection',
    'text-to-3d',
    'image-to-3d',
    'image-feature-extraction',
    'keypoint-detection',
]

In [None]:
# Sanity check: number of NLP tasks followed by number of CV tasks.
print(f"{len(nlp)} {len(cv)}")

In [10]:
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that also accepts datetimes and attribute-bearing objects.

    * ``datetime`` values are written as ISO 8601 strings.
    * Any other object that exposes ``__dict__`` (e.g. RepoSibling) is written
      as that attribute dictionary.
    * Everything else is handed to ``json.JSONEncoder.default``.
    """

    def default(self, obj: Any):
        """Return a JSON-serializable stand-in for ``obj``."""
        if isinstance(obj, datetime):
            return obj.isoformat()
        if not hasattr(obj, '__dict__'):
            # Nothing we know how to convert: defer to the base implementation.
            return super().default(obj)
        return obj.__dict__

##### Model collection

##### NLP

In [None]:
# Download metadata for every model of each NLP task and store one JSON file per model.
for nlp_task in sorted(nlp):
    folder_path = f'../../../data/huggingface/model/nlp/{nlp_task}'
    os.makedirs(folder_path, exist_ok=True)  # Create the folder if it does not exist
    models_generator = hf_api.list_models(filter=nlp_task, full=True, cardData=True)  # Get the generator object
    print(nlp_task)
    try:
        for model in models_generator:
            # Slash is not allowed in a file name, so replace it with an underscore.
            file_name = f"{model._id.replace('/', '_')}.json"
            with open(f'{folder_path}/{file_name}', 'w') as json_file:
                json.dump(model.__dict__, json_file, indent=4, cls=CustomEncoder)
    except Exception as err:
        # Best-effort collection: report the failure and move on to the next task.
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt stop the run.
        print(f'{nlp_task}: collection stopped early: {err!r}')

##### CV

In [None]:
# Download metadata for every model of each CV task and store one JSON file per model.
for cv_task in sorted(cv):
    folder_path = f'../../../data/huggingface/model/cv/{cv_task}'
    os.makedirs(folder_path, exist_ok=True)  # Create the folder if it does not exist
    models_generator = hf_api.list_models(filter=cv_task, full=True, cardData=True)
    print(cv_task)
    try:
        for model in models_generator:
            # Slash is not allowed in a file name, so replace it with an underscore.
            file_name = f"{model._id.replace('/', '_')}.json"
            with open(f'{folder_path}/{file_name}', 'w') as json_file:
                json.dump(model.__dict__, json_file, indent=4, cls=CustomEncoder)
    except Exception as err:
        # Best-effort collection: report the failure and move on to the next task.
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt stop the run.
        print(f'{cv_task}: collection stopped early: {err!r}')

##### Dataset collection

##### NLP

In [None]:
# Store one JSON file per dataset, grouped in a folder per NLP task.
for nlp_task in sorted(nlp):
    folder_path = f'../../../data/huggingface/dataset/nlp/{nlp_task}'
    os.makedirs(folder_path, exist_ok=True)  # no-op when the folder already exists
    datasets_generator = hf_api.list_datasets(task_categories=nlp_task, full=True)
    print(nlp_task)
    for dataset in datasets_generator:
        # '/' cannot appear in a file name, so it is swapped for '_'.
        file_name = dataset._id.replace('/', '_') + '.json'
        with open(f'{folder_path}/{file_name}', 'w') as json_file:
            json.dump(dataset.__dict__, json_file, indent=4, cls=CustomEncoder)

##### CV

In [11]:
# Store one JSON file per dataset, grouped in a folder per CV task (under cv_new).
for cv_task in sorted(cv):
    folder_path = f'../../../data/huggingface/dataset/cv_new/{cv_task}'
    os.makedirs(folder_path, exist_ok=True)  # no-op when the folder already exists
    datasets_generator = hf_api.list_datasets(task_categories=cv_task, full=True)
    print(cv_task)
    for dataset in datasets_generator:
        # '/' cannot appear in a file name, so it is swapped for '_'.
        file_name = dataset._id.replace('/', '_') + '.json'
        with open(f'{folder_path}/{file_name}', 'w') as json_file:
            json.dump(dataset.__dict__, json_file, indent=4, cls=CustomEncoder)

depth-estimation
image-classification
image-feature-extraction
image-segmentation
image-to-3d
image-to-image
image-to-text
image-to-video
keypoint-detection
mask-generation
object-detection
text-to-3d
text-to-image
text-to-video
unconditional-image-generation
video-classification
zero-shot-image-classification
zero-shot-object-detection


In [None]:
# Inspect the `base_model` entry of one saved JSON file.
# pprint is not imported in the top import cell, so import it here to avoid a NameError.
from pprint import pprint

with open('nomic-ai_nomic-embed-text-v2-moe_feature-extraction.json', 'r') as file:
    data = json.load(file)
pprint(data['cardData']['base_model'])