In [59]:
import time
import pandas as pd
import json

from huggingface_hub import HfApi
from huggingface_hub import hf_hub_url, get_hf_file_metadata
from huggingface_hub.utils import GatedRepoError


def find_model_size(repo_files, modelId):
    """
    Find the size of a model given its ID, considering only files of the highest priority extension present.
   
    Args:
        modelId: The ID of the model.
   
    Returns:
        The size of the model or None if not found.
    """
    api_token = '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'
    priority_extensions = ['.bin', '.safetensors', '.pth', '.h5', '.gguf', '.pt', '.ckpt', '.npz', '.model', '.onnx',
                           '.mgspack', '.pkl', '.whisper', '.nemo', '.joblib', '.pb', '.npy', '.cleanrl_model',
                           '.data-00000-of-00001', '.weights', '.ptl', '.binary', 'CNN_model', '.msgpack', '.addon',
                           '.mdl', '.tgt', '.src', '.tar', '.zip', '.tar.xz']
    # Filter files by allowed extensions
    model_files = [file for file in repo_files if any(file.endswith(ext) for ext in priority_extensions)]
    if not model_files:
        print(f"No model files found for {modelId}")
        return None
   
    for ext in priority_extensions:
        files_with_ext = [file for file in model_files if file.endswith(ext)]
        if files_with_ext:
            try:
                # Sum the sizes of all files with this extension
                file_size = sum(get_hf_file_metadata(hf_hub_url(repo_id=modelId, filename=model_file), token=api_token).size for model_file in files_with_ext)
                return file_size
            except GatedRepoError:
                print(f'Need authorization to retrieve model size from {modelId}.')
                return None
            except Exception as e:
                print(f"Error retrieving file metadata for {modelId} with extension {ext}: {e}")
                return None  # Return None to indicate failure

    print(f"Could not retrieve model size for {modelId}")
    return None


def retrieve_model_datasets(model):
    """
    Retrieve the datasets used by a given model.
    
    Args:
        model: The model object.
    
    Returns:
        A list of datasets used by the model.
    """

    if model.card_data and 'datasets' in model.card_data:
        if type(model.cardData["datasets"]) is list:
            datasets = model.cardData["datasets"]
        else:
            datasets = model.cardData["datasets"]
    else:
        datasets = ['']

    return datasets


def find_datasets_size(datasets):
    """
    Find the size of datasets used by a given model.
    
    Args:
        datasets: A list of datasets.
    
    Returns:
        The total size of the datasets or None if not found.
    """    
    
    datasets_size = 0
    if datasets is None:
        return None
    
    api_token = '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'

    for dataset in datasets:
        try:
            datasets_size += api.dataset_info(dataset,token=api_token).cardData["dataset_info"]["dataset_size"]
        except:
            pass

    return datasets_size

        
def api_calls_parameters(model):
    """
    Get size, datasets size, and creation date from API calls.

    Args:
        model: The model object.
        datasets: A list of datasets.

    Returns:
        A tuple containing size, datasets_size, and created_at.
    """

    commits = size = created_at = None
    api_token = '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'

    try:
        files = api.list_repo_files(repo_id=model.modelId, token=api_token)
    except GatedRepoError:
        print(
            f'Need authorization to retrieve files and commits from {model.modelId}')
        files = 'needs authorization'
    except Exception as e:
        print(
            f'Unexpected error on retrieving "files" for {model.modelId}:', str(e))
        files = None
    
    try:
        commits = api.list_repo_commits(repo_id=model.modelId, token=api_token)
        commits = [{**commit.__dict__, "created_at": commit.created_at.strftime(
            '%Y-%m-%dT%H:%M:%S.%fZ')[:-3]+'Z'} for commit in commits]
        created_at = commits[-1]['created_at']
        commits = json.dumps(commits)
    except GatedRepoError:
        commits = 'not authorized'
    except Exception as e:
        print(
            f'Unexpected error on retrieving "commits" for {model.modelId}:', str(e))
        commits = None

    return files, commits, size, created_at


def process_model(model):
    """
    Process a model and extract relevant information.

    Args:
    model: A tuple containing the model object.

    Returns:
        A dictionary containing the processed model information.
    """

    idx, model_info = model
    
    datasets = retrieve_model_datasets(model_info)
    datasets_size = find_datasets_size(datasets)
    
    print(datasets)
    
    files = commits = size = created_at = model_size = None

    try:
        # Call the function for each model
        files, commits, size, created_at = api_calls_parameters(model_info)

        # Continue with the rest of your logic
        if files:
            model_size = find_model_size(files, model_info.modelId)
            print(f"Model {idx}, Size: {model_size} bytes")
        else:
            print(f"Model {idx}, Unable to retrieve repo files")

    except Exception as e:
        print(f'Error processing model {str(e)}')

    return {'modelId': model_info.modelId,
                'size': model_size,
                'datasets': datasets,
                'datasets_size': datasets_size,
                'downloads': model_info.downloads,
                'likes': model_info.likes,
                'library_name': model_info.library_name,
                'created_at': created_at,
                'commits': commits,
                'files': files}

  from .autonotebook import tqdm as notebook_tqdm


# Models extraction

In [60]:
import requests
import pandas as pd

# Your API token
api_token = "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"

url = "https://huggingface.co/api/models"

# Set headers with authentication token
headers = {
    "Authorization": f"Bearer {api_token}"
}

# Parameters for pagination
limit = 5000  # Maximum models per request
total_models = 5000  # Total models to fetch
offset = 0
models = []

# Loop to fetch models in batches
while len(models) < total_models:
    params = {
        "limit": limit,
        "offset": offset,
        "sort": "downloads",  # Sort by downloads
        "direction": -1,      # Descending order (-1 for descending)
    }
    response = requests.get(url, headers=headers, params=params)

    # Check for successful response
    if response.status_code == 200:
        model_data = response.json()
        models.extend(model_data)
        
        # Stop if fewer than requested models are returned (end of list)
        if len(model_data) < limit:
            break
    else:
        print(f"Failed to fetch data: {response.status_code} - {response.text}")
        break
    
    # Update offset for the next batch
    offset += limit

# Truncate the list to exactly 400 models if necessary
models = models[:total_models]

# Extract model IDs or other attributes (e.g., 'modelId', 'downloads') into an array
top_models = [{"modelId": model["modelId"], "downloads": model["downloads"]} for model in models]

# Output the array of top models
print(len(top_models))
print(top_models)

1000
[{'modelId': 'sentence-transformers/all-mpnet-base-v2', 'downloads': 428575767}, {'modelId': 'sentence-transformers/all-MiniLM-L6-v2', 'downloads': 92113137}, {'modelId': 'nesaorg/benchmark_v0', 'downloads': 84327724}, {'modelId': 'google-bert/bert-base-uncased', 'downloads': 65405294}, {'modelId': 'FacebookAI/xlm-roberta-large', 'downloads': 65395971}, {'modelId': 'openai/whisper-large-v2', 'downloads': 44353532}, {'modelId': 'Qwen/Qwen2.5-1.5B-Instruct', 'downloads': 37797318}, {'modelId': 'openai/clip-vit-base-patch32', 'downloads': 25332456}, {'modelId': 'openai/clip-vit-large-patch14', 'downloads': 24862339}, {'modelId': 'nesaorg/fc_8', 'downloads': 24508153}, {'modelId': 'FacebookAI/roberta-base', 'downloads': 22890463}, {'modelId': 'jonatasgrosman/wav2vec2-large-xlsr-53-english', 'downloads': 22543344}, {'modelId': 'openai/clip-vit-base-patch16', 'downloads': 21645671}, {'modelId': 'facebook/opt-1.3b', 'downloads': 18754658}, {'modelId': 'google/vit-base-patch16-224-in21k',

In [61]:
top_models

[{'modelId': 'sentence-transformers/all-mpnet-base-v2',
  'downloads': 428575767},
 {'modelId': 'sentence-transformers/all-MiniLM-L6-v2', 'downloads': 92113137},
 {'modelId': 'nesaorg/benchmark_v0', 'downloads': 84327724},
 {'modelId': 'google-bert/bert-base-uncased', 'downloads': 65405294},
 {'modelId': 'FacebookAI/xlm-roberta-large', 'downloads': 65395971},
 {'modelId': 'openai/whisper-large-v2', 'downloads': 44353532},
 {'modelId': 'Qwen/Qwen2.5-1.5B-Instruct', 'downloads': 37797318},
 {'modelId': 'openai/clip-vit-base-patch32', 'downloads': 25332456},
 {'modelId': 'openai/clip-vit-large-patch14', 'downloads': 24862339},
 {'modelId': 'nesaorg/fc_8', 'downloads': 24508153},
 {'modelId': 'FacebookAI/roberta-base', 'downloads': 22890463},
 {'modelId': 'jonatasgrosman/wav2vec2-large-xlsr-53-english',
  'downloads': 22543344},
 {'modelId': 'openai/clip-vit-base-patch16', 'downloads': 21645671},
 {'modelId': 'facebook/opt-1.3b', 'downloads': 18754658},
 {'modelId': 'google/vit-base-patch1

In [62]:
model_data = [
    {
        "modelId": model["modelId"],
        "downloads": model["downloads"],
        "organization": model["modelId"].split("/")[0] if "/" in model["modelId"] else "unknown"
    }
    for model in models
]

# Convert to DataFrame for easier grouping
df = pd.DataFrame(model_data)
print(len(df["organization"].unique()))

# Group by organization and keep the model with the most downloads
top_models = df.loc[df.groupby("organization")["downloads"].idxmax()]

377


In [63]:
len(top_models)
top_models

Unnamed: 0,modelId,downloads,organization
979,1-800-BAD-CODE/xlm-roberta_punctuation_fullsto...,118724,1-800-BAD-CODE
56,1231czx/llama3_it_ultra_list_and_bold500,5315764,1231czx
217,AdamCodd/vit-base-nsfw-detector,1283368,AdamCodd
99,Alibaba-NLP/gte-large-en-v1.5,2787700,Alibaba-NLP
284,Ashishkr/query_wellformedness_score,894812,Ashishkr
...,...,...,...
906,yisol/IDM-VTON,137997,yisol
192,yiyanghkust/finbert-tone,1494790,yiyanghkust
549,zake7749/gemma-2-2b-it-chinese-kyara-dpo,342814,zake7749
787,zer0int/CLIP-GmP-ViT-L-14,178380,zer0int


In [64]:
models_id = top_models["modelId"].tolist()
models_id

['1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase',
 '1231czx/llama3_it_ultra_list_and_bold500',
 'AdamCodd/vit-base-nsfw-detector',
 'Alibaba-NLP/gte-large-en-v1.5',
 'Ashishkr/query_wellformedness_score',
 'AutonLab/MOMENT-1-large',
 'BAAI/bge-small-en-v1.5',
 'Babelscape/wikineural-multilingual-ner',
 'Bettensor/podos_soccer_model',
 'BridgeTower/bridgetower-large-itm-mlm-itc',
 'ByteDance/Hyper-SD',
 'CIDAS/clipseg-rd64-refined',
 'Cloudy1225/stackoverflow-roberta-base-sentiment',
 'CofeAI/FLM-2-52B-Instruct-2407',
 'CompVis/stable-diffusion-safety-checker',
 'Danswer/intent-model',
 'DeepChem/ChemBERTa-77M-MTR',
 'DeepFloyd/t5-v1_1-xxl',
 'DeepMount00/Llama-3-8b-Ita',
 'DeepPavlov/rubert-base-cased',
 'EleutherAI/pythia-14m',
 'EmergentMethods/gliner_medium_news-v2.1',
 'Ericwang/tiny-random-ast',
 'Ertugrul/Qwen2-VL-7B-Captioner-Relaxed',
 'FacebookAI/xlm-roberta-large',
 'Falconsai/nsfw_image_detection',
 'FreedomIntelligence/HuatuoGPT-Vision-7B',
 'Gustavosta/MagicProm

In [65]:
# Retrieve model information through HfApi call
''' api = HfApi()
all_models = api.list_models(cardData=True)

model_ids_to_search = [
    'MIT/ast-finetuned-audioset-10-10-0.4593',
    'openai/clip-vit-large-patch14',
    'sentence-transformers/all-MiniLM-L6-v2',
    'google-bert/bert-base-uncased',
    'google/vit-base-patch16-224-in21k',
    'FacebookAI/roberta-base',
    'microsoft/layoutlmv2-base-uncased',
    'pyannote/segmentation-3.0',
    'facebook/opt-125m',
    'distilbert/distilbert-base-uncased',
    'timm/resnet50.a1_in1k',
    'nesaorg/benchmark_v0',
    'BAAI/bge-base-en-v1.5',
    'amazon/chronos-t5-tiny',
    'meta-llama/Llama-3.1-8B-Instruct',
    'stabilityai/sdxl-turbo',
    'Qwen/Qwen2-VL-72B-Instruct-AWQ',
    'google-t5/t5-small',
    'openai-community/gpt2',
    'mixedbread-ai/mxbai-embed-large-v1',
    'laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
    'owkin/phikon',
    'Helsinki-NLP/opus-mt-zh-en',
    'cardiffnlp/twitter-roberta-base-sentiment-latest',
    'textattack/bert-base-uncased-snli',
    'allenai/longformer-base-4096',
    'unslothai/1',
    'trl-internal-testing/dummy-GPT2-correct-vocab',
    'CompVis/stable-diffusion-v1-4',
    'sportstensor/basic_model',
    'cross-encoder/ms-marco-TinyBERT-L-2-v2',
    'Falconsai/nsfw_image_detection',
    'Salesforce/blip-image-captioning-large',
    'myshell-ai/MeloTTS-English',
    'CIDAS/clipseg-rd64-refined',
    'nvidia/speakerverification_en_titanet_large',
    'bigscience/bloomz-560m',
    'colbert-ir/colbertv2.0',
    'Alibaba-NLP/gte-large-en-v1.5',
    'almanach/camembert-base',
    'stable-diffusion-v1-5/stable-diffusion-v1-5',
    'mistralai/Mistral-7B-Instruct-v0.2',
    'Rostlab/prot_t5_xl_uniref50',
    
]

models_org = [
   model for model in all_models if model.id in model_ids_to_search]

models = [(idx, model)
          for idx, model in enumerate(models_org)]'''



" api = HfApi()\nall_models = api.list_models(cardData=True)\n\nmodel_ids_to_search = [\n    'MIT/ast-finetuned-audioset-10-10-0.4593',\n    'openai/clip-vit-large-patch14',\n    'sentence-transformers/all-MiniLM-L6-v2',\n    'google-bert/bert-base-uncased',\n    'google/vit-base-patch16-224-in21k',\n    'FacebookAI/roberta-base',\n    'microsoft/layoutlmv2-base-uncased',\n    'pyannote/segmentation-3.0',\n    'facebook/opt-125m',\n    'distilbert/distilbert-base-uncased',\n    'timm/resnet50.a1_in1k',\n    'nesaorg/benchmark_v0',\n    'BAAI/bge-base-en-v1.5',\n    'amazon/chronos-t5-tiny',\n    'meta-llama/Llama-3.1-8B-Instruct',\n    'stabilityai/sdxl-turbo',\n    'Qwen/Qwen2-VL-72B-Instruct-AWQ',\n    'google-t5/t5-small',\n    'openai-community/gpt2',\n    'mixedbread-ai/mxbai-embed-large-v1',\n    'laion/CLIP-ViT-B-16-laion2B-s34B-b88K',\n    'owkin/phikon',\n    'Helsinki-NLP/opus-mt-zh-en',\n    'cardiffnlp/twitter-roberta-base-sentiment-latest',\n    'textattack/bert-base-uncas

In [67]:
start = time.time()

# Process models sequentially without parallelization
models_information = []
for model_info in models_id:
    print(model_info)
    models_information.append(process_model(model_info))

models_information = [
    model for model in models_information if model is not None]
df = pd.DataFrame(models_information)
end = time.time()
print(end - start)

df.to_csv('HF_image-classification_models222.csv')

1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase


ValueError: too many values to unpack (expected 2)