In [9]:
import time
import pandas as pd
import json

from huggingface_hub import HfApi
from huggingface_hub import hf_hub_url, get_hf_file_metadata
from huggingface_hub.utils import GatedRepoError

api = HfApi()


def find_model_size(repo_files, modelId):
    """
    Find the size of a model given its ID, considering only files of the highest priority extension present.

    Args:
        modelId: The ID of the model.

    Returns:
        The size of the model or None if not found.
    """
    api_token = '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'
    priority_extensions = ['.bin', '.safetensors', '.pth', '.h5', '.gguf', '.pt', '.ckpt', '.npz', '.model', '.onnx',
                           '.mgspack', '.pkl', '.whisper', '.nemo', '.joblib', '.pb', '.npy', '.cleanrl_model',
                           '.data-00000-of-00001', '.weights', '.ptl', '.binary', 'CNN_model', '.msgpack', '.addon',
                           '.mdl', '.tgt', '.src', '.tar', '.zip', '.tar.xz']
    # Filter files by allowed extensions
    model_files = [file for file in repo_files if any(file.endswith(ext) for ext in priority_extensions)]
    if not model_files:
        print(f"No model files found for {modelId}")
        return None

    for ext in priority_extensions:
        files_with_ext = [file for file in model_files if file.endswith(ext)]
        if files_with_ext:
            try:
                # Sum the sizes of all files with this extension
                file_size = sum(get_hf_file_metadata(hf_hub_url(repo_id=modelId, filename=model_file), token=api_token).size for model_file in files_with_ext)
                return file_size
            except GatedRepoError:
                print(f'Need authorization to retrieve model size from {modelId}.')
                return None
            except Exception as e:
                print(f"Error retrieving file metadata for {modelId} with extension {ext}: {e}")
                return None  # Return None to indicate failure

    print(f"Could not retrieve model size for {modelId}")
    return None


def retrieve_model_datasets(model):
    """
    Retrieve the datasets used by a given model.

    Args:
        model: The model object.

    Returns:
        A list of datasets used by the model.
    """

    if model.card_data and 'datasets' in model.card_data:
        if type(model.cardData["datasets"]) is list:
            datasets = model.cardData["datasets"]
        else:
            datasets = model.cardData["datasets"]
    else:
        datasets = ['']

    return datasets


def find_datasets_size(datasets):
    """
    Find the size of datasets used by a given model.

    Args:
        datasets: A list of datasets.

    Returns:
        The total size of the datasets or None if not found.
    """

    datasets_size = 0
    if datasets is None:
        return None

    api_token = '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'

    for dataset in datasets:
        try:
            datasets_size += api.dataset_info(dataset,token=api_token).cardData["dataset_info"]["dataset_size"]
        except:
            pass

    return datasets_size


def api_calls_parameters(model):
    """
    Get size, datasets size, and creation date from API calls.

    Args:
        model: The model object.
        datasets: A list of datasets.

    Returns:
        A tuple containing size, datasets_size, and created_at.
    """

    commits = size = created_at = None
    api_token = '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'

    try:
        files = api.list_repo_files(repo_id=model.modelId, token=api_token)
    except GatedRepoError:
        print(
            f'Need authorization to retrieve files and commits from {model.modelId}')
        files = 'needs authorization'
    except Exception as e:
        print(
            f'Unexpected error on retrieving "files" for {model.modelId}:', str(e))
        files = None

    try:
        commits = api.list_repo_commits(repo_id=model.modelId, token=api_token)
        commits = [{**commit.__dict__, "created_at": commit.created_at.strftime(
            '%Y-%m-%dT%H:%M:%S.%fZ')[:-3]+'Z'} for commit in commits]
        created_at = commits[-1]['created_at']
        commits = json.dumps(commits)
    except GatedRepoError:
        commits = 'not authorized'
    except Exception as e:
        print(
            f'Unexpected error on retrieving "commits" for {model.modelId}:', str(e))
        commits = None

    return files, commits, size, created_at


def process_model(model_id):
    """
    Process a model ID and extract relevant information.

    Args:
        model_id: The ID of the model (string).

    Returns:
        A dictionary containing the processed model information.
    """
    datasets_size = None
    datasets = []
    files = commits = size = created_at = model_size = None

    try:
        # Retrieve model metadata
        model_info = api.model_info(repo_id=model_id, token='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')

        # Retrieve datasets
        datasets = retrieve_model_datasets(model_info)
        datasets_size = find_datasets_size(datasets)

        # Retrieve files, commits, size, and creation date
        files, commits, size, created_at = api_calls_parameters(model_info)

        # Calculate model size
        if files:
            model_size = find_model_size(files, model_id)
            print(f"Model {model_id}, Size: {model_size} bytes")
        else:
            print(f"Model {model_id}, Unable to retrieve repo files")

    except Exception as e:
        print(f'Error processing model {model_id}: {str(e)}')

    return {
        'modelId': model_id,
        'size': model_size,
        'datasets': datasets,
        'datasets_size': datasets_size,
        'downloads': model_info.downloads if 'model_info' in locals() else None,
        'likes': model_info.likes if 'model_info' in locals() else None,
        'library_name': model_info.library_name if 'model_info' in locals() else None,
        'created_at': created_at,
        'commits': commits,
        'files': files
    }


# Models extraction

In [2]:
import requests
import pandas as pd

# Your API token
api_token = "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"

url = "https://huggingface.co/api/models"

# Set headers with authentication token
headers = {
    "Authorization": f"Bearer {api_token}"
}

# Parameters for pagination
limit = 5000  # Maximum models per request
total_models = 5000  # Total models to fetch
offset = 0
models = []

# Loop to fetch models in batches
while len(models) < total_models:
    params = {
        "limit": limit,
        "offset": offset,
        "sort": "downloads",  # Sort by downloads
        "direction": -1,      # Descending order (-1 for descending)
    }
    response = requests.get(url, headers=headers, params=params)

    # Check for successful response
    if response.status_code == 200:
        model_data = response.json()
        models.extend(model_data)
        
        # Stop if fewer than requested models are returned (end of list)
        if len(model_data) < limit:
            break
    else:
        print(f"Failed to fetch data: {response.status_code} - {response.text}")
        break
    
    # Update offset for the next batch
    offset += limit

# Truncate the list to exactly 400 models if necessary
models = models[:total_models]

# Extract model IDs or other attributes (e.g., 'modelId', 'downloads') into an array
top_models = [{"modelId": model["modelId"], "downloads": model["downloads"]} for model in models]

# Output the array of top models
print(len(top_models))
print(top_models)

1000
[{'modelId': 'sentence-transformers/all-mpnet-base-v2', 'downloads': 296552424}, {'modelId': 'nesaorg/benchmark_v0', 'downloads': 118617403}, {'modelId': 'sentence-transformers/all-MiniLM-L6-v2', 'downloads': 99334920}, {'modelId': 'FacebookAI/xlm-roberta-large', 'downloads': 72928929}, {'modelId': 'google-bert/bert-base-uncased', 'downloads': 70401215}, {'modelId': 'Qwen/Qwen2.5-1.5B-Instruct', 'downloads': 43395422}, {'modelId': 'nesaorg/fc_8', 'downloads': 37152421}, {'modelId': 'microsoft/resnet-50', 'downloads': 31051253}, {'modelId': 'openai/clip-vit-large-patch14', 'downloads': 29787899}, {'modelId': 'nesaorg/fc_6', 'downloads': 24655433}, {'modelId': 'openai/clip-vit-base-patch32', 'downloads': 21801468}, {'modelId': 'jonatasgrosman/wav2vec2-large-xlsr-53-english', 'downloads': 21139889}, {'modelId': 'openai/clip-vit-large-patch14-336', 'downloads': 21096806}, {'modelId': 'nesaorg/fc_12', 'downloads': 20024791}, {'modelId': 'FacebookAI/roberta-base', 'downloads': 19326431}

In [3]:
top_models

[{'modelId': 'sentence-transformers/all-mpnet-base-v2',
  'downloads': 296552424},
 {'modelId': 'nesaorg/benchmark_v0', 'downloads': 118617403},
 {'modelId': 'sentence-transformers/all-MiniLM-L6-v2', 'downloads': 99334920},
 {'modelId': 'FacebookAI/xlm-roberta-large', 'downloads': 72928929},
 {'modelId': 'google-bert/bert-base-uncased', 'downloads': 70401215},
 {'modelId': 'Qwen/Qwen2.5-1.5B-Instruct', 'downloads': 43395422},
 {'modelId': 'nesaorg/fc_8', 'downloads': 37152421},
 {'modelId': 'microsoft/resnet-50', 'downloads': 31051253},
 {'modelId': 'openai/clip-vit-large-patch14', 'downloads': 29787899},
 {'modelId': 'nesaorg/fc_6', 'downloads': 24655433},
 {'modelId': 'openai/clip-vit-base-patch32', 'downloads': 21801468},
 {'modelId': 'jonatasgrosman/wav2vec2-large-xlsr-53-english',
  'downloads': 21139889},
 {'modelId': 'openai/clip-vit-large-patch14-336', 'downloads': 21096806},
 {'modelId': 'nesaorg/fc_12', 'downloads': 20024791},
 {'modelId': 'FacebookAI/roberta-base', 'download

In [4]:
model_data = [
    {
        "modelId": model["modelId"],
        "downloads": model["downloads"],
        "organization": model["modelId"].split("/")[0] if "/" in model["modelId"] else "unknown"
    }
    for model in models
]

# Convert to DataFrame for easier grouping
df = pd.DataFrame(model_data)
print(len(df["organization"].unique()))

# Group by organization and keep the model with the most downloads
top_models = df.loc[df.groupby("organization")["downloads"].idxmax()]

363


In [5]:
len(top_models)
top_models.to_csv('models_info.csv', index = False)

In [6]:
models_id = top_models["modelId"].tolist()
models_id

['1-800-BAD-CODE/punctuation_fullstop_truecase_english',
 'AdamCodd/vit-base-nsfw-detector',
 'Alibaba-NLP/gte-large-en-v1.5',
 'Ashishkr/query_wellformedness_score',
 'AutonLab/MOMENT-1-large',
 'BAAI/bge-small-en-v1.5',
 'Babelscape/wikineural-multilingual-ner',
 'BridgeTower/bridgetower-large-itm-mlm-itc',
 'ByteDance/Hyper-SD',
 'CIDAS/clipseg-rd64-refined',
 'Cloudy1225/stackoverflow-roberta-base-sentiment',
 'CompVis/stable-diffusion-safety-checker',
 'Danswer/intent-model',
 'DeepChem/ChemBERTa-77M-MTR',
 'DeepFloyd/t5-v1_1-xxl',
 'DeepPavlov/rubert-base-cased',
 'Dmyadav2001/Sentimental-Analysis',
 'Efficient-Large-Model/Llama-3-VILA1.5-8B-Fix',
 'EleutherAI/gpt-neo-2.7B',
 'EmergentMethods/gliner_medium_news-v2.1',
 'Ericwang/tiny-random-ast',
 'FacebookAI/xlm-roberta-large',
 'Falconsai/nsfw_image_detection',
 'Gustavosta/MagicPrompt-Stable-Diffusion',
 'Hate-speech-CNERG/tamil-codemixed-abusive-MuRIL',
 'Helsinki-NLP/opus-mt-fr-en',
 'HooshvareLab/bert-fa-base-uncased-ner-pe

Saving into a csv the information about the 363 models. The columns saved are modelId, size, datasets, dataset_size, downloads, likes, library_name, created_at, commits and files.

In [10]:
models_information = []
for model_id in models_id:
    print(f"Processing model: {model_id}")
    processed_model = process_model(model_id)
    if processed_model:
        models_information.append(processed_model)

df = pd.DataFrame(models_information)
df.to_csv('HF_topmodels.csv', index=False)
print(f"Processed {len(models_information)} models. Results saved to 'HF_topmodels.csv'.")

Processing model: 1-800-BAD-CODE/punctuation_fullstop_truecase_english
Model 1-800-BAD-CODE/punctuation_fullstop_truecase_english, Size: 587902 bytes
Processing model: AdamCodd/vit-base-nsfw-detector




Model AdamCodd/vit-base-nsfw-detector, Size: 344391328 bytes
Processing model: Alibaba-NLP/gte-large-en-v1.5
Model Alibaba-NLP/gte-large-en-v1.5, Size: 1736585680 bytes
Processing model: Ashishkr/query_wellformedness_score
Model Ashishkr/query_wellformedness_score, Size: 501040705 bytes
Processing model: AutonLab/MOMENT-1-large
Model AutonLab/MOMENT-1-large, Size: 1385575757 bytes
Processing model: BAAI/bge-small-en-v1.5
Model BAAI/bge-small-en-v1.5, Size: 133508397 bytes
Processing model: Babelscape/wikineural-multilingual-ner
Model Babelscape/wikineural-multilingual-ner, Size: 709170662 bytes
Processing model: BridgeTower/bridgetower-large-itm-mlm-itc
Model BridgeTower/bridgetower-large-itm-mlm-itc, Size: 3681345984 bytes
Processing model: ByteDance/Hyper-SD
Model ByteDance/Hyper-SD, Size: 27739238686 bytes
Processing model: CIDAS/clipseg-rd64-refined
Model CIDAS/clipseg-rd64-refined, Size: 603143713 bytes
Processing model: Cloudy1225/stackoverflow-roberta-base-sentiment
Model Cloudy