# Scrape model information from Hugging Face

In [1]:
import json
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup

Output:
- List of categories
- List of models inside each category
- Model:
    + Name (Hub id, e.g. `owner/model-name`)
    + Download count

In [2]:
# Shared notebook state; both containers are filled in by scrape_huggingface().
# Category names in the order they are found on the page, e.g.
# 'Multimodal', 'Computer Vision', 'Natural Language Processing', 'Audio', 'Tabular', 'Reinforcement Learning'.
categories = list()
# Maps a category name to the list of task entries scraped for that category.
cat_map = dict()


In [3]:
def custom_query(get_url, timeout=30):
    """Fetch ``get_url`` and return its decoded JSON body.

    Args:
        get_url: Absolute URL of a JSON endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON payload on success. On a network failure, an HTTP
        error status, or a body that is not valid JSON, the error is printed
        and ``{'error': <exception>}`` is returned instead of raising.
    """
    try:
        r = requests.get(get_url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of trying to decode
        # whatever error body the server sent back.
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        print(e)
        return {'error': e}

In [4]:
#find task statistics
def find_task_details(tag):
    """Summarise the models listed on the Hub under one pipeline tag.

    Args:
        tag: Pipeline tag, appended as the ``pipeline_tag`` query value.

    Returns:
        ``{}`` when the query failed or did not return a JSON object.
        Otherwise a dict with ``'numTotalItems'`` (0 if the response omits it)
        and, when the response contains ``'models'``, a ``'models'`` list of
        ``{'id', 'downloads'}`` dicts in response order (``downloads`` is 0
        when a model entry omits it).
    """
    details = custom_query('https://huggingface.co/models-json?sort=downloads&withCount=true&pipeline_tag=' + tag)
    # custom_query reports failures as {'error': ...}; anything that is not a
    # JSON object cannot be summarised either.
    if not isinstance(details, dict) or 'error' in details:
        return {}
    # Keep only the fields the report needs; tolerate missing keys so a single
    # sparse response does not abort a long scrape.
    simple_details = {'numTotalItems': details.get('numTotalItems', 0)}
    if 'models' in details:
        simple_details['models'] = [
            {'id': model['id'], 'downloads': model.get('downloads', 0)}
            for model in details['models']
        ]
    return simple_details

In [5]:
#scrape and export to json file
def scrape_huggingface(output_path='huggingface_modellist.json', timeout=30):
    """Scrape the Hub's task catalogue and save it, with per-task stats, as JSON.

    Downloads the model listing page, finds every category block together
    with its task links, looks up each task through ``find_task_details`` and
    records the result in the notebook-level ``categories`` list and
    ``cat_map`` dict. ``cat_map`` is then written to ``output_path``.

    A full run was noted by the original author to take about 1m 40s.

    Args:
        output_path: Destination of the JSON snapshot.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            answers with an HTTP error status.
        OSError: If ``output_path`` cannot be written.
    """
    list_url = 'https://huggingface.co/models?sort=trending'
    response = requests.get(list_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # overwriting a good snapshot with an empty one.
    response.raise_for_status()

    # Start from a clean slate so re-running the cell does not append the
    # same categories a second time.
    categories.clear()
    cat_map.clear()

    soup = BeautifulSoup(response.content, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'mb-3'}):
        a_tags = item.find_all('a', attrs={'class': 'mb-1 mr-1 md:mb-1.5 md:mr-1.5 rounded-lg'})
        cat_tag = item.find('div', attrs={'class': 'mb-3 text-sm font-medium text-gray-500'})
        # Only blocks that hold both a heading and task links describe a category.
        if not a_tags or cat_tag is None:
            continue

        cat_name = cat_tag.text.strip().replace('\n', "").replace('\t', "")
        categories.append(cat_name)

        task_list = []
        for a_tag in a_tags:
            href = a_tag['href']
            # Read the tag from the parsed query string so any additional
            # parameters in the link do not end up inside the tag itself.
            tag_values = parse_qs(urlsplit(href).query).get('pipeline_tag')
            tag_name = tag_values[0] if tag_values else href.replace('/models?pipeline_tag=', '')
            task_details = find_task_details(tag_name)
            span = a_tag.find('span')
            task_list.append({
                'tag': tag_name,
                # Fall back to the link text when the label is not wrapped in a <span>.
                'name': span.text if span is not None else a_tag.text.strip(),
                'task_details': task_details
            })
        cat_map[cat_name] = task_list

    # json.dump escapes quotes and emits true/false/null correctly; rewriting
    # str(dict) by swapping quote characters yields invalid JSON as soon as a
    # value contains an apostrophe, a double quote, a bool or None.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cat_map, f)
#test
#scrape_huggingface()

In [6]:
def read_json_file(filepath):
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Honour the caller's path rather than a fixed file name.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)
# Load the saved snapshot so the reporting cells below run without re-scraping.
data = read_json_file(filepath='huggingface_modellist.json')

In [7]:
def sort(arr):
    """Return a new list with the items of ``arr`` ordered by their ``'total'`` value, smallest first."""
    def total_of(entry):
        return entry['total']

    ordered = list(arr)
    # list.sort is stable, so entries with equal totals keep their input order.
    ordered.sort(key=total_of)
    return ordered

In [None]:
#show data
def show_data(dataset=None, top_n=10):
    """Print a per-category report of tasks, their top models and category shares.

    For every category the tasks are listed by descending model count, each
    followed by its ``top_n`` most-downloaded models. A summary with the grand
    total, the per-category totals and each category's percentage share is
    printed at the end.

    Args:
        dataset: Mapping of category name to a list of task entries (each with
            ``'name'`` and ``'task_details'``). Defaults to the notebook-level
            ``data`` snapshot.
        top_n: Maximum number of models printed per task.
    """
    if dataset is None:
        dataset = data

    def details_of(sub_cat):
        # A failed lookup is stored as an empty dict, so every field is optional.
        return sub_cat.get('task_details') or {}

    def task_total(sub_cat):
        return details_of(sub_cat).get('numTotalItems', 0)

    total_models = 0
    total_models_map = {}  # key: category, value: total models
    for cat, sub_cats in dataset.items():
        print(cat + ' ==========')
        total_models_map[cat] = 0
        ranked_sub_cats = sorted(sub_cats, key=task_total, reverse=True)
        for index, sub_cat in enumerate(ranked_sub_cats, start=1):
            count = task_total(sub_cat)
            total_models += count
            total_models_map[cat] += count
            print(str(index) + '. ' + sub_cat['name'].replace('-', ' ') + ' ' + str(count))
            # Most-downloaded models first; tasks without a model list print nothing.
            ranked_models = sorted(
                details_of(sub_cat).get('models', []),
                key=lambda model: model.get('downloads', 0),
                reverse=True,
            )
            for model in ranked_models[:top_n]:
                print(model)

    print('----------')
    print(str(total_models))
    print(total_models_map)
    for cat, count in total_models_map.items():
        # Guard the share computation: an empty snapshot has no models to divide by.
        share = (count / total_models) * 100 if total_models else 0.0
        print(cat + ': ' + str(round(share, 2)))

# Print the report for the snapshot loaded into `data` above.
show_data()
#arrTotal = [7, 18, 13, 6, 3, 3]

1. Image Text to Text 7666
{'id': 'Qwen/Qwen2.5-VL-7B-Instruct', 'downloads': 3186856}
{'id': 'microsoft/Florence-2-large', 'downloads': 2944818}
{'id': 'meta-llama/Llama-3.2-11B-Vision-Instruct', 'downloads': 1427475}
{'id': 'Qwen/Qwen2.5-VL-3B-Instruct', 'downloads': 1217869}
{'id': 'Qwen/Qwen2-VL-7B-Instruct', 'downloads': 1143874}
{'id': 'llava-hf/llava-onevision-qwen2-0.5b-ov-hf', 'downloads': 1097361}
{'id': 'liuhaotian/llava-v1.5-7b', 'downloads': 923306}
{'id': 'Qwen/Qwen2-VL-2B-Instruct', 'downloads': 908564}
{'id': 'llava-hf/llava-1.5-7b-hf', 'downloads': 745769}
{'id': 'google/gemma-3-27b-it', 'downloads': 615940}
2. Any to Any 6669
{'id': 'openbmb/MiniCPM-o-2_6', 'downloads': 751813}
{'id': 'deepseek-ai/Janus-Pro-7B', 'downloads': 246601}
{'id': 'deepseek-ai/Janus-Pro-1B', 'downloads': 74218}
{'id': 'openbmb/MiniCPM-o-2_6-int4', 'downloads': 29277}
{'id': 'openbmb/MiniCPM-o-2_6-gguf', 'downloads': 17398}
{'id': 'deepseek-ai/Janus-1.3B', 'downloads': 11480}
{'id': 'VARGPT-fa