# Scrape model info from Hugging Face

In [2]:
import requests
from bs4 import BeautifulSoup
import json

Output:
- List of categories
- List of models inside each category
- Model:
    + Name
    + ...

In [3]:
# Module-level containers that scrape_huggingface() fills in
categories = [] # category names, e.g. 'Multimodal', 'Computer Vision', 'Natural Language Processing', 'Audio', 'Tabular', 'Reinforcement Learning'
cat_map = {}    # key: category name, value: list of task dicts ('tag', 'name', 'task_details')


In [4]:
def custom_query(get_url, timeout=30):
    """Fetch ``get_url`` with an HTTP GET and return the decoded JSON body.

    Args:
        get_url: Full URL to request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so a stalled connection would
            otherwise block the notebook indefinitely.

    Returns:
        The parsed JSON payload on success. On any failure (network error,
        HTTP error status, body that is not valid JSON) the exception is
        printed and ``{'error': <exception>}`` is returned instead of raising.
    """
    try:
        r = requests.get(get_url, timeout=timeout)
        # Surface 4xx/5xx responses as errors instead of parsing an error page.
        r.raise_for_status()
        return r.json()
    except Exception as e:
        # Best-effort contract: callers check for the 'error' key.
        print(e)
        return {'error': e}

In [5]:
def find_task_details(tag):
    """Fetch listing statistics for one pipeline tag.

    Queries the ``models-json`` endpoint (sorted by downloads, total count
    included) through ``custom_query``.

    Args:
        tag: Pipeline tag to filter on, e.g. ``'audio-text-to-text'``.

    Returns:
        ``{}`` when the query failed or the payload does not have the expected
        shape. Otherwise a dict with ``'numTotalItems'`` and, when the payload
        contains a ``'models'`` list, ``'models'``: a list of
        ``{'id': ..., 'downloads': ...}`` dicts. ``downloads`` falls back to 0
        when a model entry omits it; entries without an ``'id'`` are skipped.
    """
    details = custom_query('https://huggingface.co/models-json?sort=downloads&withCount=true&pipeline_tag=' + tag)
    # custom_query signals failure with an 'error' key. A payload that is not
    # a dict or carries no count is handled the same way, so one odd response
    # does not abort the whole scrape with a KeyError/TypeError.
    if not isinstance(details, dict) or 'error' in details or 'numTotalItems' not in details:
        return {}

    simple_details = {'numTotalItems': details['numTotalItems']}
    if 'models' in details:
        # Keep only the fields used later on.
        simple_details['models'] = [
            {'id': model['id'], 'downloads': model.get('downloads', 0)}
            for model in details['models']
            if isinstance(model, dict) and 'id' in model
        ]
    return simple_details

In [6]:
def scrape_huggingface(output_path='huggingface_modellist.json', timeout=30):
    """Scrape task categories from the Hugging Face models page and save them as JSON.

    For every category block on the page, collects its task links, looks up
    each task's statistics with ``find_task_details`` and stores the result in
    the module-level ``categories`` list and ``cat_map`` dict. ``cat_map`` is
    then written to ``output_path``.

    Args:
        output_path: Destination JSON file.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            returns an HTTP error status (nothing is written in that case).
    """
    list_url = 'https://huggingface.co/models?sort=trending'
    response = requests.get(list_url, timeout=timeout)
    # Fail before touching the output file rather than overwriting it with
    # the empty result of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'mb-3'}):
        a_tags = item.find_all('a', attrs={'class': 'mb-1 mr-1 md:mb-1.5 md:mr-1.5 rounded-lg'})
        if not a_tags:
            continue
        cat_tag = item.find('div', attrs={'class': 'mb-3 text-sm font-medium text-gray-500'})
        if cat_tag is None:
            # Block with task links but no recognisable heading: skip it.
            continue
        cat_name = cat_tag.text.strip().replace('\n', "").replace('\t', "")
        # Re-running the cell must not duplicate category names.
        if cat_name not in categories:
            categories.append(cat_name)

        task_list = []
        for a_tag in a_tags:
            tag_name = a_tag['href'].replace('/models?pipeline_tag=', '')
            label = a_tag.find('span')
            task_list.append({
                'tag': tag_name,
                # Fall back to the tag when the link has no <span> label.
                'name': label.text if label is not None else tag_name,
                'task_details': find_task_details(tag_name)
            })
        cat_map[cat_name] = task_list

    # json.dump escapes quotes/apostrophes and encodes None/True/False
    # correctly; rewriting repr() quotes by hand produced invalid JSON for
    # such values.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cat_map, f)


In [16]:
def read_json_file(filepath):
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content, as produced by ``json.load``.
    """
    # Open the path that was passed in; the file name used to be hard-coded
    # here, so the argument had no effect.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)
# Load the scraped model list from disk and display it
data = read_json_file('huggingface_modellist.json')
data

{'Multimodal': [{'tag': 'audio-text-to-text',
   'name': 'Audio-Text-to-Text',
   'task_details': {'numTotalItems': 57,
    'models': [{'id': 'Qwen/Qwen2-Audio-7B-Instruct', 'downloads': 150250},
     {'id': 'fixie-ai/ultravox-v0_3', 'downloads': 109556},
     {'id': 'fixie-ai/ultravox-v0_5-llama-3_2-1b', 'downloads': 103235},
     {'id': 'fixie-ai/ultravox-v0_4', 'downloads': 44104},
     {'id': 'Qwen/Qwen2-Audio-7B', 'downloads': 15864},
     {'id': 'NexaAIDev/Qwen2-Audio-7B-GGUF', 'downloads': 8301},
     {'id': 'fixie-ai/ultravox-v0_5-llama-3_3-70b', 'downloads': 7640},
     {'id': 'fixie-ai/ultravox-v0_4_1-llama-3_1-8b', 'downloads': 6441},
     {'id': 'fixie-ai/ultravox-v0_5-llama-3_1-8b', 'downloads': 6063},
     {'id': 'fixie-ai/ultravox-v0_4_1-mistral-nemo', 'downloads': 1460},
     {'id': 'sarvamai/shuka-1', 'downloads': 949},
     {'id': 'stepfun-ai/Step-Audio-Chat', 'downloads': 918},
     {'id': 'fixie-ai/ultravox-v0_4_1-llama-3_1-70b', 'downloads': 804},
     {'id': 'mrad

In [32]:
def show_data(model_data=None):
    """Print the total model count and each category's share of it.

    Args:
        model_data: Mapping of category name -> list of task records, each
            holding a ``'task_details'`` dict with a ``'numTotalItems'``
            count. Defaults to the notebook-level ``data`` variable.

    Prints the grand total, the dict of per-category totals, then one
    ``<category>: <percentage>`` line per category (rounded to 2 decimals).
    """
    if model_data is None:
        model_data = data

    total_models_map = {}  # key: category, value: total models
    for cat, sub_cats in model_data.items():
        # A task whose lookup failed has an empty 'task_details' dict;
        # count it as 0 instead of raising KeyError.
        total_models_map[cat] = sum(
            (sub_cat.get('task_details') or {}).get('numTotalItems', 0)
            for sub_cat in sub_cats
        )
    total_models = sum(total_models_map.values())

    print(str(total_models))
    print(total_models_map)
    for cat, cat_total in total_models_map.items():
        # Avoid ZeroDivisionError when nothing was counted at all.
        share = (cat_total / total_models) * 100 if total_models else 0.0
        print(cat + ': ' + str(round(share, 2)))

# Print the per-category totals and percentage shares for the loaded data
show_data()

602987
{'Multimodal': 14986, 'Computer Vision': 94492, 'Natural Language Processing': 401876, 'Audio': 34551, 'Tabular': 540, 'Reinforcement Learning': 56482, 'Other': 60}
Multimodal: 2.49
Computer Vision: 15.67
Natural Language Processing: 66.65
Audio: 5.73
Tabular: 0.09
Reinforcement Learning: 9.37
Other: 0.01
