# Scrape model info from Hugging Face

In [13]:
import requests
from bs4 import BeautifulSoup
import json

Output:
- List of categories
- List of models inside each category
- Model:
    + Name
    + ...

In [14]:
# Module-level accumulators that scrape_huggingface() fills in as it runs.
# `categories` collects each category name in the order it is scraped.
categories = [] # e.g. 'Multimodal', 'Computer Vision', 'Natural Language Processing', 'Audio', 'Tabular', 'Reinforcement Learning'
# `cat_map` maps a category name to the list of task dicts scraped for it.
cat_map = {}    #key: category, value: list of tasks


In [15]:
def custom_query(get_url, timeout=30):
    """Fetch ``get_url`` with an HTTP GET and return the decoded JSON body.

    Args:
        get_url: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The parsed JSON payload on success. If the request fails, the server
        answers with an error status, or the body is not valid JSON, the
        exception is printed and ``{'error': exc}`` is returned instead of
        raising, so callers can test ``'error' in result``.
    """
    try:
        r = requests.get(get_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than parsing an error page.
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older requests releases.
        print(e)
        return {'error': e}

In [16]:
#find task statistics
def find_task_details(tag):
    """Query the models listing for one pipeline tag and keep the basics.

    Args:
        tag: Pipeline tag appended verbatim to the ``pipeline_tag`` query
            parameter.

    Returns:
        ``{}`` when the query result contains an ``'error'`` key. Otherwise a
        dict with ``'numTotalItems'`` and, when the response has a
        ``'models'`` entry, a ``'models'`` list of ``{'id', 'downloads'}``
        dicts.
    """
    url = 'https://huggingface.co/models-json?sort=downloads&withCount=true&pipeline_tag=' + tag
    payload = custom_query(url)
    if 'error' in payload:
        return {}   # query failed; nothing to report

    # Keep only the fields needed downstream.
    summary = {'numTotalItems': payload['numTotalItems']}
    if 'models' in payload:
        summary['models'] = [
            {'id': entry['id'], 'downloads': entry['downloads']}
            for entry in payload['models']
        ]
    return summary

In [17]:
#scrape and export to json file
def scrape_huggingface(output_path='huggingface_modellist.json'):
    """Scrape the task groups from the Hugging Face models page and export them.

    For every ``div.mb-3`` group on the page that contains task links, the
    group's heading text is recorded in the module-level ``categories`` list
    and the group's tasks (tag, display name and the result of
    ``find_task_details``) are stored in the module-level ``cat_map``. The
    whole ``cat_map`` is then written to ``output_path`` as JSON.

    Args:
        output_path: File the JSON export is written to.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            the server answers with an error status.
    """
    list_url = 'https://huggingface.co/models?sort=trending'
    response = requests.get(list_url, timeout=30)
    # Fail loudly instead of parsing an error page and exporting nothing new.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'mb-3'}):
        a_tags = item.find_all('a', attrs={'class': 'mb-1 mr-1 md:mb-1.5 md:mr-1.5 rounded-lg'})
        if not a_tags:
            continue
        cat_tag = item.find('div', attrs={'class': 'mb-3 text-sm font-medium text-gray-500'})
        if cat_tag is None:
            # No heading to name the group by; skip it rather than crash.
            continue
        cat_name = cat_tag.text.strip().replace('\n', "").replace('\t', "")
        # Avoid duplicate entries when the function is run more than once.
        if cat_name not in categories:
            categories.append(cat_name)

        task_list = []
        for a_tag in a_tags:
            tag_name = a_tag['href'].replace('/models?pipeline_tag=', '')
            span = a_tag.find('span')
            task_list.append({
                'tag': tag_name,
                # Fall back to the tag when the link has no <span> label.
                'name': span.text if span is not None else tag_name,
                'task_details': find_task_details(tag_name)
            })
        cat_map[cat_name] = task_list

    # json.dump escapes quotes and emits null/true/false correctly; the
    # previous str(dict) + quote replacement produced invalid JSON whenever a
    # value contained an apostrophe or a non-string Python literal.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cat_map, f)
# Run the scrape now: this performs network requests and rewrites
# huggingface_modellist.json (an earlier run was noted as taking about 1m 40s).
scrape_huggingface()

In [18]:
def read_json_file(filepath):
    """Load and return the JSON document stored at ``filepath``.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (for the scraper's export, a dict mapping
        category names to task lists).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Honour the argument; the path used to be hard-coded and the parameter
    # was silently ignored.
    with open(filepath, 'r') as file:
        return json.load(file)
# Load the exported scrape results into the module-level `data` that
# show_data() reads.
data = read_json_file('huggingface_modellist.json')
#data

In [21]:
def sort(arr):
    """Return a new list with the items of ``arr`` in ascending ``'total'`` order.

    Each item must support ``item['total']``; the input is not modified.
    """
    def total_of(entry):
        return entry['total']

    ordered = list(arr)
    ordered.sort(key=total_of)  # list.sort is ascending and stable, like sorted()
    return ordered

In [None]:
#show data
def show_data(dataset=None):
    """Print per-category task rankings and each category's share of models.

    For every category the tasks are listed in descending order of
    ``numTotalItems``. Afterwards the grand total, the per-category totals
    and each category's percentage of the grand total (rounded to two
    decimals) are printed.

    Args:
        dataset: Mapping of category name to a list of task dicts, each with
            a ``'name'`` and a ``'task_details'`` dict holding
            ``'numTotalItems'``. Defaults to the module-level ``data``.

    Tasks whose ``task_details`` has no ``'numTotalItems'`` (which happens
    when ``find_task_details`` returned ``{}`` for a failed query) count as 0
    instead of raising ``KeyError``.
    """
    source = data if dataset is None else dataset

    def item_count(task):
        # Tolerate the empty dict stored for tasks whose query failed.
        return (task.get('task_details') or {}).get('numTotalItems', 0)

    total_models = 0
    total_models_map = {}  # key: cat, value: total models
    for cat, sub_cats in source.items():
        print(cat + ' ==========')
        ranked = sorted(sub_cats, key=item_count, reverse=True)
        for index, sub_cat in enumerate(ranked, start=1):
            print(str(index) + '. ' + sub_cat['name'].replace('-', ' ') + ' ' + str(item_count(sub_cat)))
        total_models_map[cat] = sum(item_count(sub_cat) for sub_cat in ranked)
        total_models += total_models_map[cat]

    # Summary: grand total, per-category totals, then percentages.
    print('----------')
    print(str(total_models))
    print(total_models_map)
    for cat, cat_total in total_models_map.items():
        # Guard against dividing by zero when every count is 0.
        share = (cat_total / total_models) * 100 if total_models else 0.0
        print(cat + ': ' + str(round(share, 2)))


# Print the report for the loaded `data`.
show_data()
#arrTotal = [7, 18, 13, 6, 3, 3]

1. Image Text to Text 7666
2. Any to Any 6669
3. Visual Question Answering 473
4. Document Question Answering 221
5. Video Text to Text 90
6. Audio Text to Text 56
7. Visual Document Retrieval 38
1. Text to Image 66719
2. Image Classification 16266
3. Object Detection 3169
4. Unconditional Image Generation 1890
5. Video Classification 1424
6. Image Segmentation 1253
7. Image to Image 962
8. Zero Shot Image Classification 889
9. Image to Text 776
10. Image Feature Extraction 597
11. Text to Video 416
12. Image to 3D 197
13. Depth Estimation 188
14. Mask Generation 184
15. Image to Video 135
16. Text to 3D 56
17. Keypoint Detection 56
18. Zero Shot Object Detection 48
1. Text Generation 199746
2. Text Classification 88572
3. Text2Text Generation 37097
4. Token Classification 21670
5. Fill Mask 14192
6. Question Answering 13061
7. Feature Extraction 11834
8. Sentence Similarity 9354
9. Translation 5446
10. Summarization 2253
11. Zero Shot Classification 381
12. Table Question Answering 14