# Data Availability

In [72]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup
import time
import json
import os

from tags import * # tags.py

In [73]:
api = HfApi()

### LLMs

In [74]:
models = api.list_models(full=True)

In [75]:
model = itertools.islice(models, 0, 1000)
models_df = pd.DataFrame(model)
models_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,...,pipeline_tag,mask_token,card_data,widget_data,model_index,config,transformers_info,siblings,spaces,safetensors
0,albert/albert-base-v1,albert,082438ba120d36b97b9288772a41144e941705b9,2022-03-02 23:29:04+00:00,2024-02-19 10:57:35+00:00,False,False,,15692,8,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
1,albert/albert-base-v2,albert,8e2f239c5f8a2c0f253781ca60135db913e5c80c,2022-03-02 23:29:04+00:00,2024-02-19 10:58:14+00:00,False,False,,2405466,98,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
2,albert/albert-large-v1,albert,94fd741fb5d6cb5bc578fc154837016c583bafef,2022-03-02 23:29:04+00:00,2024-02-19 10:58:26+00:00,False,False,,1950,3,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
3,albert/albert-large-v2,albert,dfed3a5ef4499fb3351c4ebbcf487375d1e942c8,2022-03-02 23:29:04+00:00,2024-02-19 10:58:48+00:00,False,False,,10885,15,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
4,albert/albert-xlarge-v1,albert,ed6f87d14403b3c459a458fa6aa9dc5c51c517c1,2022-03-02 23:29:04+00:00,2024-02-19 11:01:28+00:00,False,False,,1378,4,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
5,albert/albert-xlarge-v2,albert,4fd2c2aa9aeb305f87704a7e595be7bfffa3db88,2022-03-02 23:29:04+00:00,2024-04-10 09:57:46+00:00,False,False,,3807,8,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
6,albert/albert-xxlarge-v1,albert,43129068ee5f6a481c148daeac11cc593b8ff440,2022-03-02 23:29:04+00:00,2024-02-19 11:01:42+00:00,False,False,,4795,5,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
7,albert/albert-xxlarge-v2,albert,97d3e58863d3a41dc581882f73b34d110b18f1f8,2022-03-02 23:29:04+00:00,2024-02-19 11:02:09+00:00,False,False,,12049,19,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
8,google-bert/bert-base-cased-finetuned-mrpc,google-bert,f150c1d609d1e50dd5e2e5408661cfac8339277c,2022-03-02 23:29:04+00:00,2024-02-19 11:03:21+00:00,False,False,,51720,1,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
9,google-bert/bert-base-cased,google-bert,cd5ef92a9fb2f889e972770a36d4ed042daf221e,2022-03-02 23:29:04+00:00,2024-02-19 11:02:26+00:00,False,False,,5654352,245,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,


In [76]:
print(models_df.columns)

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'library_name', 'tags',
       'pipeline_tag', 'mask_token', 'card_data', 'widget_data', 'model_index',
       'config', 'transformers_info', 'siblings', 'spaces', 'safetensors'],
      dtype='object')


In [77]:
models_df.loc[0]['tags']

['transformers',
 'pytorch',
 'tf',
 'safetensors',
 'albert',
 'fill-mask',
 'exbert',
 'en',
 'dataset:bookcorpus',
 'dataset:wikipedia',
 'arxiv:1909.11942',
 'license:apache-2.0',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [78]:
example_model = api.list_models(model_name='meta-llama/Meta-Llama-3.1-8B-Instruct')
example_df = pd.DataFrame(example_model)
example_df.loc[0]['tags']

['transformers',
 'safetensors',
 'llama',
 'text-generation',
 'facebook',
 'meta',
 'pytorch',
 'llama-3',
 'conversational',
 'en',
 'de',
 'fr',
 'it',
 'pt',
 'hi',
 'es',
 'th',
 'arxiv:2204.05149',
 'license:llama3.1',
 'autotrain_compatible',
 'text-generation-inference',
 'endpoints_compatible',
 'region:us']

In [79]:
# Scrape languages from HF

url_languages = 'https://huggingface.co/languages'

response = requests.get(url_languages)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]
print(tag_language)

tag_language.remove('jax') # 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?

['en', 'zh', 'fr', 'es', 'ru', 'de', 'ja', 'pt', 'ko', 'ar', 'it', 'vi', 'tr', 'hi', 'id', 'pl', 'nl', 'th', 'bn', 'fa', 'sv', 'cs', 'ro', 'fi', 'ca', 'ta', 'da', 'hu', 'uk', 'ind', 'el', 'te', 'ur', 'bg', 'he', 'ms', 'ml', 'sl', 'mr', 'sk', 'sw', 'et', 'eu', 'kn', 'gu', 'sr', 'no', 'hr', 'lt', 'lv', 'pa', 'is', 'yo', 'vie', 'am', 'af', 'ne', 'az', 'mt', 'gl', 'si', 'sq', 'ga', 'or', 'kk', 'tl', 'ceb', 'tha', 'cy', 'as', 'mk', 'hy', 'ka', 'ha', 'my', 'uz', 'eng', 'ig', 'eo', 'be', 'nb', 'km', 'mn', 'ky', 'la', 'zu', 'min', 'so', 'jav', 'xh', 'nn', 'ps', 'rw', 'jv', 'mya', 'yue', 'tt', 'br', 'bs', 'ckb', 'lg', 'sa', 'lo', 'wo', 'ku', 'ug', 'sd', 'ilo', 'ast', 'tw', 'sun', 'tg', 'lb', 'ace', 'nso', 'gd', 'war', 'fil', 'tgl', 'fy', 'oc', 'bug', 'tk', 'su', 'bjn', 'khm', 'sn', 'gn', 'ht', 'yi', 'mai', 'ba', 'bo', 'zlm', 'ban', 'fo', 'tn', 'dv', 'kab', 'ln', 'bm', 'ny', 'cv', 'shn', 'ti', 'aa', 'sat', 'mi', 'mg', 'lao', 'sah', 'arz', 'ee', 'mar', 'st', 'ia', 'pag', 'qu', 'hsb', 'ab', 'azb',

In [80]:
def extract_name(full_name):
    pattern = re.compile(r'[^/]+/(.+)')
    match = re.search(pattern, full_name)
    if match:
        return match.group(1) # the part after '/' might also contain version and number of parameters (impossible to extract in a uniform way)
    else:
        return full_name

def match_string(entries, pattern):
    pattern = re.compile(pattern)
    for entry in entries:
        match = pattern.match(entry)
        if match:
            return match.group(1)
    return None

def find_all_matches(entries, pattern):
    pattern = re.compile(pattern)
    matches = []
    for entry in entries:
        match = pattern.match(entry)
        if match:
            matches.append(match.group(1))
    return matches

def match_license(entries):
    return match_string(entries, r'license:(\S+)')

def match_dataset(entries):
    return find_all_matches(entries, r'dataset:(\S+)')

def match_uri(entries):
    uri = match_string(entries, r'arxiv:(\S+)')
    if uri is None:
        uri = match_string(entries, r'doi:(\S+)')
    return uri

In [94]:
# Fill attributes for a random model

# TODO: check None attributes

model_idx = 0

def extract_attributes(model_idx):
	model = models_df.loc[model_idx]
	model_tags = models_df.loc[model_idx]['tags']
	model_card_data = None
	try:
		model_card_data = next(api.list_models(model_name=model['id'], full=True, cardData=True)).card_data.to_dict()
	except AttributeError:
		print('No card data available for this model')
	model_attributes = dict()

	model_attributes['name'] = extract_name(model['id'])
	model_attributes['version'] = None # in model['id'] but impossible to extract in a uniform way
	model_attributes['numberOfParameters'] = None # sometimes in model['id'] -> difficult to extract, sometimes in model description on HF

	model_attributes['quantization'] = None
	for t in model_tags:
		if t in tag_quantization:
			model_attributes['quantization'] = t

	model_attributes['architecture'] = None
	try:
		if model_card_data is not None:
			model_attributes['architecture'] = model_card_data['base_model']
	except KeyError:
		print('No architecture data available for this model')

	model_attributes['languages'] = []
	for t in model_tags:
		if t in tag_language:
			model_attributes['languages'].append(t)

	model_attributes['modelCreator'] = None # TODO: if base_model exists, look for 'author' of the base model
	try:
		if model_card_data is not None:
			base_model = model_card_data['base_model']
			base_model_data = pd.DataFrame(api.list_models(model_name=base_model, full=True))
			model_attributes['modelCreator'] = base_model_data.loc[0]['author']
	except KeyError:
		print('No base model data available for this model')

	model_attributes['licenseToUse'] = match_license(model_tags)

	model_attributes['libraryFramework'] = [] # TODO: change type into list(str) in our model
	for t in model_tags:
		if t in tag_library:
			model_attributes['libraryFramework'].append(t)

	model_attributes['contextLength'] = None
	model_attributes['developers'] = [model['author']]
	model_attributes['openSource'] = True

	model_attributes['uri'] = match_uri(model_tags)

	model_attributes['fineTuned'] = None # if there is a 'base_model' in card_data, it is fine-tuned

	model_attributes['carbonEmission [CO2eq tons]'] = None
	try:
		if model_card_data is not None:
			model_attributes['carbonEmission [CO2eq tons]'] = model_card_data['co2_eq_emissions']
	except KeyError:
		print('No emission data available for this model')

	model_attributes['tokenizer'] = None

	return model_attributes

In [82]:
models_df.loc[model_idx]

id                                               albert/albert-base-v1
author                                                          albert
sha                           082438ba120d36b97b9288772a41144e941705b9
created_at                                   2022-03-02 23:29:04+00:00
last_modified                                2024-02-19 10:57:35+00:00
private                                                          False
gated                                                            False
disabled                                                          None
downloads                                                        15692
likes                                                                8
library_name                                              transformers
tags                 [transformers, pytorch, tf, safetensors, alber...
pipeline_tag                                                 fill-mask
mask_token                                                        None
card_d

In [83]:
model_tags

['transformers',
 'safetensors',
 'gguf',
 'llama',
 'text-generation',
 'llama-factory',
 'orpo',
 'conversational',
 'en',
 'zh',
 'base_model:meta-llama/Meta-Llama-3.1-8B-Instruct',
 'base_model:quantized:meta-llama/Meta-Llama-3.1-8B-Instruct',
 'doi:10.57967/hf/2779',
 'license:llama3.1',
 'autotrain_compatible',
 'text-generation-inference',
 'endpoints_compatible',
 'region:us']

In [84]:
for key, value in model_attributes.items():
	print(f"{'  '}{key}: {value}")

  name: Llama3.1-8B-Chinese-Chat
  version: None
  number of parameters: None
  quantization: None
  architecture: meta-llama/Meta-Llama-3.1-8B-Instruct
  language: ['en', 'zh']
  model creator: meta-llama
  developer: None
  license to use: llama3.1
  library: ['transformers', 'safetensors', 'gguf']
  context length: None
  open source: True
  uri: None
  fine-tuned: None
  carbon emission: None
  tokenizer: None


In [85]:
def model_to_json(model_idx):
    
    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')
    
    os.makedirs(result_path, exist_ok=True)
    
    model_attributes = extract_attributes(model_idx)
    
    with open(os.path.join(result_path, 'ChatIMPACT.LargeLanguageModel.json'), 'w', encoding='utf-8') as f:
        json.dump(model_attributes, f, indent=4)

In [88]:
def models_to_json(models_df):
    
    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')
    
    os.makedirs(result_path, exist_ok=True)

    output = []
    
    for model_idx in range(models_df.shape[0]):
        output.append(extract_attributes(model_idx))
    
    with open(os.path.join(result_path, 'ChatIMPACT.LargeLanguageModel.json'), 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)

In [95]:
models_to_json(models_df)

No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available f

Invalid model-index. Not loading eval results into CardData.


No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this mode

Invalid model-index. Not loading eval results into CardData.


No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this mode

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No architecture data available for this model
No base model data available for this model
No card data available for this model
No card data available for this model
No card data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No card data available for

In [86]:
for idx in range(3):
    model_to_json(idx)

No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model
No architecture data available for this model
No base model data available for this model
No emission data available for this model


In [16]:
# evaluate availability of attributes

availability = pd.DataFrame(columns=['id', 'entity name', 'attribute name', 'available API', 'available scraping'])

llm_attributes = ['name', 'version', 'number of parameters', 'quantization', 'architecture', 'language', 'model creator', 'license to use', 'library framework', 'context length', 'developer', 'open source', 'uri', 'fine-tuned', 'carbon emission', 'tokenizer']
llm_attributes_API_availability = [True, False, False, False, False, True, True, True, True, False, False, False, True, False, True, False]
availability['id'] = models_df['id']
availability['entity name'] = 'LLM'
availability = availability.loc[availability.index.repeat(len(llm_attributes))].reset_index(drop=True)
availability['attribute name'] = llm_attributes * len(models_df)
availability['available API'] = llm_attributes_API_availability * len(models_df)

In [17]:
availability.head(32)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,albert/albert-base-v1,LLM,name,True,
1,albert/albert-base-v1,LLM,version,False,
2,albert/albert-base-v1,LLM,number of parameters,False,
3,albert/albert-base-v1,LLM,quantization,False,
4,albert/albert-base-v1,LLM,architecture,False,
5,albert/albert-base-v1,LLM,language,True,
6,albert/albert-base-v1,LLM,model creator,True,
7,albert/albert-base-v1,LLM,license to use,True,
8,albert/albert-base-v1,LLM,library framework,True,
9,albert/albert-base-v1,LLM,context length,False,


## Dataset

In [56]:
datasets = api.list_datasets(full=True)

In [57]:
datasets = list(itertools.islice(datasets, 0, 1000))
datasets_df = pd.DataFrame(datasets)
datasets_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,paperswithcode_id,tags,card_data,siblings
0,amirveyseh/acronym_identification,amirveyseh,15ef643450d589d5883e289ffadeb03563e80a9e,2022-03-02 23:29:22+00:00,2024-01-09 11:39:57+00:00,False,False,False,115,19,acronym-identification,"[task_categories:token-classification, annotat...",{},
1,ade-benchmark-corpus/ade_corpus_v2,ade-benchmark-corpus,4ba01c71687dd7c996597042449448ea312126cf,2022-03-02 23:29:22+00:00,2024-01-09 11:42:58+00:00,False,False,False,451,25,,"[task_categories:text-classification, task_cat...",{},
2,UCLNLP/adversarial_qa,UCLNLP,c2d5f738db1ad21a4126a144dfbb00cb51e0a4a9,2022-03-02 23:29:22+00:00,2023-12-21 14:20:00+00:00,False,False,False,192,32,adversarialqa,"[task_categories:question-answering, task_ids:...",{},
3,Yale-LILY/aeslc,Yale-LILY,2305f2e63b68056f9b9037a3805c8c196e0d5581,2022-03-02 23:29:22+00:00,2024-01-09 11:49:13+00:00,False,False,False,82,12,aeslc,"[task_categories:summarization, annotations_cr...",{},
4,nwu-ctext/afrikaans_ner_corpus,nwu-ctext,445834a997dce8b40e1d108638064381de80c497,2022-03-02 23:29:22+00:00,2024-01-09 11:51:47+00:00,False,False,False,85,6,,"[task_categories:token-classification, task_id...",{},
5,fancyzhx/ag_news,fancyzhx,eb185aade064a813bc0b7f42de02595523103ca4,2022-03-02 23:29:22+00:00,2024-03-07 12:02:37+00:00,False,False,False,6511,122,ag-news,"[task_categories:text-classification, task_ids...",{},
6,allenai/ai2_arc,allenai,210d026faf9955653af8916fad021475a3f00453,2022-03-02 23:29:22+00:00,2023-12-21 15:09:48+00:00,False,False,False,571510,107,,"[task_categories:question-answering, task_ids:...",{},
7,google/air_dialogue,google,dbdbe7bcef8d344bc3c68a05600f3d95917d6898,2022-03-02 23:29:22+00:00,2024-03-07 15:22:15+00:00,False,False,False,44,15,,"[task_categories:text-generation, task_categor...",{},
8,komari6/ajgt_twitter_ar,komari6,af3f2fa5462ac461b696cb300d66e07ad366057f,2022-03-02 23:29:22+00:00,2024-01-09 11:58:01+00:00,False,False,False,84,3,,"[task_categories:text-classification, task_ids...",{},
9,legacy-datasets/allegro_reviews,legacy-datasets,71593d1379934286885c53d147bc863ffe830745,2022-03-02 23:29:22+00:00,2024-01-09 11:59:39+00:00,False,False,False,61,4,allegro-reviews,"[task_categories:text-classification, task_ids...",{},


In [58]:
datasets_df.columns

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'paperswithcode_id', 'tags',
       'card_data', 'siblings'],
      dtype='object')

In [59]:
datasets_df.loc[0]

id                                   amirveyseh/acronym_identification
author                                                      amirveyseh
sha                           15ef643450d589d5883e289ffadeb03563e80a9e
created_at                                   2022-03-02 23:29:22+00:00
last_modified                                2024-01-09 11:39:57+00:00
private                                                          False
gated                                                            False
disabled                                                         False
downloads                                                          115
likes                                                               19
paperswithcode_id                               acronym-identification
tags                 [task_categories:token-classification, annotat...
card_data                                                           {}
siblings                                                          None
Name: 

In [60]:
datasets_df.loc[2]['tags']

['task_categories:question-answering',
 'task_ids:extractive-qa',
 'task_ids:open-domain-qa',
 'annotations_creators:crowdsourced',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:cc-by-sa-4.0',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'arxiv:2002.00293',
 'arxiv:1606.05250',
 'region:us']

In [61]:
def match_language(entries):
    return find_all_matches(entries, r'language:(\S+)')

def match_size(entries):
    return match_string(entries, r'size_categories:(\S+)')

In [62]:
# Fill attributes for a random dataset

# TODO: check None attributes

dataset_idx = 0

dataset = datasets_df.loc[dataset_idx]
dataset_tags = datasets_df.loc[dataset_idx]['tags']
dataset_attributes = dict()

dataset_attributes['name'] = extract_name(dataset['id'])

dataset_attributes['language'] = match_language(dataset_tags)

dataset_attributes['dataset creator'] = dataset['author'] # TODO: add attribute in our model?

dataset_attributes['license to use'] = match_license(dataset_tags)

dataset_attributes['uri'] = match_uri(dataset_tags) # TODO: add multiple URIs when available?

dataset_attributes['fine-tuning'] = None

dataset_attributes['domain'] = []
for t in dataset_tags:
	if t in tag_domain:
		dataset_attributes['domain'].append(t)

dataset_attributes['size'] = match_size(dataset_tags) # TODO: how to deal with this?

In [63]:
dataset_tags

['task_categories:token-classification',
 'annotations_creators:expert-generated',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:mit',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'arxiv:2010.14678',
 'region:us',
 'acronym-identification']

In [64]:
for key, value in dataset_attributes.items():
	print(f"{'  '}{key}: {value}")

  name: acronym_identification
  language: ['en']
  dataset creator: amirveyseh
  license to use: mit
  uri: 2010.14678
  fine-tuning: None
  domain: []
  size: 10K<n<100K


## Downstream Task

In [27]:
# TODO: save converted datasets to JSON in a new directory

def fetch_and_extract_text(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        target_paragraph = soup.find('p', class_='text-[1.2rem] text-gray-500')
        
        if target_paragraph:
            return target_paragraph.get_text().strip()
        else:
            return "Target paragraph not found."
    else:
        return f"Failed to fetch the webpage. Status code: {response.status_code}"

def create_tasks_json():
    tasks_data = []

    for task in TAG_DOWNSTREAM_TASK:
        url = f"https://huggingface.co/tasks/{task}"
        description = fetch_and_extract_text(url)
        
        tasks_data.append({
            "name": task,
            "description": description, # TODO: text2text generation has no description
            "sub-task": []
        })
        
        print(f"Processed: {task}")
        # time.sleep(0.5)  # Be polite to the server

    with open('huggingface_tasks.json', 'w', encoding='utf-8') as f:
        json.dump(tasks_data, f, ensure_ascii=False, indent=2)

    print("JSON file 'huggingface_tasks.json' has been created.")


create_tasks_json()

Processed: text-classification
Processed: token-classification
Processed: table-question-answering
Processed: question-answering
Processed: zero-shot-classification
Processed: translation
Processed: summarization
Processed: feature-extraction
Processed: text-generation
Processed: text2text-generation
Processed: fill-mask
Processed: sentence-similarity
JSON file 'huggingface_tasks.json' has been created.


## Metrics

In [65]:
# Scrape metrics and descriptions from HF

url_metrics = 'https://huggingface.co/metrics'

response = requests.get(url_metrics)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

h4_tags = soup.find_all('h4')
metrics = [h4_tag.get_text(strip=True) for h4_tag in h4_tags]
# print(metrics)

p_tags = soup.find_all('p')
descriptions = [p_tag.get_text() for p_tag in p_tags]
descriptions = descriptions[2:] # drop first lines
# print(descriptions)

# remove from the list the metrics withoud description (not useful for our purpose)
metrics.remove('AlhitawiMohammed22/CER_Hu-Evaluation-Metrics')
metrics.remove('Aye10032/loss_metric')
metrics.remove('giulio98/code_eval_outputs')
metrics.remove('maysonma/lingo_judge_metric')
metrics.remove('lvwerra/test')
metrics.remove('sma2023/wil')


assert len(metrics) == len(descriptions)
print(len(metrics))

265


In [66]:
# from the lists, remove the descriptions and then the relative metric in the same index that have in the description 'TODO: add a description here\n\t\t\t\t\t\t'ArithmeticError

for i, description in enumerate(descriptions):
    if 'TODO: add a description here' in description:
        metrics.pop(i)
        descriptions.pop(i)

assert len(metrics) == len(descriptions)
print(len(metrics))

218


## Train relationship

In [119]:
model_idx = 96

model = models_df.loc[model_idx]
model_tags = models_df.loc[model_idx]['tags']

datasets = match_dataset(model_tags)
#datasets = [extract_name(dataset) for dataset in datasets]

train_relationship = dict()
for dataset in datasets:
    train_relationship = {
        "model_id": model['id'],
        "dataset_id": dataset,
    }

train_relationship

{'model': 'Babelscape/rebel-large', 'dataset': 'Babelscape/rebel-dataset'}

## Suited for relationship

In [122]:
model_idx = 12

model = models_df.loc[model_idx]
model_tags = models_df.loc[model_idx]['tags']
suited_for_relationship = dict()

for t in model_tags:
	if t in TAG_DOWNSTREAM_TASK:
		suited_for_relationship = {
        "model_id": model['id'],
        "task": t,
    }

suited_for_relationship

{'model_id': 'Ayham/bert_gpt2_summarization_xsum',
 'task': 'text2text-generation'}