# Data Availability

In [159]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup

from tags import * # tags.py

In [160]:
# Hugging Face Hub client; the listing and lookup calls in the cells below go through it.
api = HfApi()

## LLMs

In [161]:
# Listing of the models on the Hub, requested with full=True.
models = api.list_models(full=True)

In [162]:
# Load a fixed window of the Hub model listing into a DataFrame.
#
# The listing is requested again here rather than slicing the shared `models`
# iterator: islice consumes its source, so re-running this cell on the same
# iterator silently produced a different window of models each time.
# The window is also no longer bound to `model`, a name that a later cell uses
# for a single DataFrame row.
MODEL_SLICE_START = 1000
MODEL_SLICE_STOP = 2000

models_window = list(
    itertools.islice(api.list_models(full=True), MODEL_SLICE_START, MODEL_SLICE_STOP)
)
models_df = pd.DataFrame(models_window)
models_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,...,pipeline_tag,mask_token,card_data,widget_data,model_index,config,transformers_info,siblings,spaces,safetensors
0,Aybars/ModelOnWhole,Aybars,fd356c873f4480da1d7521f3a88cba0bf60f009b,2022-03-02 23:29:04+00:00,2022-02-14 06:33:52+00:00,False,False,,18,0,...,question-answering,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
1,Aybars/XLM_Turkish,Aybars,15910d0ddbddc7465d4dbb769d9b2c077eb79c35,2022-03-02 23:29:04+00:00,2022-02-15 10:31:35+00:00,False,False,,6,0,...,question-answering,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
2,Ayham/albert_bert_summarization_cnn_dailymail,Ayham,fd80e2dd89e9856d9e9fbc9d462f61467ca3571f,2022-03-02 23:29:04+00:00,2022-02-25 11:32:57+00:00,False,False,,41,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
3,Ayham/albert_distilgpt2_summarization_cnn_dail...,Ayham,94829c8517eae5d582c588590d84d0fc841f0fc9,2022-03-02 23:29:04+00:00,2022-02-02 23:15:10+00:00,False,False,,18,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
4,Ayham/albert_gpt2_Full_summarization_cnndm,Ayham,fc1d4e38606ca19d737aeb46db2a5e5bbb51310d,2022-03-02 23:29:04+00:00,2022-01-03 23:42:44+00:00,False,False,,35,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
5,Ayham/albert_gpt2_summarization_cnndm,Ayham,6913e216cabf073f758ceb6839eeff23f3e29e20,2022-03-02 23:29:04+00:00,2021-12-23 01:36:49+00:00,False,False,,11,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
6,Ayham/albert_gpt2_summarization_xsum,Ayham,758cf02b0c2f13647301713dd42606f4ced4e1cc,2022-03-02 23:29:04+00:00,2021-12-21 21:20:14+00:00,False,False,,10,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
7,Ayham/albert_roberta_summarization_cnn_dailymail,Ayham,ffbda6f1317f83f3bcfd5e97e03705700e85d00d,2022-03-02 23:29:04+00:00,2022-03-01 01:54:22+00:00,False,False,,13,0,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
8,Ayham/bert_bert_summarization_cnn_dailymail,Ayham,2a4e0f292c17c6c2da5f92313e7d7698d2b3ddf5,2022-03-02 23:29:04+00:00,2022-02-21 08:57:52+00:00,False,False,,14,1,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
9,Ayham/bert_distilgpt2_summarization_cnn_dailymail,Ayham,68ec945266361649f2e0b3c7e47a70809db90eca,2022-03-02 23:29:04+00:00,2022-02-03 13:33:41+00:00,False,False,,11,1,...,text2text-generation,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,


In [163]:
# List the fields available for every model record.
print(models_df.columns)

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'library_name', 'tags',
       'pipeline_tag', 'mask_token', 'card_data', 'widget_data', 'model_index',
       'config', 'transformers_info', 'siblings', 'spaces', 'safetensors'],
      dtype='object')


In [164]:
# Tags of the first model in the window (row label 0).
models_df.loc[0, 'tags']

['transformers',
 'pytorch',
 'bert',
 'question-answering',
 'endpoints_compatible',
 'region:us']

In [165]:
# Search the Hub for a well-known model and show the tags of the first result,
# as a richer example of what the `tags` field can contain.
example_model = api.list_models(model_name='meta-llama/Meta-Llama-3.1-8B-Instruct')
example_df = pd.DataFrame(example_model)
example_df.loc[0, 'tags']

['transformers',
 'safetensors',
 'llama',
 'text-generation',
 'facebook',
 'meta',
 'pytorch',
 'llama-3',
 'conversational',
 'en',
 'de',
 'fr',
 'it',
 'pt',
 'hi',
 'es',
 'th',
 'arxiv:2204.05149',
 'license:llama3.1',
 'autotrain_compatible',
 'text-generation-inference',
 'endpoints_compatible',
 'region:us']

In [166]:
# Scrape languages from HF

url_languages = 'https://huggingface.co/languages'

# Bound the request time and stop on an HTTP error, so an error page is never
# parsed as if it were the language table.
response = requests.get(url_languages, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

# The language codes are read from the text of the page's <code> elements.
code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]
print(tag_language)

# 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?
# Filtering instead of list.remove keeps the cell from raising ValueError if the page stops listing 'jax'.
tag_language = [language_code for language_code in tag_language if language_code != 'jax']

['en', 'zh', 'fr', 'es', 'ru', 'de', 'ja', 'pt', 'ko', 'ar', 'it', 'vi', 'tr', 'hi', 'id', 'pl', 'nl', 'th', 'bn', 'fa', 'sv', 'cs', 'ro', 'fi', 'ca', 'ta', 'da', 'hu', 'uk', 'ind', 'el', 'te', 'ur', 'bg', 'he', 'ms', 'sl', 'ml', 'mr', 'sk', 'sw', 'et', 'kn', 'eu', 'gu', 'sr', 'no', 'hr', 'lt', 'lv', 'pa', 'is', 'yo', 'vie', 'am', 'ne', 'af', 'az', 'mt', 'si', 'ga', 'gl', 'sq', 'or', 'kk', 'tl', 'ceb', 'tha', 'as', 'cy', 'hy', 'mk', 'ka', 'ha', 'my', 'uz', 'eng', 'ig', 'eo', 'be', 'nb', 'km', 'mn', 'ky', 'la', 'zu', 'min', 'jav', 'so', 'xh', 'nn', 'ps', 'rw', 'jv', 'mya', 'yue', 'tt', 'br', 'bs', 'sa', 'ckb', 'lg', 'lo', 'wo', 'ku', 'ug', 'sd', 'ilo', 'ast', 'sun', 'tw', 'tg', 'ace', 'lb', 'nso', 'gd', 'war', 'fil', 'bug', 'su', 'tgl', 'fy', 'bjn', 'khm', 'sn', 'tk', 'oc', 'gn', 'ht', 'yi', 'mai', 'bo', 'zlm', 'ban', 'ba', 'fo', 'tn', 'dv', 'ln', 'bm', 'kab', 'ny', 'shn', 'cv', 'mi', 'aa', 'mg', 'sat', 'ti', 'lao', 'arz', 'ee', 'mar', 'sah', 'st', 'pag', 'ia', 'qu', 'azb', 'hsb', 'vec'

In [167]:
def extract_name(full_name):
    """Return the repository name part of a Hub id.

    Parameters
    ----------
    full_name : str
        Repository id, normally of the form ``'namespace/name'``.

    Returns
    -------
    str
        Everything after the first ``'/'``. This part may still embed a version
        or a parameter count; those cannot be extracted in a uniform way.
        An id without a namespace (e.g. ``'gpt2'``) is returned unchanged;
        previously it raised ``AttributeError`` because the pattern did not match.
    """
    match = re.search(r'[^/]+/(.+)', full_name)
    if match is None:
        return full_name
    return match.group(1)

def match_string(entries, pattern):
    """Return capture group 1 of the first entry matching `pattern`, or None.

    `pattern` is anchored at the start of each entry (``re.match`` semantics).
    Entries are tried in order and scanning stops at the first match.
    """
    compiled = re.compile(pattern)
    hits = (compiled.match(entry) for entry in entries)
    return next((hit.group(1) for hit in hits if hit), None)

def match_license(entries):
    """Return the value of the first 'license:<value>' tag in `entries`, or None."""
    license_pattern = r'license:(\S+)'
    return match_string(entries, license_pattern)

def match_dataset(entries):
    """Return the value of the first 'dataset:<value>' tag in `entries`, or None."""
    dataset_pattern = r'dataset:(\S+)'
    return match_string(entries, dataset_pattern)

def match_uri(entries):
    """Return the identifier of the first 'arxiv:<id>' tag in `entries`, or None."""
    arxiv_pattern = r'arxiv:(\S+)'
    return match_string(entries, arxiv_pattern)

In [173]:
# Fill attributes for a random model

# TODO: check None attributes

model_idx = 0


model = models_df.loc[model_idx]
model_tags = model['tags']
model_attributes = dict()

model_attributes['name'] = extract_name(model['id'])
model_attributes['version'] = None # in model['id'] but impossible to extract in a uniform way
model_attributes['number of parameters'] = None # sometimes in model['id'] -> difficult to extract, sometimes in model description on HF

# When several quantization tags are present the last one wins.
quantization_tags = [t for t in model_tags if t in tag_quantization]
model_attributes['quantization'] = quantization_tags[-1] if quantization_tags else None

model_attributes['architecture'] = None

model_attributes['language'] = [t for t in model_tags if t in tag_language]

model_attributes['model creator'] = None # TODO: same as model author?
model_attributes['developer'] = model['author']

model_attributes['license to use'] = match_license(model_tags)

model_attributes['library'] = [t for t in model_tags if t in tag_library] # TODO: change type into list(str) in our model

model_attributes['context length'] = None
model_attributes['open source'] = None

model_attributes['uri'] = match_uri(model_tags)

model_attributes['fine-tuned'] = None

model_attributes['carbon emission'] = None
# Fetch the card of this exact repository. list_models(model_name=...) is a
# name search: its first hit can be a different model, and next() on an empty
# result raises StopIteration, which the previous except clauses did not catch.
model_card = api.model_info(model['id']).card_data
if model_card is None:
    print('No card data available for this model')
else:
    model_card_data = model_card.to_dict()
    if 'co2_eq_emissions' in model_card_data:
        model_attributes['carbon emission'] = model_card_data['co2_eq_emissions']
    else:
        print('No emission data available for this model')

model_attributes['tokenizer'] = None

No card data available for this model
No card data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model
No emission data available for this model


In [16]:
# Raw record of the model selected by model_idx.
models_df.loc[model_idx]

id                                         google-bert/bert-base-cased
author                                                     google-bert
sha                           cd5ef92a9fb2f889e972770a36d4ed042daf221e
created_at                                   2022-03-02 23:29:04+00:00
last_modified                                2024-02-19 11:02:26+00:00
private                                                          False
gated                                                            False
disabled                                                          None
downloads                                                      5668211
likes                                                              245
library_name                                              transformers
tags                 [transformers, pytorch, tf, jax, safetensors, ...
pipeline_tag                                                 fill-mask
mask_token                                                        None
card_d

In [14]:
# Tags of the selected model; several model_attributes entries are derived from them.
model_tags

['transformers',
 'pytorch',
 'tf',
 'jax',
 'safetensors',
 'bert',
 'fill-mask',
 'exbert',
 'en',
 'dataset:bookcorpus',
 'dataset:wikipedia',
 'arxiv:1810.04805',
 'license:apache-2.0',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [15]:
# Print the collected model attributes, one indented "key: value" line each.
for key in model_attributes:
    value = model_attributes[key]
    print(f"{'  '}{key}: {value}")

  name: bert-base-cased
  version: None
  number of parameters: None
  quantization: None
  architecture: None
  language: ['en']
  model creator: None
  developer: google-bert
  license to use: apache-2.0
  library: ['transformers', 'pytorch', 'tf', 'jax', 'safetensors']
  context length: None
  open source: None
  uri: 1810.04805
  fine-tuned: None
  carbon emission: False
  tokenizer: None


In [16]:
# Build the long-format availability table: one row per (model, LLM attribute).
availability = pd.DataFrame(columns=['id', 'entity name', 'attribute name', 'available API', 'available scraping'])

# Attribute name -> flag stored in the 'available API' column. Keeping name and
# flag in one mapping means the two derived lists cannot drift out of alignment.
# NOTE(review): the attribute-filling cell reads 'developer' from the API and
# leaves 'model creator' as None, and uses the key 'library' rather than
# 'library framework' -- confirm the flags and names below are the intended ones.
llm_attribute_api_flags = {
    'name': True,
    'version': False,
    'number of parameters': False,
    'quantization': False,
    'architecture': False,
    'language': True,
    'model creator': True,
    'license to use': True,
    'library framework': True,
    'context length': False,
    'developer': False,
    'open source': False,
    'uri': True,
    'fine-tuned': False,
    'carbon emission': True,
    'tokenizer': False,
}
llm_attributes = list(llm_attribute_api_flags)
llm_attributes_API_availability = list(llm_attribute_api_flags.values())

availability['id'] = models_df['id']
availability['entity name'] = 'LLM'

# Repeat every model row once per attribute, then tile the attribute columns to match.
rows_per_model = len(llm_attributes)
repeated_index = availability.index.repeat(rows_per_model)
availability = availability.loc[repeated_index].reset_index(drop=True)

model_count = len(models_df)
availability['attribute name'] = llm_attributes * model_count
availability['available API'] = llm_attributes_API_availability * model_count

In [17]:
# Preview the first 32 rows of the availability table.
availability.head(32)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,albert/albert-base-v1,LLM,name,True,
1,albert/albert-base-v1,LLM,version,False,
2,albert/albert-base-v1,LLM,number of parameters,False,
3,albert/albert-base-v1,LLM,quantization,False,
4,albert/albert-base-v1,LLM,architecture,False,
5,albert/albert-base-v1,LLM,language,True,
6,albert/albert-base-v1,LLM,model creator,True,
7,albert/albert-base-v1,LLM,license to use,True,
8,albert/albert-base-v1,LLM,library framework,True,
9,albert/albert-base-v1,LLM,context length,False,


## Dataset

In [18]:
# Listing of the datasets on the Hub, requested with full=True.
datasets = api.list_datasets(full=True)

In [19]:
# Keep the first ten datasets of the listing and tabulate them.
datasets = list(itertools.islice(datasets, 10))
datasets_df = pd.DataFrame(datasets)
datasets_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,paperswithcode_id,tags,card_data,siblings
0,amirveyseh/acronym_identification,amirveyseh,15ef643450d589d5883e289ffadeb03563e80a9e,2022-03-02 23:29:22+00:00,2024-01-09 11:39:57+00:00,False,False,False,219,19,acronym-identification,"[task_categories:token-classification, annotat...",{},
1,ade-benchmark-corpus/ade_corpus_v2,ade-benchmark-corpus,4ba01c71687dd7c996597042449448ea312126cf,2022-03-02 23:29:22+00:00,2024-01-09 11:42:58+00:00,False,False,False,834,25,,"[task_categories:text-classification, task_cat...",{},
2,UCLNLP/adversarial_qa,UCLNLP,c2d5f738db1ad21a4126a144dfbb00cb51e0a4a9,2022-03-02 23:29:22+00:00,2023-12-21 14:20:00+00:00,False,False,False,355,32,adversarialqa,"[task_categories:question-answering, task_ids:...",{},
3,Yale-LILY/aeslc,Yale-LILY,2305f2e63b68056f9b9037a3805c8c196e0d5581,2022-03-02 23:29:22+00:00,2024-01-09 11:49:13+00:00,False,False,False,181,12,aeslc,"[task_categories:summarization, annotations_cr...",{},
4,nwu-ctext/afrikaans_ner_corpus,nwu-ctext,445834a997dce8b40e1d108638064381de80c497,2022-03-02 23:29:22+00:00,2024-01-09 11:51:47+00:00,False,False,False,142,6,,"[task_categories:token-classification, task_id...",{},
5,fancyzhx/ag_news,fancyzhx,eb185aade064a813bc0b7f42de02595523103ca4,2022-03-02 23:29:22+00:00,2024-03-07 12:02:37+00:00,False,False,False,7185,121,ag-news,"[task_categories:text-classification, task_ids...",{},
6,allenai/ai2_arc,allenai,210d026faf9955653af8916fad021475a3f00453,2022-03-02 23:29:22+00:00,2023-12-21 15:09:48+00:00,False,False,False,569814,103,,"[task_categories:question-answering, task_ids:...",{},
7,google/air_dialogue,google,dbdbe7bcef8d344bc3c68a05600f3d95917d6898,2022-03-02 23:29:22+00:00,2024-03-07 15:22:15+00:00,False,False,False,113,15,,"[task_categories:text-generation, task_categor...",{},
8,komari6/ajgt_twitter_ar,komari6,af3f2fa5462ac461b696cb300d66e07ad366057f,2022-03-02 23:29:22+00:00,2024-01-09 11:58:01+00:00,False,False,False,141,3,,"[task_categories:text-classification, task_ids...",{},
9,legacy-datasets/allegro_reviews,legacy-datasets,71593d1379934286885c53d147bc863ffe830745,2022-03-02 23:29:22+00:00,2024-01-09 11:59:39+00:00,False,False,False,115,4,allegro-reviews,"[task_categories:text-classification, task_ids...",{},


In [20]:
# List the fields available for every dataset record.
datasets_df.columns

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'paperswithcode_id', 'tags',
       'card_data', 'siblings'],
      dtype='object')

In [21]:
# Raw record of the first dataset.
datasets_df.loc[0]

id                                   amirveyseh/acronym_identification
author                                                      amirveyseh
sha                           15ef643450d589d5883e289ffadeb03563e80a9e
created_at                                   2022-03-02 23:29:22+00:00
last_modified                                2024-01-09 11:39:57+00:00
private                                                          False
gated                                                            False
disabled                                                         False
downloads                                                          219
likes                                                               19
paperswithcode_id                               acronym-identification
tags                 [task_categories:token-classification, annotat...
card_data                                                           {}
siblings                                                          None
Name: 

In [22]:
# Tags of the dataset at row label 2, as an example of the 'key:value' tag format.
datasets_df.loc[2, 'tags']

['task_categories:question-answering',
 'task_ids:extractive-qa',
 'task_ids:open-domain-qa',
 'annotations_creators:crowdsourced',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:cc-by-sa-4.0',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'arxiv:2002.00293',
 'arxiv:1606.05250',
 'region:us']

In [23]:
def match_language(entries):
    """Return the value of the first 'language:<value>' tag in `entries`, or None."""
    language_pattern = r'language:(\S+)'
    return match_string(entries, language_pattern)

def match_size(entries):
    """Return the value of the first 'size_categories:<value>' tag in `entries`, or None."""
    size_pattern = r'size_categories:(\S+)'
    return match_string(entries, size_pattern)

In [24]:
# Fill attributes for a random dataset

# TODO: check None attributes

dataset_idx = 1

dataset = datasets_df.loc[dataset_idx]
dataset_tags = dataset['tags']

# NOTE(review): 'language' holds only the first 'language:' tag here, whereas the
# model attributes collect languages into a list -- confirm which shape is wanted.
dataset_attributes = {
    'name': extract_name(dataset['id']),
    'language': match_language(dataset_tags),
    'dataset creator': dataset['author'], # TODO: add attribute in our model?
    'license to use': match_license(dataset_tags),
    'uri': match_uri(dataset_tags),
    'fine-tuning': None,
    'domain': [tag for tag in dataset_tags if tag in tag_domain],
    'size': match_size(dataset_tags), # TODO: how to deal with this?
}

In [25]:
# Tags of the selected dataset; several dataset_attributes entries are derived from them.
dataset_tags

['task_categories:text-classification',
 'task_categories:token-classification',
 'task_ids:coreference-resolution',
 'task_ids:fact-checking',
 'annotations_creators:expert-generated',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:unknown',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'region:us']

In [26]:
# Print the collected dataset attributes, one indented "key: value" line each.
for key in dataset_attributes:
    value = dataset_attributes[key]
    print(f"{'  '}{key}: {value}")

  name: ade_corpus_v2
  language: en
  dataset creator: ade-benchmark-corpus
  license to use: unknown
  uri: None
  fine-tuning: None
  domain: []
  size: 10K<n<100K


## Downstream Task

## Metrics

In [40]:
# Scrape metrics and descriptions from HF

url_metrics = 'https://huggingface.co/metrics'

# Bound the request time and stop on an HTTP error, so an error page is never
# parsed as if it were the metrics listing.
response = requests.get(url_metrics, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

# Metric names are read from the <h4> elements, descriptions from the <p> elements.
h4_tags = soup.find_all('h4')
metrics = [h4_tag.get_text(strip=True) for h4_tag in h4_tags]
# print(metrics)

p_tags = soup.find_all('p')
descriptions = [p_tag.get_text() for p_tag in p_tags]
descriptions = descriptions[2:] # drop first lines
# print(descriptions)

# remove from the list the metrics without description (not useful for our purpose)
metrics_without_description = [
    'AlhitawiMohammed22/CER_Hu-Evaluation-Metrics',
    'Aye10032/loss_metric',
    'giulio98/code_eval_outputs',
    'maysonma/lingo_judge_metric',
    'lvwerra/test',
    'sma2023/wil',
]
for metric_name in metrics_without_description:
    # The page changes over time; a name that is no longer listed must not crash the cell.
    # Any resulting misalignment is still caught by the length check below.
    if metric_name in metrics:
        metrics.remove(metric_name)


# The two lists are paired by position, so they must have the same length.
assert len(metrics) == len(descriptions), (
    f'{len(metrics)} metrics but {len(descriptions)} descriptions: the two lists are no longer aligned'
)
print(len(metrics))

263


In [41]:
# Drop every metric whose description still contains the placeholder text
# 'TODO: add a description here', together with that description.

placeholder = 'TODO: add a description here'

# Filter the (metric, description) pairs in a single pass. Popping from both
# lists while enumerating one of them shifts the remaining items left, so the
# entry right after each removed one was never checked and consecutive
# placeholder descriptions survived.
kept_pairs = [
    (metric, description)
    for metric, description in zip(metrics, descriptions)
    if placeholder not in description
]
# Slice assignment keeps the update in place, as the original pops did.
metrics[:] = [metric for metric, _ in kept_pairs]
descriptions[:] = [description for _, description in kept_pairs]

assert len(metrics) == len(descriptions)
print(len(metrics))

216
