# Data Availability

In [None]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup

from tags import * # tags.py

In [2]:
api = HfApi()

### LLMs

In [3]:
models = api.list_models(full=True)

In [4]:
model = list(itertools.islice(models, 0, 1000))
models_df = pd.DataFrame(model)
models_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,downloads_all_time,...,pipeline_tag,mask_token,card_data,widget_data,model_index,config,transformers_info,siblings,spaces,safetensors
0,albert/albert-base-v1,albert,082438ba120d36b97b9288772a41144e941705b9,2022-03-02 23:29:04+00:00,2024-02-19 10:57:35+00:00,False,False,,17963,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
1,albert/albert-base-v2,albert,8e2f239c5f8a2c0f253781ca60135db913e5c80c,2022-03-02 23:29:04+00:00,2024-02-19 10:58:14+00:00,False,False,,2305801,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
2,albert/albert-large-v1,albert,94fd741fb5d6cb5bc578fc154837016c583bafef,2022-03-02 23:29:04+00:00,2024-02-19 10:58:26+00:00,False,False,,2029,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
3,albert/albert-large-v2,albert,dfed3a5ef4499fb3351c4ebbcf487375d1e942c8,2022-03-02 23:29:04+00:00,2024-02-19 10:58:48+00:00,False,False,,9378,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
4,albert/albert-xlarge-v1,albert,ed6f87d14403b3c459a458fa6aa9dc5c51c517c1,2022-03-02 23:29:04+00:00,2024-02-19 11:01:28+00:00,False,False,,1486,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
5,albert/albert-xlarge-v2,albert,4fd2c2aa9aeb305f87704a7e595be7bfffa3db88,2022-03-02 23:29:04+00:00,2024-04-10 09:57:46+00:00,False,False,,3585,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
6,albert/albert-xxlarge-v1,albert,43129068ee5f6a481c148daeac11cc593b8ff440,2022-03-02 23:29:04+00:00,2024-02-19 11:01:42+00:00,False,False,,4410,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
7,albert/albert-xxlarge-v2,albert,97d3e58863d3a41dc581882f73b34d110b18f1f8,2022-03-02 23:29:04+00:00,2024-02-19 11:02:09+00:00,False,False,,15595,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
8,google-bert/bert-base-cased-finetuned-mrpc,google-bert,f150c1d609d1e50dd5e2e5408661cfac8339277c,2022-03-02 23:29:04+00:00,2024-02-19 11:03:21+00:00,False,False,,53418,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
9,google-bert/bert-base-cased,google-bert,cd5ef92a9fb2f889e972770a36d4ed042daf221e,2022-03-02 23:29:04+00:00,2024-02-19 11:02:26+00:00,False,False,,5403657,,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,


In [5]:
print(models_df.columns)

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'downloads_all_time', 'likes',
       'library_name', 'tags', 'pipeline_tag', 'mask_token', 'card_data',
       'widget_data', 'model_index', 'config', 'transformers_info', 'siblings',
       'spaces', 'safetensors'],
      dtype='object')


In [6]:
models_df.loc[0]['tags']

['transformers',
 'pytorch',
 'tf',
 'safetensors',
 'albert',
 'fill-mask',
 'exbert',
 'en',
 'dataset:bookcorpus',
 'dataset:wikipedia',
 'arxiv:1909.11942',
 'license:apache-2.0',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [69]:
example_model = api.list_models(model_name='meta-llama/Meta-Llama-3.1-8B-Instruct')
example_df = pd.DataFrame(example_model)
example_df.loc[0]['tags']

['transformers',
 'safetensors',
 'llama',
 'text-generation',
 'facebook',
 'meta',
 'pytorch',
 'llama-3',
 'conversational',
 'en',
 'de',
 'fr',
 'it',
 'pt',
 'hi',
 'es',
 'th',
 'arxiv:2204.05149',
 'license:llama3.1',
 'autotrain_compatible',
 'text-generation-inference',
 'endpoints_compatible',
 'region:us']

In [39]:
# Scrape languages from HF

url_languages = 'https://huggingface.co/languages'

response = requests.get(url_languages)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]
print(tag_language)

tag_language.remove('jax') # 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?

['en', 'zh', 'fr', 'es', 'ru', 'de', 'ja', 'pt', 'ar', 'ko', 'it', 'vi', 'hi', 'tr', 'id', 'pl', 'nl', 'th', 'bn', 'fa', 'sv', 'cs', 'fi', 'ca', 'ro', 'ta', 'da', 'hu', 'uk', 'ind', 'el', 'te', 'ur', 'bg', 'he', 'ms', 'sl', 'ml', 'mr', 'sk', 'sw', 'et', 'kn', 'eu', 'gu', 'sr', 'hr', 'no', 'lt', 'lv', 'pa', 'is', 'yo', 'vie', 'am', 'af', 'ne', 'az', 'si', 'mt', 'ga', 'gl', 'sq', 'or', 'kk', 'ceb', 'tha', 'tl', 'hy', 'as', 'mk', 'ha', 'cy', 'ka', 'my', 'uz', 'eng', 'eo', 'ig', 'be', 'km', 'nb', 'mn', 'ky', 'zu', 'la', 'min', 'jav', 'so', 'xh', 'nn', 'rw', 'ps', 'jv', 'mya', 'br', 'tt', 'yue', 'bs', 'sa', 'ckb', 'lg', 'wo', 'lo', 'ku', 'sd', 'ug', 'ilo', 'sun', 'ast', 'tw', 'tg', 'ace', 'lb', 'nso', 'gd', 'war', 'fy', 'tgl', 'fil', 'bug', 'su', 'oc', 'sn', 'bjn', 'khm', 'tk', 'gn', 'yi', 'ht', 'mai', 'bo', 'ban', 'zlm', 'ba', 'fo', 'ln', 'dv', 'tn', 'kab', 'bm', 'ny', 'shn', 'cv', 'aa', 'mg', 'mi', 'ti', 'sat', 'lao', 'arz', 'mar', 'sah', 'st', 'ee', 'pag', 'ia', 'qu', 'azb', 'vec', 'lij'

In [40]:
def extract_name(full_name):
    pattern = re.compile(r'[^/]+/(.+)')
    match = re.search(pattern, full_name)
    return match.group(1) # the part after '/' might also contain version and number of parameters (impossible to extract in a uniform way)

def match_string(entries, pattern):
    pattern = re.compile(pattern)
    for entry in entries:
        match = pattern.match(entry)
        if match:
            return match.group(1)
    return None

def match_license(entries):
    return match_string(entries, r'license:(\S+)')

def match_dataset(entries):
    return match_string(entries, r'dataset:(\S+)')

def match_uri(entries):
    return match_string(entries, r'arxiv:(\S+)')

In [55]:
# Fill attributes for a random model

# TODO: check None attributes

model_idx = 1

model = models_df.loc[model_idx]
model_tags = models_df.loc[model_idx]['tags']
model_attributes = dict()

model_attributes['name'] = extract_name(model['id'])
model_attributes['version'] = None
model_attributes['number of parameters'] = None

model_attributes['quantization'] = None
for t in model_tags:
	if t in tag_quantization:
		model_attributes['quantization'] = t

model_attributes['architecture'] = None

model_attributes['language'] = []
for t in model_tags:
	if t in tag_language:
		model_attributes['language'].append(t)

model_attributes['model creator'] = model['author']
model_attributes['developer'] = None # TODO: same as model creator?

model_attributes['license to use'] = match_license(model_tags)

model_attributes['library'] = [] # TODO: change type into list(str) in our model
for t in model_tags:
	if t in tag_library:
		model_attributes['library'].append(t)

model_attributes['context length'] = None
model_attributes['open source'] = None

model_attributes['uri'] = match_uri(model_tags)

model_attributes['fine-tuned'] = None

model_attributes['carbon emission'] = False
for t in model_tags:
	if t in tag_carbon_emission:
		model_attributes['carbon emission'] = True # only possible to extract boolean from HF... TODO: always set to False?

model_attributes['tokenizer'] = None

In [56]:
model_tags

['transformers',
 'pytorch',
 'tf',
 'jax',
 'rust',
 'safetensors',
 'albert',
 'fill-mask',
 'en',
 'dataset:bookcorpus',
 'dataset:wikipedia',
 'arxiv:1909.11942',
 'license:apache-2.0',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [57]:
for key, value in model_attributes.items():
	print(f"{'  '}{key}: {value}")

  name: albert-base-v2
  version: None
  number of parameters: None
  quantization: None
  architecture: None
  language: ['en']
  model creator: albert
  developer: None
  license to use: apache-2.0
  library: ['transformers', 'pytorch', 'jax', 'rust', 'safetensors']
  context length: None
  open source: None
  uri: 1909.11942
  fine-tuned: None
  carbon emission: False
  tokenizer: None


In [44]:
availability = pd.DataFrame(columns=['id', 'entity name', 'attribute name', 'available API', 'available scraping'])

llm_attributes = ['name', 'version', 'number of parameters', 'quantization', 'architecture', 'language', 'model creator', 'license to use', 'library framework', 'context length', 'developer', 'open source', 'uri', 'fine-tuned', 'carbon emission', 'tokenizer']
llm_attributes_API_availability = [True, False, False, False, False, True, True, True, True, False, False, False, True, False, True, False]
availability['id'] = models_df['id']
availability['entity name'] = 'LLM'
availability = availability.loc[availability.index.repeat(len(llm_attributes))].reset_index(drop=True)
availability['attribute name'] = llm_attributes * len(models_df)
availability['available API'] = llm_attributes_API_availability * len(models_df)

In [77]:
availability.head(32)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,albert/albert-base-v1,LLM,name,True,
1,albert/albert-base-v1,LLM,version,False,
2,albert/albert-base-v1,LLM,number of parameters,False,
3,albert/albert-base-v1,LLM,quantization,False,
4,albert/albert-base-v1,LLM,architecture,False,
5,albert/albert-base-v1,LLM,language,True,
6,albert/albert-base-v1,LLM,model creator,True,
7,albert/albert-base-v1,LLM,license to use,True,
8,albert/albert-base-v1,LLM,library framework,True,
9,albert/albert-base-v1,LLM,context length,False,


## Dataset

In [61]:
datasets = api.list_datasets(full=True)

In [62]:
datasets = list(itertools.islice(datasets, 0, 10))
datasets_df = pd.DataFrame(datasets)
datasets_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,downloads_all_time,likes,paperswithcode_id,tags,card_data,siblings
0,amirveyseh/acronym_identification,amirveyseh,15ef643450d589d5883e289ffadeb03563e80a9e,2022-03-02 23:29:22+00:00,2024-01-09 11:39:57+00:00,False,False,False,218,,19,acronym-identification,"[task_categories:token-classification, annotat...",{},
1,ade-benchmark-corpus/ade_corpus_v2,ade-benchmark-corpus,4ba01c71687dd7c996597042449448ea312126cf,2022-03-02 23:29:22+00:00,2024-01-09 11:42:58+00:00,False,False,False,874,,25,,"[task_categories:text-classification, task_cat...",{},
2,UCLNLP/adversarial_qa,UCLNLP,c2d5f738db1ad21a4126a144dfbb00cb51e0a4a9,2022-03-02 23:29:22+00:00,2023-12-21 14:20:00+00:00,False,False,False,353,,32,adversarialqa,"[task_categories:question-answering, task_ids:...",{},
3,Yale-LILY/aeslc,Yale-LILY,2305f2e63b68056f9b9037a3805c8c196e0d5581,2022-03-02 23:29:22+00:00,2024-01-09 11:49:13+00:00,False,False,False,181,,12,aeslc,"[task_categories:summarization, annotations_cr...",{},
4,nwu-ctext/afrikaans_ner_corpus,nwu-ctext,445834a997dce8b40e1d108638064381de80c497,2022-03-02 23:29:22+00:00,2024-01-09 11:51:47+00:00,False,False,False,143,,6,,"[task_categories:token-classification, task_id...",{},
5,fancyzhx/ag_news,fancyzhx,eb185aade064a813bc0b7f42de02595523103ca4,2022-03-02 23:29:22+00:00,2024-03-07 12:02:37+00:00,False,False,False,7289,,121,ag-news,"[task_categories:text-classification, task_ids...",{},
6,allenai/ai2_arc,allenai,210d026faf9955653af8916fad021475a3f00453,2022-03-02 23:29:22+00:00,2023-12-21 15:09:48+00:00,False,False,False,571497,,103,,"[task_categories:question-answering, task_ids:...",{},
7,google/air_dialogue,google,dbdbe7bcef8d344bc3c68a05600f3d95917d6898,2022-03-02 23:29:22+00:00,2024-03-07 15:22:15+00:00,False,False,False,119,,15,,"[task_categories:text-generation, task_categor...",{},
8,komari6/ajgt_twitter_ar,komari6,af3f2fa5462ac461b696cb300d66e07ad366057f,2022-03-02 23:29:22+00:00,2024-01-09 11:58:01+00:00,False,False,False,136,,3,,"[task_categories:text-classification, task_ids...",{},
9,legacy-datasets/allegro_reviews,legacy-datasets,71593d1379934286885c53d147bc863ffe830745,2022-03-02 23:29:22+00:00,2024-01-09 11:59:39+00:00,False,False,False,110,,4,allegro-reviews,"[task_categories:text-classification, task_ids...",{},


In [63]:
datasets_df.columns

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'downloads_all_time', 'likes',
       'paperswithcode_id', 'tags', 'card_data', 'siblings'],
      dtype='object')

In [64]:
datasets_df.loc[0]

id                                    amirveyseh/acronym_identification
author                                                       amirveyseh
sha                            15ef643450d589d5883e289ffadeb03563e80a9e
created_at                                    2022-03-02 23:29:22+00:00
last_modified                                 2024-01-09 11:39:57+00:00
private                                                           False
gated                                                             False
disabled                                                          False
downloads                                                           218
downloads_all_time                                                 None
likes                                                                19
paperswithcode_id                                acronym-identification
tags                  [task_categories:token-classification, annotat...
card_data                                                       

In [65]:
datasets_df.loc[2]['tags']

['task_categories:question-answering',
 'task_ids:extractive-qa',
 'task_ids:open-domain-qa',
 'annotations_creators:crowdsourced',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:cc-by-sa-4.0',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'arxiv:2002.00293',
 'arxiv:1606.05250',
 'region:us']

In [72]:
def match_language(entries):
    return match_string(entries, r'language:(\S+)')

def match_size(entries):
    return match_string(entries, r'size_categories:(\S+)')

In [79]:
# Fill attributes for a random dataset

# TODO: check None attributes

dataset_idx = 1

dataset = datasets_df.loc[dataset_idx]
dataset_tags = datasets_df.loc[dataset_idx]['tags']
dataset_attributes = dict()

dataset_attributes['name'] = extract_name(dataset['id'])

dataset_attributes['language'] = match_language(dataset_tags)

dataset_attributes['dataset creator'] = dataset['author'] # TODO: add attribute in our model?

dataset_attributes['license to use'] = match_license(dataset_tags)

dataset_attributes['uri'] = match_uri(dataset_tags)

dataset_attributes['fine-tuning'] = None

dataset_attributes['domain'] = []
for t in dataset_tags:
	if t in tag_domain:
		dataset_attributes['domain'].append(t)

dataset_attributes['size'] = match_size(dataset_tags) # TODO: how to deal with this?

In [80]:
dataset_tags

['task_categories:text-classification',
 'task_categories:token-classification',
 'task_ids:coreference-resolution',
 'task_ids:fact-checking',
 'annotations_creators:expert-generated',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:unknown',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'region:us']

In [81]:
for key, value in dataset_attributes.items():
	print(f"{'  '}{key}: {value}")

  name: ade_corpus_v2
  language: en
  dataset creator: ade-benchmark-corpus
  license to use: unknown
  uri: None
  fine-tuning: None
  domain: []
  size: 10K<n<100K


## Downstream Task

## Metrics

In [90]:
# Scrape metrics and descriptions from HF

url_metrics = 'https://huggingface.co/metrics'

response = requests.get(url_metrics)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

h4_tags = soup.find_all('h4')
metrics = [h4_tag.get_text(strip=True) for h4_tag in h4_tags]
# print(metrics)

p_tags = soup.find_all('p')
descriptions = [p_tag.get_text() for p_tag in p_tags]
descriptions = descriptions[2:] # drop first lines
# print(descriptions)

assert len(metrics) == len(descriptions) # TODO: check why assert fails

AssertionError: 