# Data Availability

In [1]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup
import time
import json
import os

from tags import * # tags.py

In [2]:
# Hub client shared by the listing calls made in the cells and functions below
api = HfApi()

## LLMs

### Information extraction

In [3]:
# Request the model listing with full=True; the next cell slices the first entries from it
models = api.list_models(full=True)

In [4]:
# Keep the first 1000 entries of the listing and load them into a DataFrame.
# NOTE(review): `model` holds the whole 1000-entry slice, not a single model.
# NOTE(review): if `models` is a one-shot iterator, re-running this cell alone yields the *next* 1000 entries — confirm.
model = itertools.islice(models, 0, 1000)
models_df = pd.DataFrame(model)
models_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,...,pipeline_tag,mask_token,card_data,widget_data,model_index,config,transformers_info,siblings,spaces,safetensors
0,albert/albert-base-v1,albert,082438ba120d36b97b9288772a41144e941705b9,2022-03-02 23:29:04+00:00,2024-02-19 10:57:35+00:00,False,False,,14480,8,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
1,albert/albert-base-v2,albert,8e2f239c5f8a2c0f253781ca60135db913e5c80c,2022-03-02 23:29:04+00:00,2024-02-19 10:58:14+00:00,False,False,,2430669,99,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
2,albert/albert-large-v1,albert,94fd741fb5d6cb5bc578fc154837016c583bafef,2022-03-02 23:29:04+00:00,2024-02-19 10:58:26+00:00,False,False,,1843,3,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
3,albert/albert-large-v2,albert,dfed3a5ef4499fb3351c4ebbcf487375d1e942c8,2022-03-02 23:29:04+00:00,2024-02-19 10:58:48+00:00,False,False,,15973,15,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
4,albert/albert-xlarge-v1,albert,ed6f87d14403b3c459a458fa6aa9dc5c51c517c1,2022-03-02 23:29:04+00:00,2024-02-19 11:01:28+00:00,False,False,,1299,4,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
5,albert/albert-xlarge-v2,albert,4fd2c2aa9aeb305f87704a7e595be7bfffa3db88,2022-03-02 23:29:04+00:00,2024-04-10 09:57:46+00:00,False,False,,3809,8,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
6,albert/albert-xxlarge-v1,albert,43129068ee5f6a481c148daeac11cc593b8ff440,2022-03-02 23:29:04+00:00,2024-02-19 11:01:42+00:00,False,False,,4743,5,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
7,albert/albert-xxlarge-v2,albert,97d3e58863d3a41dc581882f73b34d110b18f1f8,2022-03-02 23:29:04+00:00,2024-02-19 11:02:09+00:00,False,False,,8952,19,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
8,google-bert/bert-base-cased-finetuned-mrpc,google-bert,f150c1d609d1e50dd5e2e5408661cfac8339277c,2022-03-02 23:29:04+00:00,2024-02-19 11:03:21+00:00,False,False,,51295,1,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,
9,google-bert/bert-base-cased,google-bert,cd5ef92a9fb2f889e972770a36d4ed042daf221e,2022-03-02 23:29:04+00:00,2024-02-19 11:02:26+00:00,False,False,,5931863,246,...,fill-mask,,,,,,,"[{'rfilename': '.gitattributes', 'size': None,...",,


In [5]:
# Show which fields are available on each model record
print(models_df.columns)

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'library_name', 'tags',
       'pipeline_tag', 'mask_token', 'card_data', 'widget_data', 'model_index',
       'config', 'transformers_info', 'siblings', 'spaces', 'safetensors'],
      dtype='object')


In [6]:
# tag examples: the tag list of the first model in models_df
models_df.loc[0]['tags']

['transformers',
 'pytorch',
 'tf',
 'safetensors',
 'albert',
 'fill-mask',
 'exbert',
 'en',
 'dataset:bookcorpus',
 'dataset:wikipedia',
 'arxiv:1909.11942',
 'license:apache-2.0',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [7]:
# richer tag information example: search the Hub by model name and show the tags of the first hit
example_model = api.list_models(model_name='albert_bert_summarization_cnn_dailymail')
example_df = pd.DataFrame(example_model)
example_df.loc[0]['tags']

['transformers',
 'pytorch',
 'tensorboard',
 'encoder-decoder',
 'text2text-generation',
 'generated_from_trainer',
 'dataset:cnn_dailymail',
 'autotrain_compatible',
 'endpoints_compatible',
 'region:us']

In [8]:
# Scrape languages from HF
# Every <code> element of the languages page is taken as one language tag.

url_languages = 'https://huggingface.co/languages'

# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail loudly instead of silently producing an empty language list.
response = requests.get(url_languages, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]
print(tag_language)

# 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?
# Guarded removal: list.remove() raises ValueError when the value is absent.
if 'jax' in tag_language:
    tag_language.remove('jax')

['en', 'zh', 'fr', 'es', 'ru', 'de', 'ja', 'pt', 'ko', 'ar', 'it', 'vi', 'tr', 'hi', 'id', 'pl', 'nl', 'th', 'bn', 'fa', 'cs', 'sv', 'ro', 'fi', 'ca', 'ta', 'da', 'hu', 'uk', 'ind', 'el', 'te', 'ur', 'bg', 'he', 'ml', 'ms', 'sl', 'mr', 'sk', 'sw', 'et', 'eu', 'kn', 'gu', 'sr', 'no', 'hr', 'lt', 'lv', 'pa', 'is', 'yo', 'am', 'vie', 'ne', 'az', 'af', 'ga', 'mt', 'si', 'gl', 'sq', 'or', 'kk', 'cy', 'tl', 'ceb', 'tha', 'as', 'mk', 'hy', 'ha', 'uz', 'my', 'ka', 'eng', 'ig', 'eo', 'be', 'nb', 'km', 'mn', 'ky', 'la', 'zu', 'so', 'min', 'jav', 'xh', 'nn', 'ps', 'rw', 'yue', 'jv', 'mya', 'tt', 'br', 'bs', 'ckb', 'sa', 'lg', 'lo', 'wo', 'ku', 'ug', 'ilo', 'sd', 'ast', 'tw', 'sun', 'tg', 'lb', 'ace', 'nso', 'gd', 'war', 'fil', 'oc', 'fy', 'tgl', 'su', 'tk', 'bug', 'sn', 'khm', 'bjn', 'gn', 'ht', 'yi', 'mai', 'ba', 'bo', 'ban', 'zlm', 'tn', 'fo', 'dv', 'kab', 'ln', 'bm', 'hin', 'aa', 'ny', 'ti', 'cv', 'shn', 'mi', 'sat', 'mg', 'lao', 'sah', 'arz', 'ee', 'st', 'mar', 'ia', 'pag', 'qu', 'hsb', 'ab',

In [9]:
# pattern matching functions

def extract_name(full_name):
    """Strip the leading ``owner/`` part from a repository id.

    Returns everything after the first ``/`` that is preceded by at least one
    non-slash character. When the id has no such prefix, it is returned
    unchanged. The remaining part may still embed a version or a parameter
    count; those cannot be split off in a uniform way.
    """
    found = re.search(r'[^/]+/(.+)', full_name)
    return found.group(1) if found else full_name

def match_string(entries, pattern):
    """Return group 1 of the first entry matching ``pattern``, else None.

    ``pattern`` is anchored at the start of each entry (``re.match``), and the
    entries are scanned in order, stopping at the first hit.
    """
    compiled = re.compile(pattern)
    hits = (compiled.match(entry) for entry in entries)
    return next((hit.group(1) for hit in hits if hit), None)

def find_all_matches(entries, pattern):
    """Collect group 1 of every entry that matches ``pattern``.

    ``pattern`` is anchored at the start of each entry (``re.match``). The
    result keeps the order of ``entries`` and is empty when nothing matches.
    """
    compiled = re.compile(pattern)
    candidates = map(compiled.match, entries)
    return [hit.group(1) for hit in candidates if hit]

def match_license(entries):
    """Return the identifier of the first ``license:<id>`` tag, or None if there is none."""
    license_pattern = r'license:(\S+)'
    return match_string(entries, license_pattern)

def match_dataset(entries):
    """Return the names from all ``dataset:<name>`` tags, in tag order (possibly empty)."""
    dataset_pattern = r'dataset:(\S+)'
    return find_all_matches(entries, dataset_pattern)

def match_uri(entries):
    """Return the publication identifier carried by the tags, or None.

    The first ``arxiv:<id>`` tag wins; a ``doi:<id>`` tag is used only when no
    arXiv tag exists. The value returned is the bare identifier captured after
    the prefix, not a full URL.
    """
    for prefixed_pattern in (r'arxiv:(\S+)', r'doi:(\S+)'):
        identifier = match_string(entries, prefixed_pattern)
        if identifier is not None:
            return identifier
    return None

In [10]:
def extract_model_attributes(model_idx):
    """Build the attribute dictionary for one row of the global ``models_df``.

    Parameters
    ----------
    model_idx
        Index label of the model row in ``models_df`` (looked up with ``.loc``).

    Returns
    -------
    dict
        Keys, in insertion order: name, version, numberOfParameters,
        quantization, architecture, languages, modelCreator, licenseToUse,
        libraryFramework, contextLength, developers, openSource, uri,
        fineTuned, carbonEmission [CO2eq tons], tokenizer. Attributes that
        cannot be extracted are None (or an empty list for list attributes).

    Notes
    -----
    Reads the globals ``models_df``, ``api``, ``tag_quantization``,
    ``tag_language`` and ``tag_library``. Makes one Hub request for the card
    data and, when a base model is declared, one more for the base model.
    Missing pieces are reported with ``print`` and never raise.
    """
    model = models_df.loc[model_idx]
    model_tags = model['tags']

    # Card data comes from a name search; the first hit is used.
    # next(..., None) keeps an empty search from raising StopIteration: the
    # None result then fails on `.card_data`, like a hit without card data.
    model_card_data = None
    try:
        model_info = next(api.list_models(model_name=model['id'], full=True, cardData=True), None)
        model_card_data = model_info.card_data.to_dict()
    except AttributeError:
        print('No card data available for this model')

    has_card = model_card_data is not None
    has_base_model = has_card and 'base_model' in model_card_data
    base_model = model_card_data['base_model'] if has_base_model else None

    model_attributes = dict()

    model_attributes['name'] = extract_name(model['id'])
    model_attributes['version'] = None # sometimes in model['id'] but impossible to extract in a uniform way
    model_attributes['numberOfParameters'] = None # sometimes in model['id'] or model description but impossible to extract in a uniform way

    # When several quantization tags are present, the last one wins.
    quantization_tags = [t for t in model_tags if t in tag_quantization]
    model_attributes['quantization'] = quantization_tags[-1] if quantization_tags else None

    # The declared base model is used as the architecture.
    model_attributes['architecture'] = base_model
    if has_card and not has_base_model:
        print('No architecture data available for this model')

    model_attributes['languages'] = [t for t in model_tags if t in tag_language]

    # The creator is the author of the base model, when one is declared and found on the Hub.
    model_attributes['modelCreator'] = None
    if has_card:
        base_info = None
        if has_base_model:
            # NOTE(review): card metadata can presumably declare several base models as a list;
            # the first one is used for the creator lookup — confirm this is the intended choice.
            lookup_id = base_model[0] if isinstance(base_model, (list, tuple)) and base_model else base_model
            # Skip the request for an empty / non-string id, and fetch only the first search hit.
            if isinstance(lookup_id, str) and lookup_id:
                base_info = next(api.list_models(model_name=lookup_id, full=True), None)
        if base_info is None:
            print('No base model data available for this model')
        else:
            model_attributes['modelCreator'] = base_info.author

    model_attributes['licenseToUse'] = match_license(model_tags)

    model_attributes['libraryFramework'] = [t for t in model_tags if t in tag_library] # TODO: change type into list(str) in our model

    model_attributes['contextLength'] = None
    model_attributes['developers'] = [model['author']]
    model_attributes['openSource'] = True

    model_attributes['uri'] = match_uri(model_tags)

    # A model counts as fine-tuned exactly when its card declares a 'base_model'.
    model_attributes['fineTuned'] = True if has_base_model else None

    # NOTE(review): the card value is stored as-is; confirm its unit really is tons of CO2eq.
    model_attributes['carbonEmission [CO2eq tons]'] = None
    if has_card:
        if 'co2_eq_emissions' in model_card_data:
            model_attributes['carbonEmission [CO2eq tons]'] = model_card_data['co2_eq_emissions']
        else:
            print('No emission data available for this model')

    model_attributes['tokenizer'] = None

    return model_attributes

In [11]:
def model_to_json(model_idx):
    """Extract one model's attributes and dump them to ``test_models.json``.

    The file is written to ``<parent of cwd>/database/hf extracted json``; the
    directory is created when missing and an existing file is overwritten.
    """
    result_path = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    target_file = os.path.join(result_path, 'test_models.json')
    model_attributes = extract_model_attributes(model_idx)

    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(model_attributes, f, indent=4)

In [12]:
def models_to_json(models_df):
    """Extract the attributes of every model and dump them to one JSON file.

    Writes ``ChatIMPACT.LargeLanguageModel.json`` into
    ``<parent of cwd>/database/hf extracted json`` (created when missing).

    Note: the ``models_df`` argument only determines how many rows are
    processed; ``extract_model_attributes`` reads the rows from the global
    ``models_df`` by position 0..n-1.
    """
    result_path = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    row_count = models_df.shape[0]
    output = [extract_model_attributes(model_idx) for model_idx in range(row_count)]

    target_file = os.path.join(result_path, 'ChatIMPACT.LargeLanguageModel.json')
    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)

In [13]:
# models_to_json(models_df)

### Statistics

In [14]:
# Read JSON and count available attributes

def models_statistics():
    """Report how often each LLM attribute is available in the extracted JSON.

    Reads ``ChatIMPACT.LargeLanguageModel.json`` from
    ``<parent of cwd>/database/hf extracted json``, prints one count/percentage
    line per attribute and returns a long-format availability table.

    An attribute is available when its value is not None and, for lists, not
    empty. The same rule drives both the printed counts and the table.

    Returns
    -------
    pandas.DataFrame
        One row per (model, attribute) with columns 'id', 'entity name',
        'attribute name', 'available API', 'available scraping'. The attribute
        set is taken from the keys of the first record. All LLM attributes come
        from the API, so 'available scraping' is always False. The table is
        empty when the JSON file holds no records.
    """
    # Printed label -> JSON key, in report order.
    # TODO: add more attributes (?)
    report_fields = [
        ('Name', 'name'),
        ('Version', 'version'),
        ('Number of Parameters', 'numberOfParameters'),
        ('Quantization', 'quantization'),
        ('Architecture', 'architecture'),
        ('Languages', 'languages'),
        ('Model creator', 'modelCreator'),
        ('License to use', 'licenseToUse'),
        ('Library', 'libraryFramework'),
        ('Context Length', 'contextLength'),
        ('Developers', 'developers'),
        ('Open Source', 'openSource'),
        ('URI', 'uri'),
        ('Fine-tuned', 'fineTuned'),
        ('Carbon emission', 'carbonEmission [CO2eq tons]'),
        ('Tokenizer', 'tokenizer'),
    ]

    def is_available(value):
        # Lists count only when non-empty; everything else counts unless None.
        if isinstance(value, list):
            return len(value) > 0
        return value is not None

    result_path = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')

    # `with` closes the file handle, which the bare open() never did.
    with open(os.path.join(result_path, 'ChatIMPACT.LargeLanguageModel.json'), encoding='utf-8') as models_json:
        models_data_json = json.load(models_json)

    # len() instead of the last loop index, so an empty report does not crash.
    total_models = len(models_data_json)

    print(f'Number of processed models: {total_models}')
    for label, key in report_fields:
        count = sum(1 for item in models_data_json if is_available(item.get(key)))
        share = (count / total_models) * 100 if total_models else 0.0
        print(f'    {label}: {count} ({share:.2f}%)')

    columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
    llm_attributes = list(models_data_json[0].keys()) if models_data_json else []

    # Collect plain records and build the frame once; concatenating one row at a
    # time copies the whole table on every step.
    records = [
        {
            'id': item.get('name'),
            'entity name': 'LLM',
            'attribute name': attr,
            'available API': is_available(item.get(attr)),
            'available scraping': False,
        }
        for item in models_data_json
        for attr in llm_attributes
    ]

    return pd.DataFrame(records, columns=columns)



In [15]:
# Print the per-attribute report for the extracted models and keep the availability table
availability = models_statistics()

Number of processed models: 1000
    Name: 1000 (100.00%)
    Version: 0 (0.00%)
    Number of Parameters: 0 (0.00%)
    Quantization: 0 (0.00%)
    Architecture: 6 (0.60%)
    Languages: 251 (25.10%)
    Model creator: 6 (0.60%)
    License to use: 172 (17.20%)
    Library: 599 (59.90%)
    Context Length: 0 (0.00%)
    Developers: 1000 (100.00%)
    Open Source: 1000 (100.00%)
    URI: 177 (17.70%)
    Fine-tuned: 6 (0.60%)
    Carbon emission: 9 (0.90%)
    Tokenizer: 0 (0.00%)


In [16]:
# Preview the first 30 (model, attribute) rows of the availability table
availability.head(30)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,albert-base-v1,LLM,name,True,False
1,albert-base-v1,LLM,version,False,False
2,albert-base-v1,LLM,numberOfParameters,False,False
3,albert-base-v1,LLM,quantization,False,False
4,albert-base-v1,LLM,architecture,False,False
5,albert-base-v1,LLM,languages,True,False
6,albert-base-v1,LLM,modelCreator,False,False
7,albert-base-v1,LLM,licenseToUse,True,False
8,albert-base-v1,LLM,libraryFramework,True,False
9,albert-base-v1,LLM,contextLength,False,False


## Dataset

### Information extraction

In [17]:
# Request the dataset listing with full=True; the next cell keeps the first entries
datasets = api.list_datasets(full=True)

In [18]:
# TODO run for all datasets
# Materialise the first 1000 listing entries; `datasets` is rebound from the listing to that list.
datasets = list(itertools.islice(datasets, 0, 1000))
datasets_df = pd.DataFrame(datasets)
datasets_df.head(10)

Unnamed: 0,id,author,sha,created_at,last_modified,private,gated,disabled,downloads,likes,paperswithcode_id,tags,card_data,siblings
0,amirveyseh/acronym_identification,amirveyseh,15ef643450d589d5883e289ffadeb03563e80a9e,2022-03-02 23:29:22+00:00,2024-01-09 11:39:57+00:00,False,False,False,180,19,acronym-identification,"[task_categories:token-classification, annotat...",{},
1,ade-benchmark-corpus/ade_corpus_v2,ade-benchmark-corpus,4ba01c71687dd7c996597042449448ea312126cf,2022-03-02 23:29:22+00:00,2024-01-09 11:42:58+00:00,False,False,False,241,25,,"[task_categories:text-classification, task_cat...",{},
2,UCLNLP/adversarial_qa,UCLNLP,c2d5f738db1ad21a4126a144dfbb00cb51e0a4a9,2022-03-02 23:29:22+00:00,2023-12-21 14:20:00+00:00,False,False,False,139,32,adversarialqa,"[task_categories:question-answering, task_ids:...",{},
3,Yale-LILY/aeslc,Yale-LILY,2305f2e63b68056f9b9037a3805c8c196e0d5581,2022-03-02 23:29:22+00:00,2024-01-09 11:49:13+00:00,False,False,False,124,12,aeslc,"[task_categories:summarization, annotations_cr...",{},
4,nwu-ctext/afrikaans_ner_corpus,nwu-ctext,445834a997dce8b40e1d108638064381de80c497,2022-03-02 23:29:22+00:00,2024-01-09 11:51:47+00:00,False,False,False,112,6,,"[task_categories:token-classification, task_id...",{},
5,fancyzhx/ag_news,fancyzhx,eb185aade064a813bc0b7f42de02595523103ca4,2022-03-02 23:29:22+00:00,2024-03-07 12:02:37+00:00,False,False,False,7297,123,ag-news,"[task_categories:text-classification, task_ids...",{},
6,allenai/ai2_arc,allenai,210d026faf9955653af8916fad021475a3f00453,2022-03-02 23:29:22+00:00,2023-12-21 15:09:48+00:00,False,False,False,785162,111,,"[task_categories:question-answering, task_ids:...",{},
7,google/air_dialogue,google,dbdbe7bcef8d344bc3c68a05600f3d95917d6898,2022-03-02 23:29:22+00:00,2024-03-07 15:22:15+00:00,False,False,False,75,15,,"[task_categories:text-generation, task_categor...",{},
8,komari6/ajgt_twitter_ar,komari6,af3f2fa5462ac461b696cb300d66e07ad366057f,2022-03-02 23:29:22+00:00,2024-01-09 11:58:01+00:00,False,False,False,119,4,,"[task_categories:text-classification, task_ids...",{},
9,legacy-datasets/allegro_reviews,legacy-datasets,71593d1379934286885c53d147bc863ffe830745,2022-03-02 23:29:22+00:00,2024-01-09 11:59:39+00:00,False,False,False,84,4,allegro-reviews,"[task_categories:text-classification, task_ids...",{},


In [19]:
# Show which fields are available on each dataset record
datasets_df.columns

Index(['id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'likes', 'paperswithcode_id', 'tags',
       'card_data', 'siblings'],
      dtype='object')

In [20]:
# Full record of the first dataset
datasets_df.loc[0]

id                                   amirveyseh/acronym_identification
author                                                      amirveyseh
sha                           15ef643450d589d5883e289ffadeb03563e80a9e
created_at                                   2022-03-02 23:29:22+00:00
last_modified                                2024-01-09 11:39:57+00:00
private                                                          False
gated                                                            False
disabled                                                         False
downloads                                                          180
likes                                                               19
paperswithcode_id                               acronym-identification
tags                 [task_categories:token-classification, annotat...
card_data                                                           {}
siblings                                                          None
Name: 

In [21]:
# Tag example: dataset tags use 'key:value' prefixes (language:, license:, size_categories:, arxiv:, ...)
datasets_df.loc[2]['tags']

['task_categories:question-answering',
 'task_ids:extractive-qa',
 'task_ids:open-domain-qa',
 'annotations_creators:crowdsourced',
 'language_creators:found',
 'multilinguality:monolingual',
 'source_datasets:original',
 'language:en',
 'license:cc-by-sa-4.0',
 'size_categories:10K<n<100K',
 'format:parquet',
 'modality:text',
 'library:datasets',
 'library:pandas',
 'library:mlcroissant',
 'library:polars',
 'arxiv:2002.00293',
 'arxiv:1606.05250',
 'region:us']

In [22]:
def match_language(entries):
    """Return the codes from all ``language:<code>`` tags, in tag order (possibly empty)."""
    language_pattern = r'language:(\S+)'
    return find_all_matches(entries, language_pattern)

def match_size(entries):
    """Return the value of the first ``size_categories:<range>`` tag, or None if there is none."""
    size_pattern = r'size_categories:(\S+)'
    return match_string(entries, size_pattern)

In [23]:
def convert_file_size_to_gb(file_size_str):
    """
    Convert the file size string (e.g., '74.6 kB') to gigabytes (GB).

    The string must be '<number> <unit>' separated by whitespace; anything
    after the unit is ignored. Recognised units are 'B', 'kB', 'MB', 'GB' and
    'TB' (case-sensitive), converted with 1024-based factors.

    Returns the size in GB as a float, or None when the unit is unknown or the
    text cannot be parsed (empty/None input, missing unit, non-numeric amount).
    """
    conversion_factors = {
        'B': 1 / (1024 ** 3),
        'kB': 1 / (1024 ** 2),
        'MB': 1 / 1024,
        'GB': 1,
        'TB': 1024,
    }

    if not file_size_str:
        return None

    # Scraped text is not guaranteed to be well formed: report malformed input
    # as "no size" rather than raising IndexError / ValueError.
    file_size_parts = file_size_str.split()
    if len(file_size_parts) < 2:
        return None

    try:
        file_size = float(file_size_parts[0])
    except ValueError:
        return None

    factor = conversion_factors.get(file_size_parts[1])
    if factor is None:
        return None
    return float(file_size * factor)

def extract_file_size(url):
    """Scrape the text shown for 'Size of downloaded dataset files:' on a page.

    Parameters
    ----------
    url
        Address of the page to fetch.

    Returns
    -------
    str or None
        The stripped text of the first ``div`` that follows the label ``div``
        in the document, or None when the page cannot be fetched or does not
        contain the label.
    """
    # A timeout keeps one stalled request from blocking a whole scraping run,
    # and a network failure is reported as "no size" so the run can continue.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f'Could not fetch {url}: {error}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div containing the "Size of downloaded dataset files:" text
    size_label_div = soup.find('div', string='Size of downloaded dataset files:')
    if size_label_div is None:
        return None

    # find_next() returns the next div in document order (not necessarily a sibling)
    size_div = size_label_div.find_next('div')
    if size_div is None:
        return None

    return size_div.get_text(strip=True)


In [24]:
def extract_datasets_attributes(dataset_idx):
    """Build the attribute dictionary for one row of the global ``datasets_df``.

    Parameters
    ----------
    dataset_idx
        Index label of the dataset row in ``datasets_df`` (looked up with ``.loc``).

    Returns
    -------
    dict
        Keys, in insertion order: name, size [GB], languages, licenseToUse,
        domain, uri, fineTuning. The size is scraped from the dataset's Hub
        page (one HTTP request per call); the other values come from the tags.
        Attributes that cannot be extracted are None (or an empty list).

    Notes
    -----
    Reads the globals ``datasets_df`` and ``tag_domain``.
    """
    dataset = datasets_df.loc[dataset_idx]
    dataset_tags = dataset['tags']
    dataset_attributes = dict()

    dataset_attributes['name'] = extract_name(dataset['id'])
    dataset_attributes['size [GB]'] = None

    url = "https://huggingface.co/datasets/" + dataset['id']
    file_size_str = extract_file_size(url)
    if file_size_str:
        file_size_gb = convert_file_size_to_gb(file_size_str)
        # Compare against None: a truthiness test would drop a genuine size of 0.0 GB.
        if file_size_gb is not None:
            dataset_attributes['size [GB]'] = file_size_gb

    dataset_attributes['languages'] = match_language(dataset_tags)

    # dataset_attributes['dataset creator'] = dataset['author'] # TODO: add attribute in our model?

    dataset_attributes['licenseToUse'] = match_license(dataset_tags)

    dataset_attributes['domain'] = [t for t in dataset_tags if t in tag_domain]

    dataset_attributes['uri'] = match_uri(dataset_tags) # TODO: add multiple URIs when available?

    dataset_attributes['fineTuning'] = None

    return dataset_attributes

In [28]:
def dataset_to_json(datasets_df):
    """Extract the attributes of every dataset and dump them to one JSON file.

    Writes ``ChatIMPACT.Dataset.json`` into
    ``<parent of cwd>/database/hf extracted json`` (created when missing) and
    prints the elapsed time every 10 datasets.

    Note: the ``datasets_df`` argument only determines how many rows are
    processed; ``extract_datasets_attributes`` reads the rows from the global
    ``datasets_df`` by position 0..n-1.
    """
    result_path = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    output = []
    row_count = datasets_df.shape[0]

    start_time = time.time()
    dataset_idx = 0
    while dataset_idx < row_count:
        output.append(extract_datasets_attributes(dataset_idx))
        if not dataset_idx % 10:
            print(f'Processed {dataset_idx} datasets, elapsed time: {time.time() - start_time:.2f} seconds')
        dataset_idx += 1

    target_file = os.path.join(result_path, 'ChatIMPACT.Dataset.json')
    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)

In [29]:
# Long-running cell: extracts every dataset row and writes the dataset JSON file, logging progress every 10 rows
dataset_to_json(datasets_df)

Processed 0 datasets, elapsed time: 1.00 seconds
Processed 10 datasets, elapsed time: 9.65 seconds
Processed 20 datasets, elapsed time: 17.75 seconds
Processed 30 datasets, elapsed time: 25.48 seconds
Processed 40 datasets, elapsed time: 33.46 seconds
Processed 50 datasets, elapsed time: 42.07 seconds
Processed 60 datasets, elapsed time: 51.79 seconds
Processed 70 datasets, elapsed time: 58.55 seconds
Processed 80 datasets, elapsed time: 65.34 seconds
Processed 90 datasets, elapsed time: 73.20 seconds
Processed 100 datasets, elapsed time: 81.37 seconds
Processed 110 datasets, elapsed time: 89.41 seconds
Processed 120 datasets, elapsed time: 98.28 seconds
Processed 130 datasets, elapsed time: 105.30 seconds
Processed 140 datasets, elapsed time: 113.13 seconds
Processed 150 datasets, elapsed time: 119.39 seconds
Processed 160 datasets, elapsed time: 125.61 seconds
Processed 170 datasets, elapsed time: 133.69 seconds
Processed 180 datasets, elapsed time: 140.15 seconds
Processed 190 datas

In [33]:
# Read JSON and count available attributes

def datasets_statistics():
    """Report how often each dataset attribute is available in the extracted JSON.

    Reads ``ChatIMPACT.Dataset.json`` from
    ``<parent of cwd>/database/hf extracted json``, prints one count/percentage
    line per attribute and returns a long-format availability table.

    An attribute is available when its value is not None and, for lists, not
    empty. The same rule drives both the printed counts and the table.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, attribute) with columns 'id', 'entity name',
        'attribute name', 'available API', 'available scraping'. The attribute
        set is taken from the keys of the first record. 'size [GB]' is the only
        scraped attribute; every other available attribute is flagged as
        coming from the API. The table is empty when the JSON file holds no
        records.
    """
    # Printed label -> JSON key, in report order.
    report_fields = [
        ('Name', 'name'),
        ('Size [GB]', 'size [GB]'),
        ('Languages', 'languages'),
        ('License to use', 'licenseToUse'),
        ('Domain', 'domain'),
        ('URI', 'uri'),
        ('Fine-tuning', 'fineTuning'),
    ]

    def is_available(value):
        # Lists count only when non-empty; everything else counts unless None.
        if isinstance(value, list):
            return len(value) > 0
        return value is not None

    result_path = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')

    # `with` closes the file handle, which the bare open() never did.
    with open(os.path.join(result_path, 'ChatIMPACT.Dataset.json'), encoding='utf-8') as datasets_json:
        datasets_data_json = json.load(datasets_json)

    # len() instead of the last loop index, so an empty report does not crash.
    total_datasets = len(datasets_data_json)

    print(f'Number of processed datasets: {total_datasets}')
    for label, key in report_fields:
        count = sum(1 for item in datasets_data_json if is_available(item.get(key)))
        share = (count / total_datasets) * 100 if total_datasets else 0.0
        print(f'    {label}: {count} ({share:.2f}%)')

    columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
    datasets_attributes = list(datasets_data_json[0].keys()) if datasets_data_json else []

    # Collect plain records and build the frame once; concatenating one row at a
    # time copies the whole table on every step.
    records = []
    for item in datasets_data_json:
        dataset_name = item.get('name')
        for attr in datasets_attributes:
            value = item.get(attr)
            # The size is the only attribute obtained by scraping; the rest come from the API.
            scraped = attr == 'size [GB]' and value is not None
            via_api = (not scraped) and is_available(value)
            records.append({
                'id': dataset_name,
                'entity name': 'Dataset',
                'attribute name': attr,
                'available API': via_api,
                'available scraping': scraped,
            })

    return pd.DataFrame(records, columns=columns)


In [34]:
# Print the per-attribute report for the extracted datasets.
# NOTE(review): this rebinds `availability`, replacing the models table assigned earlier under the same name.
availability = datasets_statistics()

Number of processed datasets: 1000
    Name: 1000 (100.00%)
    Size [GB]: 525 (52.50%)
    Languages: 817 (81.70%)
    License to use: 784 (78.40%)
    Domain: 2 (0.20%)
    URI: 323 (32.30%)
    Fine-tuning: 0 (0.00%)


In [35]:
# Preview the first 20 (dataset, attribute) rows of the availability table
availability.head(20)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,acronym_identification,Dataset,name,True,False
1,acronym_identification,Dataset,size [GB],False,True
2,acronym_identification,Dataset,languages,True,False
3,acronym_identification,Dataset,licenseToUse,True,False
4,acronym_identification,Dataset,domain,False,False
5,acronym_identification,Dataset,uri,True,False
6,acronym_identification,Dataset,fineTuning,False,False
7,ade_corpus_v2,Dataset,name,True,False
8,ade_corpus_v2,Dataset,size [GB],False,True
9,ade_corpus_v2,Dataset,languages,True,False


## Downstream Task

In [69]:
def fetch_and_extract_text(url):
    """Fetch a page and return the text of its description paragraph.

    The paragraph is the first ``<p>`` whose class is
    ``text-[1.2rem] text-gray-500``.

    Always returns a string: the stripped paragraph text on success, otherwise
    a human-readable message (non-200 status, or paragraph not found). Failure
    is therefore signalled in-band, so callers cannot detect it with a
    ``None`` check.
    """
    response = requests.get(url)

    if response.status_code != 200:
        return f"Failed to fetch the webpage. Status code: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')
    target_paragraph = page.find('p', class_='text-[1.2rem] text-gray-500')

    if not target_paragraph:
        return "Target paragraph not found."

    return target_paragraph.get_text().strip()

def create_tasks_json():
    """Scrape a description for every downstream task and dump them to JSON.

    For each entry of ``TAG_DOWNSTREAM_TASK`` the page
    ``https://huggingface.co/tasks/<task>`` is fetched (one HTTP request per
    task) and a record with 'name', 'description' and an empty 'sub-task' list
    is built. The records are written to ``ChatIMPACT.DownstreamTask.json`` in
    ``<parent of cwd>/database/hf extracted json`` (created when missing).

    Note: 'description' holds whatever ``fetch_and_extract_text`` returns,
    including its failure messages.
    """
    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')

    os.makedirs(result_path, exist_ok=True)

    tasks_data = []

    for task in TAG_DOWNSTREAM_TASK:
        url = f"https://huggingface.co/tasks/{task}"
        description = fetch_and_extract_text(url)

        tasks_data.append({
            "name": task,
            "description": description, # TODO: text2text generation has no description
            "sub-task": []
        })

        print(f"Processed: {task}")
        # time.sleep(0.5)  # Be polite to the server

    # Build the path with os.path.join, like every other reader/writer of this directory.
    output_file = os.path.join(result_path, 'ChatIMPACT.DownstreamTask.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(tasks_data, f, ensure_ascii=False, indent=2)

In [70]:
# create_tasks_json()

In [71]:
# Read JSON and count available attributes

def tasks_statistics():
    """Report which DownstreamTask attributes are filled in the extracted JSON.

    Reads `ChatIMPACT.DownstreamTask.json` from
    `<parent of the current working directory>/database/hf extracted json`,
    prints how many records have a non-null `name`, a non-null
    `description` and a non-empty `sub-task` list, and builds a long-format
    availability table.

    Returns:
        pandas.DataFrame: One row per (task, attribute) with the columns
        `id` (the task name), `entity name` (always 'DownstreamTask'),
        `attribute name`, `available API` (always False, since this function
        never marks an attribute as API-provided) and `available scraping`
        (True when the value is not None and, for lists, not empty).
        The attribute set is taken from the keys of the first record. An
        empty JSON array yields an empty table with the same columns.
    """
    columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']

    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')

    # The file is written as UTF-8 with ensure_ascii=False, so read it back
    # with the same encoding; the context manager also closes the handle.
    with open(os.path.join(result_path, 'ChatIMPACT.DownstreamTask.json'), encoding='utf-8') as task_json:
        task_data_json = json.load(task_json)

    task_count = len(task_data_json)
    name_count = sum(1 for item in task_data_json if item['name'] is not None)
    description_count = sum(1 for item in task_data_json if item['description'] is not None)
    sub_task_count = sum(1 for item in task_data_json if len(item['sub-task']) > 0)

    def percentage(count):
        # Avoid ZeroDivisionError when the file holds no records.
        return (count / task_count) * 100 if task_count else 0.0

    print(f'Number of processed task: {task_count}')
    print(f'    Name: {name_count} ({percentage(name_count):.2f}%)')
    print(f'    Description: {description_count} ({percentage(description_count):.2f}%)')
    print(f'    Sub-task: {sub_task_count} ({percentage(sub_task_count):.2f}%)')

    if not task_data_json:
        return pd.DataFrame(columns=columns)

    task_attributes = list(task_data_json[0].keys())

    # Collect plain dicts and build the frame once; concatenating inside the
    # loop copies the whole table on every row.
    records = []
    for item in task_data_json:
        task_name = item['name']
        for attr in task_attributes:
            value = item[attr]
            if isinstance(value, list):
                scraped = len(value) > 0
            else:
                scraped = value is not None
            records.append({
                'id': task_name,
                'entity name': 'DownstreamTask',
                'attribute name': attr,
                'available API': False,
                'available scraping': scraped,
            })

    return pd.DataFrame(records, columns=columns)


In [31]:
# Build the DownstreamTask availability table from ChatIMPACT.DownstreamTask.json
# (the file create_tasks_json() writes); the summary counts are printed as a side effect.
availability = tasks_statistics()

Number of processed task: 11
    Name: 11 (100.00%)
    Description: 11 (100.00%)
    Sub-task: 0 (0.00%)


In [32]:
# 33 rows = the 11 tasks reported above x 3 attributes (name, description, sub-task).
availability.head(33)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,text-classification,DownstreamTask,name,False,True
1,text-classification,DownstreamTask,description,False,True
2,text-classification,DownstreamTask,sub-task,False,False
3,token-classification,DownstreamTask,name,False,True
4,token-classification,DownstreamTask,description,False,True
5,token-classification,DownstreamTask,sub-task,False,False
6,table-question-answering,DownstreamTask,name,False,True
7,table-question-answering,DownstreamTask,description,False,True
8,table-question-answering,DownstreamTask,sub-task,False,False
9,question-answering,DownstreamTask,name,False,True


# Metrics

In [120]:
# Scrape metrics and descriptions from HF

url_metrics = 'https://huggingface.co/metrics'

# Bound the wait and fail loudly on an HTTP error, so an error page is never
# parsed as if it were the metrics listing.
response = requests.get(url_metrics, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

# Metric names are taken from the text of every <h4> element on the page.
h4_tags = soup.find_all('h4')
metrics = [h4_tag.get_text(strip=True) for h4_tag in h4_tags]
# print(metrics)

# Descriptions are taken from every <p> element.
p_tags = soup.find_all('p')
descriptions = [p_tag.get_text() for p_tag in p_tags]
descriptions = descriptions[2:] # drop first lines (presumably page text that is not a metric description; verify against the live page)
# print(descriptions)

# remove from the list the metrics without description (not useful for our purpose)
# list.remove raises ValueError if a name is no longer on the page, which signals
# that this hard-coded list must be revised.
metrics_without_description = (
    'AlhitawiMohammed22/CER_Hu-Evaluation-Metrics',
    'Aye10032/loss_metric',
    'giulio98/code_eval_outputs',
    'maysonma/lingo_judge_metric',
    'lvwerra/test',
    'sma2023/wil',
)
for metric_name in metrics_without_description:
    metrics.remove(metric_name)


# Both lists are paired by position from here on, so they must line up.
assert len(metrics) == len(descriptions)
print(len(metrics))

265


In [121]:
# Drop every metric whose description still contains the placeholder
# 'TODO: add a description here', together with that description, so the two
# lists stay aligned index by index.
# The pairs are filtered into new sequences instead of calling pop() while
# iterating with enumerate(): popping shifts the remaining items left, so the
# element right after each removal was never inspected and back-to-back
# placeholders were only partly removed.
kept_pairs = [
    (metric, description)
    for metric, description in zip(metrics, descriptions)
    if 'TODO: add a description here' not in description
]
# Slice assignment keeps the same list objects, as the in-place pop() did.
metrics[:] = [metric for metric, _ in kept_pairs]
descriptions[:] = [description for _, description in kept_pairs]

assert len(metrics) == len(descriptions)
print(len(metrics))

218


In [126]:
def create_metrics_json(metrics, descriptions):
    """Write one JSON record per metric to `ChatIMPACT.Metric.json`.

    Args:
        metrics: Sequence of metric names.
        descriptions: Sequence of descriptions, read at the same positions
            as `metrics`.

    Each record has the keys `name`, `description`, `context`,
    `featureBased/endToEnd` and `granularity`; the last three are always
    None. The file is placed in
    `<parent of the current working directory>/database/hf extracted json`,
    which is created when it does not exist yet.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    # Index-based access on purpose: a shorter `descriptions` raises IndexError.
    metrics_data = [
        {
            'name': metrics[position],
            'description': descriptions[position],
            'context': None,
            'featureBased/endToEnd': None,
            'granularity': None,
        }
        for position in range(len(metrics))
    ]

    output_file = os.path.join(output_dir, 'ChatIMPACT.Metric.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(metrics_data, handle, indent=4)

In [127]:
# Persist the scraped metric names and descriptions to ChatIMPACT.Metric.json.
create_metrics_json(metrics, descriptions)

In [37]:
# Read JSON and count available attributes

def metric_statistics():
    """Report which Metric attributes are filled in the extracted JSON.

    Reads `ChatIMPACT.Metric.json` from
    `<parent of the current working directory>/database/hf extracted json`,
    prints how many records have a non-null value for each of `name`,
    `description`, `context`, `featureBased/endToEnd` and `granularity`,
    and builds a long-format availability table.

    Returns:
        pandas.DataFrame: One row per (metric, attribute) with the columns
        `id` (the metric name), `entity name` (always 'Metric'),
        `attribute name`, `available API` (always False, since this function
        never marks an attribute as API-provided) and `available scraping`
        (True when the value is not None and, for lists, not empty).
        The attribute set is taken from the keys of the first record. An
        empty JSON array yields an empty table with the same columns.
    """
    columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']

    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')

    # Context manager closes the handle; the file is written as UTF-8.
    with open(os.path.join(result_path, 'ChatIMPACT.Metric.json'), encoding='utf-8') as metric_json:
        metric_data_json = json.load(metric_json)

    total_metrics = len(metric_data_json)

    def count_filled(key):
        return sum(1 for item in metric_data_json if item[key] is not None)

    def percentage(count):
        # Avoid ZeroDivisionError when the file holds no records.
        return (count / total_metrics) * 100 if total_metrics else 0.0

    name_count = count_filled('name')
    description_count = count_filled('description')
    context_count = count_filled('context')
    featureBased_endToEnd_count = count_filled('featureBased/endToEnd')
    granularity_count = count_filled('granularity')

    # The label used to say "datasets", a leftover from the dataset section.
    print(f'Number of processed metrics: {total_metrics}')
    print(f'    Name: {name_count} ({percentage(name_count):.2f}%)')
    print(f'    Description: {description_count} ({percentage(description_count):.2f}%)')
    print(f'    Context: {context_count} ({percentage(context_count):.2f}%)')
    print(f'    FeatureBased/endToEnd: {featureBased_endToEnd_count} ({percentage(featureBased_endToEnd_count):.2f}%)')
    print(f'    Granularity: {granularity_count} ({percentage(granularity_count):.2f}%)')

    if not metric_data_json:
        return pd.DataFrame(columns=columns)

    metric_attributes = list(metric_data_json[0].keys())

    # Collect plain dicts and build the frame once; concatenating inside the
    # loop copies the whole table on every row.
    records = []
    for item in metric_data_json:
        metric_name = item['name']
        for attr in metric_attributes:
            value = item[attr]
            if isinstance(value, list):
                scraped = len(value) > 0
            else:
                scraped = value is not None
            records.append({
                'id': metric_name,
                'entity name': 'Metric',
                'attribute name': attr,
                'available API': False,
                'available scraping': scraped,
            })

    return pd.DataFrame(records, columns=columns)


In [38]:
# Rebinds `availability` to the Metric table built from ChatIMPACT.Metric.json,
# replacing whatever table an earlier cell left in that name.
availability = metric_statistics()

Number of processed datasets: 218
    Name: 218 (100.00%)
    Description: 218 (100.00%)
    Context: 0 (0.00%)
    FeatureBased/endToEnd: 0 (0.00%)
    Granularity: 0 (0.00%)


In [68]:
# Preview of the Metric availability table; each metric contributes five attribute rows.
availability.head(10)

Unnamed: 0,id,entity name,attribute name,available API,available scraping
0,accuracy,Metric,name,False,True
1,accuracy,Metric,description,False,True
2,accuracy,Metric,context,False,False
3,accuracy,Metric,featureBased/endToEnd,False,False
4,accuracy,Metric,granularity,False,False
5,bertscore,Metric,name,False,True
6,bertscore,Metric,description,False,True
7,bertscore,Metric,context,False,False
8,bertscore,Metric,featureBased/endToEnd,False,False
9,bertscore,Metric,granularity,False,False


# Train relationship

In [54]:
def extract_train_relationship():
    """Collect model-to-dataset links from the tags stored in `models_df`.

    Rows are addressed by the labels 0 .. len(models_df) - 1. For each row
    the `tags` value is passed to `match_dataset`; rows for which it returns
    something falsy are skipped.

    Returns:
        list[dict]: One dict per remaining model with the keys `Models`
        (result of `extract_name` on the model id) and `Datasets` (the value
        returned by `match_dataset`).
    """
    train_relationship_list = []

    for model_idx in range(len(models_df)):
        model = models_df.loc[model_idx]
        datasets = match_dataset(model['tags'])

        # Only models with at least one matched dataset produce a record.
        if datasets:
            train_relationship_list.append({
                'Models': extract_name(model['id']),  # {"$oid": <...>} for MongoDB
                'Datasets': datasets,  # [{"$oid": <...>}, ..., {"$oid": <...>}] for MongoDB
            })

    return train_relationship_list

In [55]:
def create_train_relationship_json():
    """Dump the result of `extract_train_relationship` to a JSON file.

    The file `ChatIMPACT.TrainRelationship.json` is written, indented by four
    spaces, into `<parent of the current working directory>/database/hf extracted json`;
    the directory is created first when it is missing.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    records = extract_train_relationship()

    output_file = os.path.join(output_dir, 'ChatIMPACT.TrainRelationship.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4)

In [56]:
# Writes ChatIMPACT.TrainRelationship.json; reads `models_df`, so the LLM section must have run first.
create_train_relationship_json()

# SuitedFor relationship

In [57]:
def extract_suited_for_relationship():
    """Collect model-to-task links from the tags stored in `models_df`.

    Rows are addressed by the labels 0 .. len(models_df) - 1. A tag counts as
    a task when it is contained in `TAG_DOWNSTREAM_TASK`; models without any
    such tag are skipped.

    Returns:
        list[dict]: One dict per remaining model with the keys
        `LargeLanguageModel` (result of `extract_name` on the model id) and
        `DownstreamTask` (the matching tags, in their original order).
    """
    suited_for_relationship_list = []

    for model_idx in range(len(models_df)):
        model = models_df.loc[model_idx]
        tasks = [tag for tag in model['tags'] if tag in TAG_DOWNSTREAM_TASK]

        # Only models tagged with at least one known task produce a record.
        if tasks:
            suited_for_relationship_list.append({
                'LargeLanguageModel': extract_name(model['id']),  # {"$oid": <...>} for MongoDB
                'DownstreamTask': tasks,  # [{"$oid": <...>}, ..., {"$oid": <...>}] for MongoDB
            })

    return suited_for_relationship_list

In [58]:
def create_suited_for_relationship_json():
    """Dump the result of `extract_suited_for_relationship` to a JSON file.

    The file `ChatIMPACT.SuitedForRelationship.json` is written, indented by
    four spaces, into `<parent of the current working directory>/database/hf extracted json`;
    the directory is created first when it is missing.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    records = extract_suited_for_relationship()

    output_file = os.path.join(output_dir, 'ChatIMPACT.SuitedForRelationship.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4)

In [59]:
# Writes ChatIMPACT.SuitedForRelationship.json; reads `models_df`, so the LLM section must have run first.
create_suited_for_relationship_json()

# Enable relationship

In [60]:
def match_tasks(entries):
    """Apply `find_all_matches` to `entries` with the task-category pattern.

    The pattern captures the run of non-whitespace characters that follows
    the literal prefix `task_categories:`. What is returned is whatever
    `find_all_matches` produces for that pattern.
    """
    task_category_pattern = r'task_categories:(\S+)'
    return find_all_matches(entries, task_category_pattern)

In [61]:
def extract_enable_relationship():
    """Collect dataset-to-task links from the tags stored in `datasets_df`.

    Rows are addressed by the labels 0 .. len(datasets_df) - 1. For each row
    the `tags` value is passed to `match_tasks`; rows for which it returns
    something falsy are skipped.

    Returns:
        list[dict]: One dict per remaining dataset with the keys `Dataset`
        (result of `extract_name` on the dataset id) and `DownstreamTask`
        (the value returned by `match_tasks`).
    """
    enable_relationship_list = []

    for dataset_idx in range(len(datasets_df)):
        dataset = datasets_df.loc[dataset_idx]
        tasks = match_tasks(dataset['tags'])

        # Only datasets with at least one matched task produce a record.
        if tasks:
            enable_relationship_list.append({
                'Dataset': extract_name(dataset['id']),  # {"$oid": <...>} for MongoDB
                'DownstreamTask': tasks,  # [{"$oid": <...>}, ..., {"$oid": <...>}] for MongoDB
            })

    return enable_relationship_list

In [62]:
def create_enable_relationship_json():
    """Dump the result of `extract_enable_relationship` to a JSON file.

    The file `ChatIMPACT.EnableRelationship.json` is written, indented by
    four spaces, into `<parent of the current working directory>/database/hf extracted json`;
    the directory is created first when it is missing.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    records = extract_enable_relationship()

    output_file = os.path.join(output_dir, 'ChatIMPACT.EnableRelationship.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4)

In [66]:
# Writes ChatIMPACT.EnableRelationship.json; reads `datasets_df`, which an earlier cell must have defined.
create_enable_relationship_json()

# Assess relationship

In [4]:
# TODO: here https://huggingface.co/tasks some tasks have associated metrics, we could scrape the tasks one by one

def extract_assess_relationship(timeout=30):
    """Scrape, for every downstream task, the metric names listed on its page.

    For each entry of `TAG_DOWNSTREAM_TASK` the page
    `https://huggingface.co/tasks/<task>` is downloaded and the text of the
    `<dt>` inside every `<dl>` matched by the class string
    'flex items-center rounded-lg border border-gray-100' is collected.

    Args:
        timeout: Seconds to wait for each page before `requests` raises a
            timeout error, so a stalled connection cannot hang the run.

    Returns:
        list[dict]: One dict per successfully fetched task with the keys
        `Metric` (list of metric names, possibly empty) and `DownstreamTask`
        (the task tag). A task whose page does not answer with status 200 is
        reported on stdout and left out; the tasks collected before and
        after it are kept, so the function never returns None.
    """
    assess = []
    for task in TAG_DOWNSTREAM_TASK:
        print(f"Processing task: {task}\n")
        url = f"https://huggingface.co/tasks/{task}"
        # Fetch the webpage
        response = requests.get(url, timeout=timeout)

        # A failed page only skips this task. A bare `return` here would
        # discard every task gathered so far and make the caller dump `null`.
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            continue

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all the <dl> elements
        dl_elements = soup.find_all('dl', class_='flex items-center rounded-lg border border-gray-100')

        metric_names = []
        for dl in dl_elements:
            # A <dl> without a <dt> carries no metric name; skip it instead
            # of failing on None.
            dt = dl.find('dt')
            if dt is not None:
                metric_names.append(dt.get_text(strip=True))

        assess.append({'Metric': metric_names, 'DownstreamTask': task})
    return assess


In [5]:
def create_asess_relationship_json():
    """Dump the result of `extract_assess_relationship` to a JSON file.

    The file `ChatIMPACT.AssessRelationship.json` is written, indented by
    four spaces, into `<parent of the current working directory>/database/hf extracted json`;
    the directory is created first when it is missing.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    records = extract_assess_relationship()

    output_file = os.path.join(output_dir, 'ChatIMPACT.AssessRelationship.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4)

In [6]:
# Fetches one huggingface.co/tasks/<task> page per entry of TAG_DOWNSTREAM_TASK
# and writes the result to ChatIMPACT.AssessRelationship.json.
create_asess_relationship_json()

Processing task: text-classification

Processing task: token-classification

Processing task: table-question-answering

Processing task: question-answering

Processing task: zero-shot-classification

Processing task: translation

Processing task: summarization

Processing task: feature-extraction

Processing task: text-generation

Processing task: fill-mask

Processing task: sentence-similarity



# Evaluate relationship

In [70]:
# TODO: re-run and compare the output with the model cards on HF to confirm they agree.
# Model card template: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1

def extract_evaluate_relationship():
    """Collect model-to-metric links from the card data of each model.

    Rows of `models_df` are addressed by the labels 0 .. len(models_df) - 1.
    For each row the repository is fetched from the Hub by its exact id and
    the `metrics` entry of its card data is read.

    Returns:
        list[dict]: One dict per model that declares at least one metric,
        with the keys `LargeLanguageModel` (result of `extract_name` on the
        model id) and `Metric` (the `metrics` value of the card data).
        Models without card data are reported on stdout and skipped; models
        whose card data has no or empty `metrics` are skipped silently.
    """
    evaluate_relationship_list = []

    for model_idx in range(len(models_df)):

        model = models_df.loc[model_idx]

        # Fetch this exact repository. `list_models(model_name=...)` is a name
        # search, so taking its first hit could return a different model.
        card_data = api.model_info(model['id']).card_data

        # Handle missing card data explicitly per model, so a value from a
        # previous iteration can never be reused for this one.
        if card_data is None:
            print('No card data available for this model')
            continue

        metrics = card_data.to_dict().get('metrics')
        if not metrics:
            continue

        evaluate_relationship_list.append({
            'LargeLanguageModel': extract_name(model['id']),
            'Metric': metrics,
        })

    return evaluate_relationship_list

In [71]:
def create_evaluate_relationship_json():
    """Dump the result of `extract_evaluate_relationship` to a JSON file.

    The file `ChatIMPACT.EvaluateRelationship.json` is written, indented by
    four spaces, into `<parent of the current working directory>/database/hf extracted json`;
    the directory is created first when it is missing.
    """
    output_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
    os.makedirs(output_dir, exist_ok=True)

    records = extract_evaluate_relationship()

    output_file = os.path.join(output_dir, 'ChatIMPACT.EvaluateRelationship.json')
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4)

In [None]:
# Writes ChatIMPACT.EvaluateRelationship.json; issues one Hub API call per row of
# `models_df`, so this cell can take a while.
create_evaluate_relationship_json()