# Data extraction

In [1]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup
import time
import json
import os
from huggingface_hub.utils import logging

from tags import * # tags.py
logging.set_verbosity_error()

In [2]:
api = HfApi()

## Utils

In [3]:
# Scrape languages from HF

url_languages = 'https://huggingface.co/languages'

response = requests.get(url_languages)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]

tag_language.remove('jax') # 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?

tag_language = set(tag_language)

In [4]:
# Pattern matching functions

def extract_name(full_name):
    pattern = re.compile(r'[^/]+/(.+)')
    match = re.search(pattern, full_name)
    if match:
        return match.group(1) # the part after '/' might also contain version and number of parameters (impossible to extract in a uniform way)
    else:
        return full_name

def match_string(entries, pattern):
    pattern = re.compile(pattern)
    for entry in entries:
        match = pattern.match(entry)
        if match:
            return match.group(1)
    return None

def find_all_matches(entries, pattern):
    pattern = re.compile(pattern)
    matches = []
    for entry in entries:
        match = pattern.match(entry)
        if match:
            matches.append(match.group(1))
    return matches

def match_license(entries):
    return match_string(entries, r'license:(\S+)')

def match_dataset(entries):
    return find_all_matches(entries, r'dataset:(\S+)')

def match_uri(entries):
    uri = match_string(entries, r'arxiv:(\S+)') # TODO: use DOI instead of arXiv?
    if uri is None:
        uri = match_string(entries, r'doi:(\S+)')
    return uri

def match_language(entries):
    return find_all_matches(entries, r'language:(\S+)')

def match_size(entries):
    return match_string(entries, r'size_categories:(\S+)')

def match_tasks(entries):
    return find_all_matches(entries, r'task_categories:(\S+)')

In [5]:
def add_to_json_file(data, file_path):

    if os.path.exists(file_path):
        with open(file_path, 'r+', encoding='utf-8') as f:

            f.seek(0, os.SEEK_END)
            f.seek(f.tell() - 1, os.SEEK_SET)
            f.truncate()
            f.write(',\n')
            json.dump(data, f, indent=4)
            f.write(']')
    else:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump([data], f, indent=4)

In [3]:
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
result_path = os.path.join(parent_path, 'database', 'HF entries', 'hf extracted json')
os.makedirs(result_path, exist_ok=True)

## LLMs

In [8]:
# Get all models

# models = api.list_models(full=True)

In [None]:
# Process the first 1000 models

# model = itertools.islice(models, 0, 1000)
# models_df = pd.DataFrame(model)
# models_df.head(10)

In [10]:
def extract_model_attributes(model):

	model_tags = model.tags
	if model.card_data is not None:
		model_card_data = model.card_data.to_dict()
	else:
		model_card_data = None
	model_attributes = dict()

	model_attributes['name'] = extract_name(model.id)
	model_attributes['id'] = model.id
	model_attributes['version'] = None # sometimes in model['id'] but impossible to extract in a uniform way
	model_attributes['numberOfParameters'] = None # sometimes in model['id'] or model description but impossible to extract in a uniform way

	model_attributes['quantization'] = None
	for t in model_tags:
		if t in tag_quantization:
			model_attributes['quantization'] = t

	model_attributes['architecture'] = None
	try:
		if model_card_data is not None:
			model_attributes['architecture'] = model_card_data['base_model']
	except KeyError:
		pass

	model_attributes['languages'] = []
	for t in model_tags:
		if t in tag_language:
			model_attributes['languages'].append(t)

	model_attributes['modelCreator'] = None # extracted in a postprocessing step

	model_attributes['licenseToUse'] = match_license(model_tags)

	model_attributes['libraryFramework'] = [] 
	for t in model_tags:
		if t in tag_library:
			model_attributes['libraryFramework'].append(t)

	model_attributes['contextLength'] = None
	model_attributes['developers'] = [model.author]
	model_attributes['openSource'] = True

	model_attributes['uri'] = match_uri(model_tags)

	model_attributes['fineTuned'] = None # if there is a 'base_model' in card_data, it is fine-tuned
	try:
		if model_card_data is not None:
			if 'base_model' in model_card_data:
				model_attributes['fineTuned'] = True
	except KeyError:
		pass

	model_attributes['carbonEmission [CO2eq tons]'] = None
	try:
		if model_card_data is not None:
			model_attributes['carbonEmission [CO2eq tons]'] = model_card_data['co2_eq_emissions']
	except KeyError:
		pass

	model_attributes['tokenizer'] = None
	model_attributes['likes'] = model.likes
	model_attributes['downloads'] = model.downloads

	return model_attributes

In [11]:
file_path = os.path.join(result_path, 'models_data_no_modelCreator.json')

# Total: 697,162 models
count = 0
start_time = time.time()
for task in TAG_DOWNSTREAM_TASK:
    print(f'Processing {task} models...')
    models = api.list_models(filter=task, full=True, cardData=True)
    for model in models:
        model_attributes = extract_model_attributes(model)
        add_to_json_file(model_attributes, file_path)
        count += 1
        if count % 1000 == 0:
            print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

Processing token-classification models...
1000 models processed, 1.0878269672393799 seconds elapsed, estimated time remaining: 757.3165774698257 seconds
2000 models processed, 2.522703170776367 seconds elapsed, estimated time remaining: 876.8450167179108 seconds
3000 models processed, 3.8365001678466797 seconds elapsed, estimated time remaining: 887.7182051753997 seconds
4000 models processed, 5.370304107666016 seconds elapsed, estimated time remaining: 930.623386335969 seconds
5000 models processed, 6.618153095245361 seconds elapsed, estimated time remaining: 916.16803772192 seconds
6000 models processed, 7.742092132568359 seconds elapsed, estimated time remaining: 891.8405297079087 seconds
7000 models processed, 8.940644264221191 seconds elapsed, estimated time remaining: 881.4991540736471 seconds
8000 models processed, 10.21143102645874 seconds elapsed, estimated time remaining: 879.66654563421 seconds
9000 models processed, 11.39786410331726 seconds elapsed, estimated time remainin

In [4]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'models_data_no_modelCreator.json')

with open(file_path, 'r') as file:
    data = json.load(file)
models_df = pd.DataFrame(data)

In [5]:
# Remove duplicates
print(f'len before removing duplicates: {  len(models_df) }')
models_df = models_df.loc[models_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {  len(models_df) }')

len before removing duplicates: 277497
len after removing duplicates: 269335


In [6]:
# Postprocessing: find the modelCreator

df_filtered = models_df[models_df['architecture'].notna()]

# Process each row
count = 0
start_time = time.time()
for index, row in df_filtered.iterrows():
    # Find the row where 'id' matches the 'architecture' of the current row
    try:
        matching_row = models_df[models_df['id'].astype(str) == str(row['architecture'])]
    except ValueError:
        break
    
    if not matching_row.empty:
        # Get the first developer from the 'developers' list
        first_developer = matching_row['developers'].iloc[0][0] if matching_row['developers'].iloc[0] else None
        # Set the 'modelCreator' attribute of the original row
        models_df.at[index, 'modelCreator'] = first_developer
    
    count += 1
    if count % 1000 == 0:
        print(f'{count} rows processed ({count/len(df_filtered)*100} %), elapsed time: {time.time() - start_time} seconds, estimated time remaining: {(time.time() - start_time) / count * (len(df_filtered) - count)} seconds')
    
        

1000 rows processed (1.8459380134015098 %), elapsed time: 23.079020977020264 seconds, estimated time remaining: 1227.1808838305474 seconds
2000 rows processed (3.6918760268030195 %), elapsed time: 46.008147954940796 seconds, estimated time remaining: 1200.191576504588 seconds
3000 rows processed (5.5378140402045295 %), elapsed time: 69.48612809181213 seconds, estimated time remaining: 1185.2715322297415 seconds
4000 rows processed (7.383752053606039 %), elapsed time: 92.65146589279175 seconds, estimated time remaining: 1162.1505115219354 seconds
5000 rows processed (9.22969006700755 %), elapsed time: 115.6754059791565 seconds, estimated time remaining: 1137.621357021618 seconds
6000 rows processed (11.075628080409059 %), elapsed time: 138.77773475646973 seconds, estimated time remaining: 1114.223312308351 seconds
7000 rows processed (12.921566093810569 %), elapsed time: 162.82808685302734 seconds, estimated time remaining: 1097.2985174701214 seconds
8000 rows processed (14.767504107212

In [7]:
models_list = models_df.drop(columns=['id']).to_dict(orient='records')

In [8]:
file_path_postprocessed = os.path.join(result_path, 'models_data.json')

with open(file_path_postprocessed, "w") as json_file:
    json.dump(models_list, json_file, indent=4)

## Dataset

In [9]:
# Get all datasets

# datasets = api.list_datasets(full=True)

In [10]:
# Process the first 1000 models

# datasets = list(itertools.islice(datasets, 0, 1000))
# datasets_df = pd.DataFrame(datasets)
# datasets_df.head(10)

In [13]:
def convert_file_size_to_gb(file_size_str):
    """
    Convert the file size string (e.g., '74.6 kB') to gigabytes (GB).
    """
    file_size_parts = file_size_str.split()
    file_size = float(file_size_parts[0])
    unit = file_size_parts[1]

    conversion_factors = {
        'B': 1 / (1024 ** 3),
        'kB': 1 / (1024 ** 2),
        'MB': 1 / 1024,
        'GB': 1,
        'TB': 1024,
    }

    if unit in conversion_factors:
        return float(file_size * conversion_factors[unit])
    else:
        return None

def extract_file_size(url):
    # Fetch the HTML content from the provided URL
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the div containing the "Size of downloaded dataset files:" text
    size_label_div = soup.find('div', string='Size of downloaded dataset files:')

    if size_label_div:
        # Find the next sibling div containing the file size
        size_div = size_label_div.find_next('div')
        if size_div:
            # Extract the file size text
            file_size = size_div.get_text(strip=True)
            return file_size
    else:
        return None

In [14]:
def extract_datasets_attributes(dataset):

	dataset_tags = dataset.tags
	dataset_attributes = dict()

	dataset_attributes['name'] = extract_name(dataset.id)
	dataset_attributes['size [GB]'] = match_size(dataset_tags)

	# url = "https://huggingface.co/datasets/" + dataset.id
	# file_size_str = extract_file_size(url)
	# if file_size_str:
	# 	file_size_gb = convert_file_size_to_gb(file_size_str)
	# 	if file_size_gb:
	# 		dataset_attributes['size [GB]'] = file_size_gb

	dataset_attributes['languages'] = match_language(dataset_tags)

	# dataset_attributes['dataset creator'] = dataset['author'] # TODO: add attribute in our model?

	dataset_attributes['licenseToUse'] = match_license(dataset_tags)

	dataset_attributes['domain'] = []
	for t in dataset_tags:
		if t in tag_domain:
			dataset_attributes['domain'].append(t)

	dataset_attributes['uri'] = match_uri(dataset_tags)

	dataset_attributes['fineTuning'] = None

	return dataset_attributes

In [14]:
file_path = os.path.join(result_path, 'datasets_data.json')

count = 0
start_time = time.time()
for task in TAG_DOWNSTREAM_TASK:
    print(f'Processing {task} datasets...')
    datasets = api.list_datasets(filter=task, full=True)
    for dataset in datasets:
        dataset_attributes = extract_datasets_attributes(dataset)
        add_to_json_file(dataset_attributes, file_path)
        count += 1
        if count % 1000 == 0:
            print(f'{count} datasets processed, {time.time() - start_time} seconds elapsed, estimated time remaining: {(time.time() - start_time) / count * (199642 - count):.2f} seconds')

Processing sentence-similarity datasets...
Processing summarization datasets...
Processing text-classification datasets...
Processing question-answering datasets...
Processing feature-extraction datasets...
Processing zero-shot-classification datasets...
Processing token-classification datasets...
Processing text-generation datasets...
Processing translation datasets...
Processing fill-mask datasets...
Processing table-question-answering datasets...


In [25]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'nlp_datasets_data.json')

with open(file_path, 'r') as file:
    data = json.load(file)
datasets_df = pd.DataFrame(data)

In [26]:
# Remove duplicates
print(f'len before removing duplicates: {  len(datasets_df) }')
datasets_df = datasets_df.loc[datasets_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {  len(datasets_df) }')

len before removing duplicates: 773
len after removing duplicates: 489


In [27]:
datasets_list = datasets_df.to_dict(orient='records')

In [28]:
file_path_postprocessed = os.path.join(result_path, 'nlp_datasets_no_duplicates.json')

with open(file_path_postprocessed, "w") as json_file:
    json.dump(datasets_list, json_file, indent=4)

## Downstream Task

In [17]:
def fetch_and_extract_text(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        target_paragraph = soup.find('p', class_='text-[1.2rem] text-gray-500')
        
        if target_paragraph:
            return target_paragraph.get_text().strip()
        else:
            return "Target paragraph not found."
    else:
        return f"Failed to fetch the webpage. Status code: {response.status_code}"

def create_tasks_json():

    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    tasks_data = []

    for task in TAG_DOWNSTREAM_TASK:
        url = f"https://huggingface.co/tasks/{task}"
        description = fetch_and_extract_text(url)
        
        tasks_data.append({
            "name": task,
            "description": description, # TODO: text2text generation has no description
            "sub-task": []
        })
        
        print(f"Processed: {task}")
        # time.sleep(0.5)  # Be polite to the server

    with open(result_path + '/ChatIMPACT.DownstreamTask.json', 'w', encoding='utf-8') as f:
        json.dump(tasks_data, f, ensure_ascii=False, indent=2)

In [None]:
create_tasks_json()

# Metrics

In [19]:
# Scrape metrics and descriptions from HF

def scrape_metrics():
	url_metrics = 'https://huggingface.co/metrics'

	response = requests.get(url_metrics)
	html_content = response.text

	soup = BeautifulSoup(html_content, 'html.parser')

	h4_tags = soup.find_all('h4')
	metrics = [h4_tag.get_text(strip=True) for h4_tag in h4_tags]
	# print(metrics)

	p_tags = soup.find_all('p')
	descriptions = [p_tag.get_text() for p_tag in p_tags]
	descriptions = descriptions[2:] # drop first lines
	# print(descriptions)

	# remove from the list the metrics withoud description (not useful for our purpose)
	metrics.remove('AlhitawiMohammed22/CER_Hu-Evaluation-Metrics')
	metrics.remove('Aye10032/loss_metric')
	metrics.remove('giulio98/code_eval_outputs')
	metrics.remove('maysonma/lingo_judge_metric')
	metrics.remove('lvwerra/test')
	metrics.remove('sma2023/wil')

	# From the lists, remove the descriptions and then the relative metric in the same index that have in the description 'TODO: add a description here\n\t\t\t\t\t\t'ArithmeticError

	for i, description in enumerate(descriptions):
		if 'TODO: add a description here' in description:
			metrics.pop(i)
			descriptions.pop(i)
	
	return metrics, descriptions

In [16]:
def create_metrics_json(metrics, descriptions):

    metrics_data = []

    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    metrics, descriptions = scrape_metrics()
    
    for idx in range(len(metrics)):
        metric_attributes = dict()

        metric_attributes['name'] = metrics[idx]
        metric_attributes['description'] = descriptions[idx]
        metric_attributes['trained'] = None
        metric_attributes['context'] = None
        metric_attributes['featureBased/endToEnd'] = None
        metric_attributes['granularity'] = None

        metrics_data.append(metric_attributes)

    with open(os.path.join(result_path, 'ChatIMPACT.Metric.json'), 'w', encoding='utf-8') as f:
        json.dump(metrics_data, f, indent=4)

In [17]:
create_metrics_json()

# Train relationship

In [29]:
def create_train_relationship():

    file_path = os.path.join(result_path, 'train_relationship.json')

    count = 0
    start_time = time.time()
    for task in TAG_DOWNSTREAM_TASK:
        print(f'Processing {task} models...')
        models = api.list_models(filter=task, full=True)
        for model in models:
            model_tags = model.tags
            datasets = match_dataset(model_tags)
            if len(datasets) != 0:
                train_relationship = dict()
                train_relationship["Models"] = extract_name(model.id)
                train_relationship["Datasets"] = [extract_name(dataset) for dataset in datasets]
                add_to_json_file(train_relationship, file_path)
            count += 1
            if count % 10000 == 0:
                print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [30]:
create_train_relationship()

Processing sentence-similarity models...
Processing translation models...
10000 models processed, 42.742920875549316 seconds elapsed
Processing question-answering models...
20000 models processed, 106.41267013549805 seconds elapsed
Processing table-question-answering models...
Processing summarization models...
Processing fill-mask models...
30000 models processed, 146.85001277923584 seconds elapsed
Processing text-generation models...
40000 models processed, 171.17433261871338 seconds elapsed
50000 models processed, 195.7509036064148 seconds elapsed
60000 models processed, 241.23480033874512 seconds elapsed
70000 models processed, 274.88327288627625 seconds elapsed
80000 models processed, 310.5311772823334 seconds elapsed
90000 models processed, 351.8119206428528 seconds elapsed
100000 models processed, 408.94479608535767 seconds elapsed
110000 models processed, 434.0016312599182 seconds elapsed
120000 models processed, 459.8774256706238 seconds elapsed
130000 models processed, 491.97

# SuitedFor relationship

In [35]:
def create_suited_for_relationship():

    file_path = os.path.join(result_path, 'suited_for_relationship.json')

    count = 0
    start_time = time.time()
    models = api.list_models(full=True)
    for model in models:
        model_tags = model.tags
        tasks = []
        for t in model_tags:
            if t in TAG_DOWNSTREAM_TASK:
                tasks.append(t)
        if len(tasks) != 0:
            suited_for_relationship = dict()
            suited_for_relationship['LargeLanguageModel'] = extract_name(model.id)
            suited_for_relationship['DownstreamTask'] = tasks
            add_to_json_file(suited_for_relationship, file_path)
        count += 1
        if count % 10000 == 0:
            print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_suited_for_relationship()

# Enable relationship

In [48]:
def create_enable_relationship():

	file_path = os.path.join(result_path, 'enable_relationship.json')

	count = 0
	start_time = time.time()
	datasets = api.list_datasets(full=True)
	for dataset in datasets:
		dataset_tags = dataset.tags
		tasks = match_tasks(dataset_tags)
		if len(tasks) != 0:
			enable_relationship = dict()
			enable_relationship['Dataset'] = extract_name(dataset.id)
			enable_relationship['DownstreamTask'] = tasks
			add_to_json_file(enable_relationship, file_path)
		count += 1
		if count % 10000 == 0:
			print(f'{count} datasets processed, {time.time() - start_time} seconds elapsed')

In [49]:
create_enable_relationship()

10000 datasets processed, 36.90543603897095 seconds elapsed
20000 datasets processed, 61.39656043052673 seconds elapsed
30000 datasets processed, 81.8622419834137 seconds elapsed
40000 datasets processed, 99.32798981666565 seconds elapsed
50000 datasets processed, 128.8480429649353 seconds elapsed
60000 datasets processed, 182.50446486473083 seconds elapsed
70000 datasets processed, 207.4542465209961 seconds elapsed
80000 datasets processed, 229.8976695537567 seconds elapsed
90000 datasets processed, 261.43047404289246 seconds elapsed
100000 datasets processed, 299.85623478889465 seconds elapsed
110000 datasets processed, 335.7912917137146 seconds elapsed
120000 datasets processed, 372.2245075702667 seconds elapsed
130000 datasets processed, 398.1815721988678 seconds elapsed
140000 datasets processed, 418.6915295124054 seconds elapsed
150000 datasets processed, 438.64215755462646 seconds elapsed
160000 datasets processed, 462.48283767700195 seconds elapsed
170000 datasets processed, 49

# Assess relationship

In [None]:
# TODO: here https://huggingface.co/tasks some tasks have associated metrics, we could scrape the tasks one by one

def extract_assess_relationship():

    assess = []
    for task in TAG_DOWNSTREAM_TASK:
        assess_element = {'Metric': [], 'DownstreamTask': task}
        print(f"Processing task: {task}\n")
        url = f"https://huggingface.co/tasks/{task}"
        # Fetch the webpage
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all the <dl> elements
        dl_elements = soup.find_all('dl', class_='flex items-center rounded-lg border border-gray-100')

        # Loop through each <dl> element
        for dl in dl_elements:
            # Extract the metric name from the <dt> tag inside the <summary>
            metric_name = dl.find('dt').get_text(strip=True)

            assess_element['Metric'].append(metric_name)

        assess.append(assess_element)

    return assess

In [None]:
def create_asess_relationship_json():
	
	current_path = os.getcwd()
	parent_path = os.path.dirname(current_path)
	result_path = os.path.join(parent_path, 'database', 'hf extracted json')

	os.makedirs(result_path, exist_ok=True)

	assess_relationship = extract_assess_relationship()

	with open(os.path.join(result_path, 'ChatIMPACT.AssessRelationship.json'), 'w', encoding='utf-8') as f:
		json.dump(assess_relationship, f, indent=4)

In [None]:
create_asess_relationship_json()

# Evaluate relationship

In [None]:
# TODO: check that this is correct (the output and the model cards on HF do not seem to be coherent?)
# Model card template: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1

def create_evaluate_relationship():

	file_path = os.path.join(result_path, 'evaluate_relationship.json')

	count = 0
	start_time = time.time()
	for task in TAG_DOWNSTREAM_TASK:
		print(f'Processing {task} models...')
		models = api.list_models(filter=task, full=True, cardData=True)
		for model in models:
			if model.card_data is not None:
				model_card_data = model.card_data.to_dict()
				if 'metrics' in model_card_data:
					metrics = model_card_data['metrics']
					evaluate_relationship = dict()
					evaluate_relationship['LargeLanguageModel'] = extract_name(model.id)
					evaluate_relationship['Metric'] = metrics
					add_to_json_file(evaluate_relationship, file_path)
		count += 1
		if count % 10000 == 0:
			print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_evaluate_relationship()