# Data extraction

In [1]:
from huggingface_hub import HfApi
import pandas as pd
import itertools
import re
import requests
from bs4 import BeautifulSoup
import time
import json
import os
from huggingface_hub.utils import logging

from tags import * # tags.py
logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
api = HfApi()

## Utils

In [3]:
# Scrape languages from HF

url_languages = 'https://huggingface.co/languages'

default_path = "/home/csavelli/database/HF entries/hf extracted json/"

response = requests.get(url_languages)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

code_tags = soup.find_all('code')
tag_language = [code_tag.get_text() for code_tag in code_tags]

tag_language.remove('jax') # 'jax' is the ISO for Jambi Malay (present in 3 datasets, 36 models), impossible to distinguish from JAX the library... TODO: better solution?

tag_language = set(tag_language)

In [4]:
# Pattern matching functions

def extract_name(full_name):
    pattern = re.compile(r'[^/]+/(.+)')
    match = re.search(pattern, full_name)
    if match:
        return match.group(1) # the part after '/' might also contain version and number of parameters (impossible to extract in a uniform way)
    else:
        return full_name

def match_string(entries, pattern):
    pattern = re.compile(pattern)
    for entry in entries:
        match = pattern.match(entry)
        if match:
            return match.group(1)
    return None

def find_all_matches(entries, pattern):
    pattern = re.compile(pattern)
    matches = []
    for entry in entries:
        match = pattern.match(entry)
        if match:
            matches.append(match.group(1))
    return matches

def match_license(entries):
    return match_string(entries, r'license:(\S+)')

def match_dataset(entries):
    return find_all_matches(entries, r'dataset:(\S+)')

def match_uri(entries):
    uri = match_string(entries, r'arxiv:(\S+)')
    if uri is None:
        uri = match_string(entries, r'doi:(\S+)')
    return uri

def match_language(entries):
    return find_all_matches(entries, r'language:(\S+)')

def match_size(entries):
    return match_string(entries, r'size_categories:(\S+)')

def match_tasks(entries):
    return find_all_matches(entries, r'task_categories:(\S+)')

In [5]:
def add_to_json_file(data, file_path):

    if os.path.exists(file_path):
        with open(file_path, 'r+', encoding='utf-8') as f:

            f.seek(0, os.SEEK_END)
            f.seek(f.tell() - 1, os.SEEK_SET)
            f.truncate()
            f.write(',\n')
            json.dump(data, f, indent=4)
            f.write(']')
    else:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump([data], f, indent=4)

In [6]:
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
result_path = os.path.join(parent_path, 'database', 'HF entries', 'hf extracted json')
os.makedirs(result_path, exist_ok=True)

# JSON analysis

In [41]:
# load llm json 

with open('/home/csavelli/chatIMPACT/database/database/HF entries/hf extracted json/models_new.json', 'r') as f:
    llm = json.load(f)

models_df = pd.DataFrame(llm)

models_df.sort_values(by='age', inplace=True, ascending=False)
models_df

Unnamed: 0,name,id,version,numberOfParameters,quantization,architecture,languages,modelCreator,licenseToUse,libraryFramework,...,uri,fineTuned,carbonEmission [CO2eq tons],tokenizer,likes,downloads_all_time,downloads,creation_date,age,vocab_size
62,all-mpnet-base-v2,sentence-transformers/all-mpnet-base-v2,,,,,[en],,apache-2.0,"[sentence-transformers, pytorch, onnx, safeten...",...,1904.06472,,,,903,771404860,382681384,2022-03-02 23:29:05,2.742466,30527
7,bert-large-uncased-whole-word-masking-finetune...,google-bert/bert-large-uncased-whole-word-mask...,,,,,[en],,apache-2.0,"[transformers, pytorch, tf, jax, safetensors]",...,1810.04805,,,,172,33050939,115466,2022-03-02 23:29:04,2.742466,30522
303304,opus-mt-gl-pt,Helsinki-NLP/opus-mt-gl-pt,,,,,"[gl, pt]",,apache-2.0,"[transformers, pytorch, tf]",...,,,,,0,23829,14942,2022-03-02 23:29:04,2.742466,5835
303305,opus-mt-gmq-en,Helsinki-NLP/opus-mt-gmq-en,,,,,"[da, nb, sv, is, nn, fo, en]",,apache-2.0,"[transformers, pytorch, tf]",...,,,,,1,200329,12278,2022-03-02 23:29:04,2.742466,57388
303306,opus-mt-gmw-gmw,Helsinki-NLP/opus-mt-gmw-gmw,,,,,"[nl, en, lb, af, de, fy, yi]",,apache-2.0,"[transformers, pytorch, tf]",...,,,,,1,225652,11301,2022-03-02 23:29:04,2.742466,35464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303674,gpt2-law,naresh810/gpt2-law,,,,,[],,,"[transformers, safetensors]",...,1910.09700,,,,0,7,7,2024-11-27 00:53:01,0.002740,16000
303675,Wisedom-8B-EmbeddingReordering,wisenut-nlp-team/Wisedom-8B-EmbeddingReordering,,,,,[],,,"[transformers, safetensors]",...,1910.09700,,,,0,2,2,2024-11-27 00:54:16,0.002740,128256
303676,Qwen-2.5-3b-Text_to_SQL-GGUF,Ellbendls/Qwen-2.5-3b-Text_to_SQL-GGUF,,,,[Qwen/Qwen2.5-3B-Instruct],[],,mit,"[transformers, gguf]",...,,True,,,0,18,18,2024-11-27 00:55:35,0.002740,
303677,Qwen-2.5-72B-Instruct-abliterated-v2-Q6_K.gguf,blotfaba/Qwen-2.5-72B-Instruct-abliterated-v2-...,,,,zetasepic/Qwen2.5-72B-Instruct-abliterated-v2,[en],zetasepic,other,"[transformers, gguf]",...,,True,,,0,26,26,2024-11-27 01:10:08,0.002740,


In [42]:
# get the model info given the id from HF 

id = "Helsinki-NLP/opus-mt-gmq-en"

creation_date = api.model_info(id).created_at
creation_date

datetime.datetime(2022, 3, 2, 23, 29, 4, tzinfo=datetime.timezone.utc)

## LLMs

In [None]:
# Get all models

# models = api.list_models(full=True)

In [None]:
# Process the first 1000 models

# model = itertools.islice(models, 0, 1000)
# models_df = pd.DataFrame(model)
# models_df.head(10)

In [None]:
def extract_model_attributes(model):

	model_tags = model.tags
	if model.card_data is not None:
		model_card_data = model.card_data.to_dict()
	else:
		model_card_data = None
	model_attributes = dict()

	model_attributes['name'] = extract_name(model.id)
	model_attributes['id'] = model.id
	model_attributes['version'] = None # sometimes in model['id'] but impossible to extract in a uniform way
	model_attributes['numberOfParameters'] = None # sometimes in model['id'] or model description but impossible to extract in a uniform way

	model_attributes['quantization'] = None
	for t in model_tags:
		if t in tag_quantization:
			model_attributes['quantization'] = t

	model_attributes['architecture'] = None
	try:
		if model_card_data is not None:
			model_attributes['architecture'] = model_card_data['base_model']
	except KeyError:
		pass

	model_attributes['languages'] = []
	for t in model_tags:
		if t in tag_language:
			model_attributes['languages'].append(t)

	model_attributes['modelCreator'] = None # extracted in a postprocessing step

	model_attributes['licenseToUse'] = match_license(model_tags)

	model_attributes['libraryFramework'] = [] 
	for t in model_tags:
		if t in tag_library:
			model_attributes['libraryFramework'].append(t)

	model_attributes['contextLength'] = None
	model_attributes['developers'] = [model.author]
	model_attributes['openSource'] = True

	model_attributes['uri'] = match_uri(model_tags)

	model_attributes['fineTuned'] = None # if there is a 'base_model' in card_data, it is fine-tuned
	try:
		if model_card_data is not None:
			if 'base_model' in model_card_data:
				model_attributes['fineTuned'] = True
	except KeyError:
		pass

	model_attributes['carbonEmission [CO2eq tons]'] = None
	try:
		if model_card_data is not None:
			model_attributes['carbonEmission [CO2eq tons]'] = model_card_data['co2_eq_emissions']
	except KeyError:
		pass

	model_attributes['tokenizer'] = None
	model_attributes['likes'] = model.likes

	info = api.model_info(repo_id=model.id, expand="downloadsAllTime")
	model_attributes['downloads_all_time'] = info.downloads_all_time

	model_attributes['downloads'] = model.downloads

	model_attributes['creation_date'] = model.created_at.strftime('%Y-%m-%d %H:%M:%S')

	# Convert both datetimes to timezone-naive
	starting_datetime = pd.to_datetime(model.created_at).tz_localize(None)
	current_datetime = pd.to_datetime('today').tz_localize(None)

	# evaluate how many years have passed since the creation 
	model_attributes["age"] = (current_datetime - starting_datetime).days / 365


	return model_attributes

In [None]:
file_path = os.path.join(result_path, 'models_duplicates_no_modelCreator.json')

# Total: 697,162 models
count = 0
start_time = time.time()
for task in TAG_DOWNSTREAM_TASK:
    print(f'Processing {task} models...')
    models = api.list_models(filter=task, full=True, cardData=True)
    for model in models:
        model_attributes = extract_model_attributes(model)
        add_to_json_file(model_attributes, file_path)
        count += 1
        if count % 1000 == 0:
            print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
for task in TAG_DOWNSTREAM_TASK:
    print(f'Processing {task} models...')
    models = api.list_models(filter=task, full=True, cardData=True)
    for model in models:
        model_attributes = extract_model_attributes(model)
        add_to_json_file(model_attributes, "text")
        count += 1
        if count % 1000 == 0:
            print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
# Reload the data as a DataFrame
with open(default_path + "models_duplicates_no_modelCreator.json", 'r') as file:
    data = json.load(file)

models_df = pd.DataFrame(data)

In [10]:
len(models_df)

335254

In [11]:
# Remove duplicates
print(f'len before removing duplicates: {  len(models_df) }')
models_df = models_df.loc[models_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {  len(models_df) }')

len before removing duplicates: 335254
len after removing duplicates: 303843


In [12]:
# Postprocessing: find the modelCreator

df_filtered = models_df[models_df['architecture'].notna()]

# Process each row
count = 0
start_time = time.time()
for index, row in df_filtered.iterrows():
    # Find the row where 'id' matches the 'architecture' of the current row
    try:
        matching_row = models_df[models_df['id'].astype(str) == str(row['architecture'])]
    except ValueError:
        break
    
    if not matching_row.empty:
        # Get the first developer from the 'developers' list
        first_developer = matching_row['developers'].iloc[0][0] if matching_row['developers'].iloc[0] else None
        # Set the 'modelCreator' attribute of the original row
        models_df.at[index, 'modelCreator'] = first_developer
    
    count += 1
    if count % 1000 == 0:
        print(f'{count} rows processed ({count/len(df_filtered)*100} %), elapsed time: {time.time() - start_time} seconds, estimated time remaining: {(time.time() - start_time) / count * (len(df_filtered) - count)} seconds')
    
        

1000 rows processed (1.3250649281814808 %), elapsed time: 17.53973937034607 seconds, estimated time remaining: 1306.1494179582596 seconds
2000 rows processed (2.6501298563629616 %), elapsed time: 34.99690842628479 seconds, estimated time remaining: 1285.576477921486 seconds
3000 rows processed (3.9751947845444424 %), elapsed time: 52.463114976882935 seconds, estimated time remaining: 1267.2990341777802 seconds
4000 rows processed (5.300259712725923 %), elapsed time: 69.906982421875 seconds, estimated time remaining: 1249.0280847504139 seconds
5000 rows processed (6.6253246409074045 %), elapsed time: 87.37666869163513 seconds, estimated time remaining: 1231.4518380334855 seconds
6000 rows processed (7.950389569088885 %), elapsed time: 104.87480330467224 seconds, estimated time remaining: 1214.240489223957 seconds
7000 rows processed (9.275454497270367 %), elapsed time: 122.38737726211548 seconds, estimated time remaining: 1197.0884395678386 seconds
8000 rows processed (10.60051942545184

In [13]:
models_list = models_df.to_dict(orient='records')

In [14]:
file_path_postprocessed = os.path.join('/home/csavelli/chatIMPACT/database/database/HF entries/hf extracted json/models.json')

with open(file_path_postprocessed, "w") as json_file:
    json.dump(models_list, json_file, indent=4)

## Dataset

In [None]:
# Get all datasets

# datasets = api.list_datasets(full=True)

In [None]:
# Process the first 1000 models

# datasets = list(itertools.islice(datasets, 0, 1000))
# datasets_df = pd.DataFrame(datasets)
# datasets_df.head(10)

In [None]:
def convert_file_size_to_gb(file_size_str):
    """
    Convert the file size string (e.g., '74.6 kB') to gigabytes (GB).
    """
    file_size_parts = file_size_str.split()
    file_size = float(file_size_parts[0])
    unit = file_size_parts[1]

    conversion_factors = {
        'B': 1 / (1024 ** 3),
        'kB': 1 / (1024 ** 2),
        'MB': 1 / 1024,
        'GB': 1,
        'TB': 1024,
    }

    if unit in conversion_factors:
        return float(file_size * conversion_factors[unit])
    else:
        return None

def extract_file_size(url):
    # Fetch the HTML content from the provided URL
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the div containing the "Size of downloaded dataset files:" text
    size_label_div = soup.find('div', string='Size of downloaded dataset files:')

    if size_label_div:
        # Find the next sibling div containing the file size
        size_div = size_label_div.find_next('div')
        if size_div:
            # Extract the file size text
            file_size = size_div.get_text(strip=True)
            return file_size
    else:
        return None

In [None]:
def extract_datasets_attributes(dataset):

	dataset_tags = dataset.tags
	dataset_attributes = dict()

	dataset_attributes['name'] = extract_name(dataset.id)
	dataset_attributes['size [GB]'] = match_size(dataset_tags)

	# url = "https://huggingface.co/datasets/" + dataset.id
	# file_size_str = extract_file_size(url)
	# if file_size_str:
	# 	file_size_gb = convert_file_size_to_gb(file_size_str)
	# 	if file_size_gb:
	# 		dataset_attributes['size [GB]'] = file_size_gb

	dataset_attributes['languages'] = match_language(dataset_tags)

	# dataset_attributes['dataset creator'] = dataset['author'] # TODO: add attribute in our model?

	dataset_attributes['licenseToUse'] = match_license(dataset_tags)

	dataset_attributes['domain'] = []
	for t in dataset_tags:
		if t in tag_domain:
			dataset_attributes['domain'].append(t)

	dataset_attributes['uri'] = match_uri(dataset_tags)

	dataset_attributes['fineTuning'] = None

	return dataset_attributes

In [None]:
file_path = os.path.join(result_path, 'datasets_duplicates_new.json')

count = 0
start_time = time.time()
for task in TAG_DOWNSTREAM_TASK:
    print(f'Processing {task} datasets...')
    datasets = api.list_datasets(task_categories=task, full=True)
    for dataset in datasets:
        dataset_attributes = extract_datasets_attributes(dataset)
        add_to_json_file(dataset_attributes, file_path)
        count += 1
        if count % 1000 == 0:
            print(f'{count} datasets processed, {time.time() - start_time} seconds elapsed, estimated time remaining: {(time.time() - start_time) / count:.2f} seconds')

In [None]:
count

In [None]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'datasets_duplicates_new.json')

with open(file_path, 'r') as file:
    data = json.load(file)
datasets_df = pd.DataFrame(data)

In [None]:
# Remove duplicates
print(f'len before removing duplicates: {len(datasets_df)}')
datasets_df = datasets_df.loc[datasets_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {len(datasets_df)}')

In [None]:
datasets_list = datasets_df.to_dict(orient='records')

In [None]:
file_path_postprocessed = os.path.join(result_path, 'datasets.json')

with open(file_path_postprocessed, "w") as json_file:
    json.dump(datasets_list, json_file, indent=4)

## Downstream Task

In [None]:
def fetch_and_extract_text(url):
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        target_paragraph = soup.find('p', class_='text-[1.2rem] text-gray-500')
        
        if target_paragraph:
            return target_paragraph.get_text().strip()
        else:
            return "Target paragraph not found."
    else:
        return f"Failed to fetch the webpage. Status code: {response.status_code}"

def create_tasks_json():

    current_path = os.getcwd()
    parent_path = os.path.dirname(current_path)
    result_path = os.path.join(parent_path, 'database', 'hf extracted json')
    os.makedirs(result_path, exist_ok=True)

    tasks_data = []

    for task in TAG_DOWNSTREAM_TASK:
        url = f"https://huggingface.co/tasks/{task}"
        description = fetch_and_extract_text(url)
        
        tasks_data.append({
            "name": task,
            "description": description,
            "sub-task": []
        })
        
        print(f"Processed: {task}")
        # time.sleep(0.5)  # Be polite to the server
    
    file_path = os.path.join(result_path, 'downstreamtasks.json')

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tasks_data, f, ensure_ascii=False, indent=2)

In [None]:
create_tasks_json()

# Metrics

In [None]:
# Scrape metrics and descriptions from HF

def extract_metrics():
	metrics = api.list_metrics()

	metrics_names = [metric.id for metric in metrics]
	metrics_descriptions = [metric.description for metric in metrics]

	# url_metrics = 'https://huggingface.co/metrics'

	# # Remove from the list the metrics withoud description (not useful for our purpose)
	# metrics.remove('AlhitawiMohammed22/CER_Hu-Evaluation-Metrics')
	# metrics.remove('Aye10032/loss_metric')
	# metrics.remove('giulio98/code_eval_outputs')
	# metrics.remove('maysonma/lingo_judge_metric')
	# metrics.remove('lvwerra/test')
	# metrics.remove('sma2023/wil')

	# From the lists, replace the description 'TODO: add a description here' with None

	for i, description in enumerate(metrics_descriptions):
		if type(description) is not str or 'TODO: add a description here' in description:
			metrics_descriptions[i] = None
	
	return metrics_names, metrics_descriptions

In [None]:
def create_metrics_json():

    metrics_data = []

    metrics, descriptions = extract_metrics()
    
    for idx in range(len(metrics)):
        metric_attributes = dict()

        metric_attributes['name'] = metrics[idx]
        metric_attributes['description'] = descriptions[idx]
        metric_attributes['trained'] = None
        metric_attributes['context'] = None
        metric_attributes['featureBased/endToEnd'] = None
        metric_attributes['granularity'] = None

        metrics_data.append(metric_attributes)
    
    file_path = os.path.join(result_path, 'metrics.json')

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(metrics_data, f, indent=4)

In [None]:
create_metrics_json()

# Train relationship

In [None]:
def create_train_relationship():

    file_path = os.path.join(result_path, 'train_duplicates.json')

    count = 0
    start_time = time.time()
    for task in TAG_DOWNSTREAM_TASK:
        print(f'Processing {task} models...')
        models = api.list_models(filter=task, full=True)
        for model in models:
            model_tags = model.tags
            datasets = match_dataset(model_tags)
            if len(datasets) != 0:
                train_relationship = dict()
                train_relationship["Models"] = extract_name(model.id)
                train_relationship["Datasets"] = [extract_name(dataset) for dataset in datasets]
                add_to_json_file(train_relationship, file_path)
            count += 1
            if count % 10000 == 0:
                print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_train_relationship()

In [None]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'train_duplicates.json')

with open(file_path, 'r') as file:
	data = json.load(file)
train_df = pd.DataFrame(data)

In [None]:
# Remove duplicates

print(f'len before removing duplicates: {len(train_df)}')
train_df = train_df.loc[train_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {len(train_df)}')

In [None]:
train_list = train_df.to_dict(orient='records')

In [None]:
file_path_postprocessed = os.path.join(result_path, 'train.json')

with open(file_path_postprocessed, "w") as json_file:
	json.dump(train_list, json_file, indent=4)

# SuitedFor relationship

In [None]:
def create_suited_for_relationship():

    file_path = os.path.join(result_path, 'suited_for_duplicates.json')

    count = 0
    start_time = time.time()
    for task in TAG_DOWNSTREAM_TASK:
        print(f'Processing {task} models...')
        models = api.list_models(filter=task, full=True)
        for model in models:
            suited_for_relationship = dict()
            suited_for_relationship['LargeLanguageModel'] = extract_name(model.id)
            suited_for_relationship['DownstreamTask'] = task
            add_to_json_file(suited_for_relationship, file_path)
            count += 1
            if count % 10000 == 0:
                print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_suited_for_relationship()

In [None]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'suited_for_duplicates.json')
with open(file_path, 'r') as file:
	data = json.load(file)
suited_for_df = pd.DataFrame(data)

In [None]:
# Merge duplicates

print(f'len before removing duplicates: {len(suited_for_df)}')
suited_for_df = suited_for_df.groupby('LargeLanguageModel')['DownstreamTask'].apply(list).reset_index()
print(f'len after removing duplicates: {len(suited_for_df)}')

In [None]:
file_path_postprocessed = os.path.join(result_path, 'suited_for.json')

with open(file_path_postprocessed, "w") as json_file:
	json.dump(suited_for_df.to_dict(orient='records'), json_file, indent=4)

# Enable relationship

In [None]:
def create_enable_relationship():

	file_path = os.path.join(result_path, 'enable_duplicates.json')

	count = 0
	start_time = time.time()
	for task in TAG_DOWNSTREAM_TASK:
		print(f'Processing {task} datasets...')
		datasets = api.list_datasets(filter=task, full=True)
		for dataset in datasets:
			enable_relationship = dict()
			enable_relationship['Dataset'] = extract_name(dataset.id)
			enable_relationship['DownstreamTask'] = task
			add_to_json_file(enable_relationship, file_path)
			count += 1
			if count % 1000 == 0:
				print(f'{count} datasets processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_enable_relationship()

In [None]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'enable_duplicates.json')
with open(file_path, 'r') as file:
	data = json.load(file)
enable_df = pd.DataFrame(data)

In [None]:
# Merge duplicates

print(f'len before removing duplicates: {len(enable_df)}')
enable_df = enable_df.groupby('Dataset')['DownstreamTask'].apply(list).reset_index()
print(f'len after removing duplicates: {len(enable_df)}')

In [None]:
file_path_postprocessed = os.path.join(result_path, 'enable.json')

with open(file_path_postprocessed, "w") as json_file:
	json.dump(enable_df.to_dict(orient='records'), json_file, indent=4)

# Assess relationship

In [None]:
def extract_assess_relationship():

    assess = []
    for task in TAG_DOWNSTREAM_TASK:
        assess_element = {'Metric': [], 'DownstreamTask': task}
        print(f"Processing task: {task}")
        url = f"https://huggingface.co/tasks/{task}"
        # Fetch the webpage
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            return

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract all the <dl> elements
        dl_elements = soup.find_all('dl', class_='flex items-center rounded-lg border border-gray-100')

        # Loop through each <dl> element
        for dl in dl_elements:
            # Extract the metric name from the <dt> tag inside the <summary>
            metric_name = dl.find('dt').get_text(strip=True)

            assess_element['Metric'].append(metric_name)

        assess.append(assess_element)

    return assess

In [None]:
def create_asess_relationship_json():

	assess_relationship = extract_assess_relationship()

	file_path = os.path.join(result_path, 'assess.json')

	with open(file_path, 'w', encoding='utf-8') as f:
		json.dump(assess_relationship, f, indent=4)

In [None]:
create_asess_relationship_json()

# Evaluate relationship

In [None]:
# TODO: check that this is correct (the output and the model cards on HF do not seem to be coherent?)
# Model card template: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1

def create_evaluate_relationship():

	file_path = os.path.join(result_path, 'evaluate_duplicates.json')

	count = 0
	start_time = time.time()
	for task in TAG_DOWNSTREAM_TASK:
		print(f'Processing {task} models...')
		models = api.list_models(filter=task, full=True, cardData=True)
		for model in models:
			if model.card_data is not None:
				model_card_data = model.card_data.to_dict()
				if 'metrics' in model_card_data:
					metrics = model_card_data['metrics']
					evaluate_relationship = dict()
					evaluate_relationship['LargeLanguageModel'] = extract_name(model.id)
					evaluate_relationship['Metric'] = metrics
					add_to_json_file(evaluate_relationship, file_path)
			count += 1
			if count % 10000 == 0:
				print(f'{count} models processed, {time.time() - start_time} seconds elapsed')

In [None]:
create_evaluate_relationship()

In [None]:
# Reload the data as a DataFrame

file_path = os.path.join(result_path, 'evaluate_duplicates.json')
with open(file_path, 'r') as file:
	data = json.load(file)
evaluate_df = pd.DataFrame(data)

In [None]:
# Remove duplicates

print(f'len before removing duplicates: {len(evaluate_df)}')
evaluate_df = evaluate_df.loc[evaluate_df.astype(str).drop_duplicates().index]
print(f'len after removing duplicates: {len(evaluate_df)}')

In [None]:
evaluate_list = evaluate_df.to_dict(orient='records')

In [None]:
file_path_postprocessed = os.path.join(result_path, 'evaluate.json')

with open(file_path_postprocessed, "w") as json_file:
	json.dump(evaluate_list, json_file, indent=4)