# Data availability

In [None]:
import pandas as pd
import json
import os

## LLMs

In [None]:
# Read JSON and count available attributes

def models_statistics(file_name):
	"""Report how many extracted LLM records provide each attribute.

	Reads ``<parent of cwd>/database/HF entries/hf extracted json/<file_name>``,
	prints one coverage line per attribute, and returns a long-format table
	with one row per (model, attribute) pair.

	Args:
		file_name: Name of the JSON file holding a list of model records.

	Returns:
		A DataFrame with the columns ``id`` (the record's ``name``),
		``entity name`` (always ``'LLM'``), ``attribute name``,
		``available API`` and ``available scraping``. The attribute list is
		taken from the keys of the first record. An empty input yields an
		empty DataFrame with the same columns.

	Raises:
		FileNotFoundError: If the JSON file does not exist.
		json.JSONDecodeError: If the file is not valid JSON.
	"""
	# (report label, JSON key, stored as a list?) in the order they are printed.
	# TODO: add more attributes (?)
	report_fields = [
		('Name', 'name', False),
		('Version', 'version', False),
		('Number of Parameters', 'numberOfParameters', False),
		('Quantization', 'quantization', False),
		('Architecture', 'architecture', False),
		('Languages', 'languages', True),
		('Model creator', 'modelCreator', False),
		('License to use', 'licenseToUse', False),
		('Library', 'libraryFramework', True),
		('Context Length', 'contextLength', False),
		('Developers', 'developers', True),
		('Open Source', 'openSource', False),
		('URI', 'uri', False),
		('Fine-tuned', 'fineTuned', False),
		('Carbon emission', 'carbonEmission [CO2eq tons]', False),
		('Tokenizer', 'tokenizer', False),
	]

	data_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'HF entries', 'hf extracted json')
	# The context manager closes the file even if parsing fails.
	with open(os.path.join(data_dir, file_name), encoding='utf-8') as models_json:
		models_data_json = json.load(models_json)

	total_models = len(models_data_json)
	print(f'Number of processed models: {total_models}')
	for label, key, is_list in report_fields:
		if is_list:
			# List attributes count only when non-empty; a missing or null value counts as absent.
			count = sum(1 for item in models_data_json if item.get(key))
		else:
			count = sum(1 for item in models_data_json if item.get(key) is not None)
		# Guard the percentage so an empty input file does not divide by zero.
		share = (count / total_models) * 100 if total_models else 0.0
		print(f'    {label}: {count} ({share:.2f}%)')

	columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
	llm_attributes = list(models_data_json[0].keys()) if models_data_json else []

	# Collect plain dicts and build the frame once; concatenating row by row
	# copies the whole table on every iteration.
	rows = []
	for item in models_data_json:
		model_name = item.get('name')
		for attr in llm_attributes:
			value = item.get(attr)
			is_empty_list = isinstance(value, list) and not value
			rows.append({
				'id': model_name,
				'entity name': 'LLM',
				'attribute name': attr,
				# All LLM attributes we are able to extract come from the API,
				# no attribute is obtained by scraping.
				'available API': value is not None and not is_empty_list,
				'available scraping': False,
			})

	return pd.DataFrame(rows, columns=columns)



In [None]:
# Print the LLM coverage report and keep the per-attribute availability table.
# NOTE(review): the later cells rebind `availability`, so this table is lost once they run.
availability = models_statistics('models_data.json')

## Datasets

In [None]:
# Read JSON and count available attributes

def datasets_statistics(file_name):
	"""Report how many extracted dataset records provide each attribute.

	Reads ``<parent of cwd>/database/HF entries/hf extracted json/<file_name>``,
	prints one coverage line per attribute, and returns a long-format table
	with one row per (dataset, attribute) pair.

	Args:
		file_name: Name of the JSON file holding a list of dataset records.

	Returns:
		A DataFrame with the columns ``id`` (the record's ``name``),
		``entity name`` (always ``'Dataset'``), ``attribute name``,
		``available API`` and ``available scraping``. The attribute list is
		taken from the keys of the first record. An empty input yields an
		empty DataFrame with the same columns.

	Raises:
		FileNotFoundError: If the JSON file does not exist.
		json.JSONDecodeError: If the file is not valid JSON.
	"""
	# (report label, JSON key, stored as a list?) in the order they are printed.
	report_fields = [
		('Name', 'name', False),
		('Size', 'size [rows]', False),  # can be size [GB] or size [rows]
		('Languages', 'languages', True),
		('License to use', 'licenseToUse', False),
		('Domain', 'domain', True),
		('URI', 'uri', False),
		('Fine-tuning', 'fineTuning', False),
	]

	data_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'HF entries', 'hf extracted json')
	# Blank line emitted before the report, as in the original output.
	print()
	# The context manager closes the file even if parsing fails.
	with open(os.path.join(data_dir, file_name), encoding='utf-8') as datasets_json:
		datasets_data_json = json.load(datasets_json)

	total_datasets = len(datasets_data_json)
	print(f'Number of processed datasets: {total_datasets}')
	for label, key, is_list in report_fields:
		if is_list:
			# List attributes count only when non-empty; a missing or null value counts as absent.
			count = sum(1 for item in datasets_data_json if item.get(key))
		else:
			count = sum(1 for item in datasets_data_json if item.get(key) is not None)
		# Guard the percentage so an empty input file does not divide by zero.
		share = (count / total_datasets) * 100 if total_datasets else 0.0
		print(f'    {label}: {count} ({share:.2f}%)')

	columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
	datasets_attributes = list(datasets_data_json[0].keys()) if datasets_data_json else []

	# Collect plain dicts and build the frame once; concatenating row by row
	# copies the whole table on every iteration.
	rows = []
	for item in datasets_data_json:
		dataset_name = item.get('name')
		for attr in datasets_attributes:
			value = item.get(attr)
			is_empty_list = isinstance(value, list) and not value
			# 'size [GB]' is the only dataset attribute flagged as scraped;
			# every other available attribute is flagged as coming from the API.
			scraped = attr == 'size [GB]' and value is not None
			from_api = not scraped and value is not None and not is_empty_list
			rows.append({
				'id': dataset_name,
				'entity name': 'Dataset',
				'attribute name': attr,
				'available API': from_api,
				'available scraping': scraped,
			})

	return pd.DataFrame(rows, columns=columns)

In [None]:
# Print the dataset coverage report and keep the per-attribute availability table.
# NOTE(review): this rebinds `availability`, replacing the table returned for the LLMs.
availability = datasets_statistics('datasets_data.json')

## DownstreamTasks

In [None]:
# Read JSON and count available attributes

def tasks_statistics():
	"""Report how many downstream-task records provide each attribute.

	Reads ``<parent of cwd>/database/hf extracted json/ChatIMPACT.DownstreamTask.json``,
	prints one coverage line per attribute, and returns a long-format table
	with one row per (task, attribute) pair.

	Returns:
		A DataFrame with the columns ``id`` (the record's ``name``),
		``entity name`` (always ``'DownstreamTask'``), ``attribute name``,
		``available API`` (always ``False``) and ``available scraping``.
		The attribute list is taken from the keys of the first record. An
		empty input yields an empty DataFrame with the same columns.

	Raises:
		FileNotFoundError: If the JSON file does not exist.
		json.JSONDecodeError: If the file is not valid JSON.
	"""
	# (report label, JSON key, stored as a list?) in the order they are printed.
	report_fields = [
		('Name', 'name', False),
		('Description', 'description', False),
		('Sub-task', 'sub-task', True),
	]

	# NOTE(review): unlike the model/dataset readers this path has no
	# 'HF entries' component - confirm that is the intended location.
	data_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
	# The context manager closes the file even if parsing fails.
	with open(os.path.join(data_dir, 'ChatIMPACT.DownstreamTask.json'), encoding='utf-8') as task_json:
		task_data_json = json.load(task_json)

	task_count = len(task_data_json)
	print(f'Number of processed task: {task_count}')
	for label, key, is_list in report_fields:
		if is_list:
			# List attributes count only when non-empty; a missing or null value counts as absent.
			count = sum(1 for item in task_data_json if item.get(key))
		else:
			count = sum(1 for item in task_data_json if item.get(key) is not None)
		# Guard the percentage so an empty input file does not divide by zero.
		share = (count / task_count) * 100 if task_count else 0.0
		print(f'    {label}: {count} ({share:.2f}%)')

	columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
	task_attributes = list(task_data_json[0].keys()) if task_data_json else []

	# Collect plain dicts and build the frame once; concatenating row by row
	# copies the whole table on every iteration.
	rows = []
	for item in task_data_json:
		task_name = item.get('name')
		for attr in task_attributes:
			value = item.get(attr)
			is_empty_list = isinstance(value, list) and not value
			rows.append({
				'id': task_name,
				'entity name': 'DownstreamTask',
				'attribute name': attr,
				# Every available task attribute is flagged as scraped, never as API-provided.
				'available API': False,
				'available scraping': value is not None and not is_empty_list,
			})

	return pd.DataFrame(rows, columns=columns)


In [None]:
# Print the downstream-task coverage report and keep the per-attribute availability table.
# NOTE(review): this rebinds `availability`, replacing the table returned for the datasets.
availability = tasks_statistics()

## Metrics

In [None]:
# Read JSON and count available attributes

def metric_statistics():
	"""Report how many metric records provide each attribute.

	Reads ``<parent of cwd>/database/hf extracted json/ChatIMPACT.Metric.json``,
	prints one coverage line per attribute, and returns a long-format table
	with one row per (metric, attribute) pair.

	Returns:
		A DataFrame with the columns ``id`` (the record's ``name``),
		``entity name`` (always ``'Metric'``), ``attribute name``,
		``available API`` (always ``False``) and ``available scraping``.
		The attribute list is taken from the keys of the first record. An
		empty input yields an empty DataFrame with the same columns.

	Raises:
		FileNotFoundError: If the JSON file does not exist.
		json.JSONDecodeError: If the file is not valid JSON.
	"""
	# (report label, JSON key) in the order they are printed.
	report_fields = [
		('Name', 'name'),
		('Description', 'description'),
		('Context', 'context'),
		('FeatureBased/endToEnd', 'featureBased/endToEnd'),
		('Granularity', 'granularity'),
	]

	# NOTE(review): unlike the model/dataset readers this path has no
	# 'HF entries' component - confirm that is the intended location.
	data_dir = os.path.join(os.path.dirname(os.getcwd()), 'database', 'hf extracted json')
	# The context manager closes the file even if parsing fails.
	with open(os.path.join(data_dir, 'ChatIMPACT.Metric.json'), encoding='utf-8') as metric_json:
		metric_data_json = json.load(metric_json)

	total_metrics = len(metric_data_json)
	# The header previously said "datasets", a copy-paste slip from the dataset report.
	print(f'Number of processed metrics: {total_metrics}')
	for label, key in report_fields:
		count = sum(1 for item in metric_data_json if item.get(key) is not None)
		# Guard the percentage so an empty input file does not divide by zero.
		share = (count / total_metrics) * 100 if total_metrics else 0.0
		print(f'    {label}: {count} ({share:.2f}%)')

	columns = ['id', 'entity name', 'attribute name', 'available API', 'available scraping']
	metric_attributes = list(metric_data_json[0].keys()) if metric_data_json else []

	# Collect plain dicts and build the frame once; concatenating row by row
	# copies the whole table on every iteration.
	rows = []
	for item in metric_data_json:
		metric_name = item.get('name')
		for attr in metric_attributes:
			value = item.get(attr)
			is_empty_list = isinstance(value, list) and not value
			rows.append({
				'id': metric_name,
				'entity name': 'Metric',
				'attribute name': attr,
				# Every available metric attribute is flagged as scraped, never as API-provided.
				'available API': False,
				'available scraping': value is not None and not is_empty_list,
			})

	return pd.DataFrame(rows, columns=columns)


In [None]:
# Print the metric coverage report and keep the per-attribute availability table.
# NOTE(review): this rebinds `availability`, replacing the table returned for the downstream tasks.
availability = metric_statistics()