In [6]:
# src/utils.py
import os
from datasets import load_dataset
import configparser
# Step up one directory so the relative paths used below ('data', 'config.ini')
# resolve against the parent of the starting directory.
# NOTE(review): the move is relative, so re-running this cell climbs one more
# level each time -- check the working directory (e.g. with `!pwd`) afterwards.
os.chdir('..')

def load_saved_dataset(dataset_name, data_dir='data'):
    """
    Loads the saved train/validation/test JSON splits of a dataset.

    The three files are expected to be named
    ``<dataset_name>_train.json``, ``<dataset_name>_validation.json`` and
    ``<dataset_name>_test.json`` inside ``data_dir``.

    Args:
        dataset_name (str): The name of the dataset (file-name prefix).
        data_dir (str): The directory where the dataset is saved.
            Defaults to ``'data'``, resolved against the current working
            directory.

    Returns:
        datasets.DatasetDict: Loaded dataset with the splits ``train``,
        ``validation`` and ``test``.
    """
    # One JSON file per split, all sharing the same naming scheme.
    data_files = {
        split: os.path.join(data_dir, f"{dataset_name}_{split}.json")
        for split in ('train', 'validation', 'test')
    }

    # The 'json' builder reads the files and returns one split per key.
    return load_dataset('json', data_files=data_files)


def read_config(config_file='config.ini'):
    """
    Parses an INI-style configuration file into nested dictionaries.

    Args:
        config_file (str): Path to the configuration file.

    Returns:
        dict: Maps each section name to a dict of that section's
        option names and their (string) values.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
    """
    # Fail fast: ConfigParser.read() silently skips files it cannot open.
    if not os.path.exists(config_file):
        raise FileNotFoundError(f"The configuration file '{config_file}' does not exist.")

    parser = configparser.ConfigParser()
    parser.read(config_file)

    # Flatten every section into a plain dict keyed by section name.
    settings = {}
    for section_name in parser.sections():
        settings[section_name] = dict(parser.items(section_name))

    return settings



In [7]:
!pwd

/run/media/meysam/PROGRAM/0.py/token-classification


In [8]:
# Read the settings from the default 'config.ini' (relative to the current
# working directory).
config = read_config()
# Load the dataset named by the `name` option of the [dataset] section.
# NOTE(review): this calls `datasets.load_dataset` directly rather than
# `load_saved_dataset` -- confirm that is intended.
data = load_dataset(config['dataset']['name'])

In [22]:
# Show the token list of the first training example.
data['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [23]:
# Show the integer NER tag ids of the first training example.
data['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [41]:
# Print the first training example with each token centred above its
# human-readable NER label.
label_names = data["train"].features["ner_tags"].feature.names
print(label_names)
words = data["train"][0]["tokens"]
labels = data["train"][0]["ner_tags"]

word_cells = []
label_cells = []
for word, label in zip(words, labels):
    # Tag ids index into the list of label names.
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    # Both rows share one width per column: at least 10 characters, and
    # always one more than the longer entry, so long tokens or labels
    # cannot push the two rows out of alignment or touch their neighbours.
    width = max(10, max_length + 1)
    word_cells.append(f"{word:^{width}}")
    label_cells.append(f"{full_label:^{width}}")

# Join once instead of growing the strings inside the loop.
line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    EU     rejects    German     call       to     boycott   British     lamb       .     
  B-ORG       O       B-MISC      O         O         O       B-MISC      O         O     
