In [1]:
# src/utils.py
import os
from datasets import load_dataset
import configparser
# Move the kernel's working directory to the parent directory.
# Presumably this makes relative paths such as 'config.ini' and 'data/...'
# resolve from the project root -- TODO confirm against the repository layout.
# NOTE(review): the path is relative, so every re-run of this cell climbs one
# more level; restart the kernel before re-running.
os.chdir('..')

def load_saved_dataset(dataset_name, data_dir='data'):
    """
    Loads the saved train/validation/test splits of a dataset from JSON files.

    Each split is read from ``<data_dir>/<dataset_name>_<split>.json``.

    Args:
        dataset_name (str): The name of the dataset, used as the file-name prefix.
        data_dir (str): The directory where the dataset is saved. Defaults to
            ``'data'``; a relative path is resolved against the current
            working directory.

    Returns:
        datasets.DatasetDict: Loaded dataset.
    """
    # One JSON file per split, all sharing the same naming scheme.
    split_names = ('train', 'validation', 'test')
    data_files = {
        split: os.path.join(data_dir, f"{dataset_name}_{split}.json")
        for split in split_names
    }

    return load_dataset('json', data_files=data_files)


def read_config(config_file='config.ini'):
    """
    Parses an INI configuration file into a nested dictionary.

    Args:
        config_file (str): Path to the configuration file.

    Returns:
        dict: Maps each section name to a dict of that section's
            option/value pairs.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
    """
    if os.path.exists(config_file):
        parser = configparser.ConfigParser()
        parser.read(config_file)

        # Flatten every section into a plain dict keyed by section name.
        settings = {}
        for section_name in parser.sections():
            settings[section_name] = dict(parser.items(section_name))
        return settings

    # Fail loudly on a missing file.
    raise FileNotFoundError(f"The configuration file '{config_file}' does not exist.")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pwd

/run/media/meysam/PROGRAM/0.py/token-classification


In [2]:
# Read settings from the default 'config.ini' in the current working directory.
config = read_config()
# Load the dataset named by the `name` option of the [dataset] section.
# NOTE(review): this calls `datasets.load_dataset` directly, not the
# `load_saved_dataset` helper defined above -- confirm that is intended.
data = load_dataset(config['dataset']['name'])

In [3]:
# Word-level tokens of the first training example.
data['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [4]:
# Integer NER tag ids of the first training example (one per token).
data['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [5]:
# Human-readable names for the integer NER tag ids, taken from the dataset's
# feature metadata.
label_names = data["train"].features["ner_tags"].feature.names
print(label_names)
words = data["train"][0]["tokens"]
labels = data["train"][0]["ner_tags"]

# Build two rows -- words on top, tag names underneath -- with each word/tag
# pair centred in a shared column so the rows stay visually aligned.
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    # Use a 10-character column as the minimum, but widen it (plus one
    # separating space) when the word or tag is longer; a fixed width of 10
    # would let long entries run into their neighbours and skew the rows.
    column_width = max(max_length + 1, 10)
    line1 += f"{word:^{column_width}}"
    line2 += f"{full_label:^{column_width}}"

print(line1)
print(line2)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    EU     rejects    German     call       to     boycott   British     lamb       .     
  B-ORG       O       B-MISC      O         O         O       B-MISC      O         O     


## Model and tokenizer

In [6]:
# NOTE(review): this import sits mid-notebook rather than in the top import
# cell; it presumably depends on the earlier os.chdir('..') for `src` to be
# importable -- confirm.
from src.model import modelTokenizer
# Instantiate the project's model/tokenizer wrapper; later cells use
# `mt.tokenizer`.
mt = modelTokenizer()

2025-01-05 03:07:54.190686: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Locally


In [18]:
# Tokenize the first training example; the input is passed as a list of words
# (is_split_into_words=True) rather than as a raw string.
inputs = mt.tokenizer(data["train"][0]["tokens"], is_split_into_words=True)
# Show the resulting token strings, including special tokens and sub-word pieces.
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [35]:
def align_labels_with_tokens(labels, word_ids):
    """
    Expands word-level labels to one label per token.

    Args:
        labels (list[int]): One label id per word.
        word_ids (list[int | None]): For every token, the index of the word
            it belongs to, or None for a special token.

    Returns:
        list[int]: One label per token. Special tokens get -100. The first
            token of a word keeps the word's label. Later tokens of the same
            word keep it too, except that an odd label id is bumped by one
            (B-XXX becomes I-XXX under the odd-B / even-I id layout).
    """
    aligned = []
    previous_word = None
    for position in word_ids:
        if position is None:
            # Special token: emit -100 and forget the previous word.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[position]
        if position == previous_word:
            # Continuation of the same word: turn B-XXX into I-XXX.
            if tag % 2 == 1:
                tag += 1
        else:
            # First token of a new word keeps the word's label unchanged.
            previous_word = position
        aligned.append(tag)

    return aligned
# Try the alignment on the fourth training example (index 3).
inputs = mt.tokenizer(data["train"][3]["tokens"], is_split_into_words=True)
# NOTE(review): this is not the last expression of the cell, so its result is
# discarded and nothing is displayed.
inputs.tokens()
# NOTE: rebinds `labels`, which an earlier cell set to the tags of example 0.
labels = data["train"][3]["ner_tags"]
# Per-token word indices (None for special tokens).
word_ids = inputs.word_ids()
print(word_ids)
print(labels)
# Token-level labels produced by the alignment function.
print(align_labels_with_tokens(labels, word_ids))

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, None]
[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [38]:
# Walk through a single iteration of align_labels_with_tokens by hand to see
# which branch handles a given token.
print(word_ids)
print(labels)

new_labels = []
# Token position under inspection. In the word_ids printed above, positions 15
# and 16 both map to word 14, so position 16 is a continuation sub-token.
token_index = 16
# Inside the function, `current_word` holds the word id of the previous token.
# Define it explicitly here so the cell also runs on a fresh kernel instead of
# relying on a value left over from an earlier execution.
current_word = word_ids[token_index - 1]
word_id = word_ids[token_index]
print(word_id)
if word_id != current_word:
    print(1111111111111)
    # Start of a new word!
    current_word = word_id
    label = -100 if word_id is None else labels[word_id]
    new_labels.append(label)
elif word_id is None:
    print(22222222222)
    # Special token
    new_labels.append(-100)
else:
    print(333333333333)
    # Same word as previous token
    label = labels[word_id]
    # If the label is B-XXX we change it to I-XXX
    if label % 2 == 1:
        label += 1
    new_labels.append(label)

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, None]
[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
14
333333333333


In [33]:
# Display the raw tokenizer output for the most recently tokenized example.
# NOTE(review): execution count 33 is lower than the cells above (35, 38), so
# this output predates them; re-run the notebook top to bottom to refresh it.
inputs

{'input_ids': [101, 1109, 1735, 2827, 1163, 1113, 9170, 1122, 19786, 1114, 1528, 5566, 1106, 11060, 1106, 188, 17315, 1418, 2495, 12913, 1235, 6479, 4959, 2480, 6340, 13991, 3653, 1169, 1129, 12086, 1106, 8892, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}