Helper notebook to generate annotation examples from the dataset.

In [13]:
from datasets import load_dataset
import yaml
import json
from tqdm.notebook import tqdm

In [2]:
# Location of the YAML configuration file (relative path, resolved against the
# current working directory)
CONFIG_PATH = "../../settings/config.yml"

In [3]:
# Reading config file
# A context manager closes the file handle as soon as parsing is finished
# instead of leaving it open until garbage collection.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)

In [4]:
def select_records(dataset_split):
    """
    Select two example records from a dataset split.

    The first is the earliest record whose ner_tags are all 0; the second is
    the earliest record with at least one non-zero tag. Records without any
    tags are skipped because an empty record is useless as an example.

    Args:
        dataset_split: iterable of records; each record is indexed with
            'ner_tags' to obtain its sequence of integer tags.

    Returns:
        A tuple (record_with_zeros, record_with_non_zeros). An element is
        None when the split contains no matching record.
    """
    record_with_zeros = None
    record_with_non_zeros = None

    for record in dataset_split:
        tags = record['ner_tags']
        if len(tags) == 0:
            # all() is vacuously True for an empty sequence, which would
            # otherwise let an empty record be chosen as the all-zero example.
            continue

        if any(tag != 0 for tag in tags):
            # Keep only the first record that carries at least one entity tag
            if record_with_non_zeros is None:
                record_with_non_zeros = record
        elif record_with_zeros is None:
            # Keep only the first record whose tags are all 0
            record_with_zeros = record

        # Stop scanning once both examples are found
        if record_with_zeros is not None and record_with_non_zeros is not None:
            break

    return record_with_zeros, record_with_non_zeros

In [5]:
# Try a single language first. The language code is used both as the key into
# config['languages_names'] and as the second argument to load_dataset.
language = 'bam'
language_name = config['languages_names'][language]
data = load_dataset(config['dataset'], language)

In [6]:
# Pick one all-zero-tag record and one record with at least one non-zero tag
# from the test split
record_with_zeros, record_with_non_zeros = select_records(data['test'])

In [7]:
# Inspect the example that has at least one non-zero tag
print(record_with_non_zeros)

{'id': '0', 'tokens': ['Ko', 'min', 'tɔ', 'bɛ', 'kɔ', '-', 'Dirisa', 'Togola', '-', 'Minisiriɲɛmɔgɔ', ',', 'Sogɛli', 'Kokala', 'Mayiga', "n'a", 'ka', 'Kunnafonidi', 'minisiri', 'Mɛtiri', 'Haruna', 'Ture', 'dalen', 'a', 'kan', ',', 'taara', 'nin', 'ntɛnɛndon', ',', 'zuwɛnkalo', 'tile', '28', ',', 'Kunnafonidalaw', 'ka', 'Soba', 'la', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0]}


In [8]:
# Inspect the example whose tags are all 0
print(record_with_zeros)

{'id': '1', 'tokens': ['Nin', 'balimayataama', 'ka', 'taa', 'bɔ', 'Kunnafonidalaw', 'ka', 'Soba', 'la', 'laɲini', 'tun', 'ye', 'ka', 'kunnafonidilaw', 'ladɔnniya', 'furancɛlafanga', 'gɔfɛrɛnaman', 'ka', 'baara', 'kɛta', 'fɔlɔfɔlɔw', 'la', ',', 'minnu', 'ye', 'jamana', 'dugukolo', 'lakanani', ',', 'politikikow', 'ni', 'fangabulonkow', 'yɛlɛmaniw', ',', 'ani', 'hadamadenyasiraw', 'basigili', 'ye', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [9]:
def convert_to_output_format(tokens, ner_tags, label_mapping):
    """
    Pair every token with the label of its tag.

    Args:
        tokens: sequence of tokens.
        ner_tags: sequence of tags, aligned with tokens position by position.
        label_mapping: mapping from tag to label; a tag missing from the
            mapping is rendered as 'O'.

    Returns:
        A list of [token, label] lists, one per (token, tag) pair.
    """
    return [
        [word, label_mapping.get(tag_id, 'O')]
        for word, tag_id in zip(tokens, ner_tags)
    ]

In [10]:
# Preview the [token, label] pairs produced for the tagged example
convert_to_output_format(record_with_non_zeros['tokens'], record_with_non_zeros['ner_tags'], config['label_mapping'])

[['Ko', 'O'],
 ['min', 'O'],
 ['tɔ', 'O'],
 ['bɛ', 'O'],
 ['kɔ', 'O'],
 ['-', 'O'],
 ['Dirisa', 'B-PER'],
 ['Togola', 'I-PER'],
 ['-', 'O'],
 ['Minisiriɲɛmɔgɔ', 'O'],
 [',', 'O'],
 ['Sogɛli', 'B-PER'],
 ['Kokala', 'I-PER'],
 ['Mayiga', 'I-PER'],
 ["n'a", 'O'],
 ['ka', 'O'],
 ['Kunnafonidi', 'O'],
 ['minisiri', 'O'],
 ['Mɛtiri', 'O'],
 ['Haruna', 'B-PER'],
 ['Ture', 'I-PER'],
 ['dalen', 'O'],
 ['a', 'O'],
 ['kan', 'O'],
 [',', 'O'],
 ['taara', 'O'],
 ['nin', 'O'],
 ['ntɛnɛndon', 'B-DATE'],
 [',', 'I-DATE'],
 ['zuwɛnkalo', 'I-DATE'],
 ['tile', 'I-DATE'],
 ['28', 'I-DATE'],
 [',', 'O'],
 ['Kunnafonidalaw', 'O'],
 ['ka', 'O'],
 ['Soba', 'O'],
 ['la', 'O'],
 ['.', 'O']]

In [11]:
# Both examples share one layout, so build them from (key, record) pairs:
# 'example1' is the all-zero record, 'example2' the record with entity tags.
annotations = {
    language_name: {
        example_key: {
            'input': example_record['tokens'],
            'output': convert_to_output_format(
                example_record['tokens'],
                example_record['ner_tags'],
                config['label_mapping']
            )
        }
        for example_key, example_record in (
            ('example1', record_with_zeros),
            ('example2', record_with_non_zeros),
        )
    }
}


In [12]:
# Preview the examples serialized as a JSON string
json.dumps(annotations)

'{"Bambara": {"example1": {"input": ["Nin", "balimayataama", "ka", "taa", "b\\u0254", "Kunnafonidalaw", "ka", "Soba", "la", "la\\u0272ini", "tun", "ye", "ka", "kunnafonidilaw", "lad\\u0254nniya", "furanc\\u025blafanga", "g\\u0254f\\u025br\\u025bnaman", "ka", "baara", "k\\u025bta", "f\\u0254l\\u0254f\\u0254l\\u0254w", "la", ",", "minnu", "ye", "jamana", "dugukolo", "lakanani", ",", "politikikow", "ni", "fangabulonkow", "y\\u025bl\\u025bmaniw", ",", "ani", "hadamadenyasiraw", "basigili", "ye", "."], "output": [["Nin", "O"], ["balimayataama", "O"], ["ka", "O"], ["taa", "O"], ["b\\u0254", "O"], ["Kunnafonidalaw", "O"], ["ka", "O"], ["Soba", "O"], ["la", "O"], ["la\\u0272ini", "O"], ["tun", "O"], ["ye", "O"], ["ka", "O"], ["kunnafonidilaw", "O"], ["lad\\u0254nniya", "O"], ["furanc\\u025blafanga", "O"], ["g\\u0254f\\u025br\\u025bnaman", "O"], ["ka", "O"], ["baara", "O"], ["k\\u025bta", "O"], ["f\\u0254l\\u0254f\\u0254l\\u0254w", "O"], ["la", "O"], [",", "O"], ["minnu", "O"], ["ye", "O"], ["j

Creating annotation examples for all languages:

In [14]:
# Collect the two annotation examples for every configured language, keyed by
# the language's full name.
annotations = {}

for language in tqdm(config['languages_list']):
    # Get full language name
    language_name = config['languages_names'][language]
    # Load dataset
    data = load_dataset(config['dataset'], language)
    # Select records as examples
    record_with_zeros, record_with_non_zeros = select_records(data['test'])

    # select_records returns None for a kind of record it could not find.
    # Fail here with the language code rather than with an opaque
    # "'NoneType' object is not subscriptable" further down.
    if record_with_zeros is None or record_with_non_zeros is None:
        raise ValueError(
            f"Could not find both example records in the test split for language '{language}'"
        )

    annotations[language_name] = {
        # Example whose tags are all 0
        'example1': {
            'input': record_with_zeros['tokens'],
            'output': convert_to_output_format(
                record_with_zeros['tokens'],
                record_with_zeros['ner_tags'],
                config['label_mapping']
            )
        },
        # Example with at least one non-zero tag
        'example2': {
            'input': record_with_non_zeros['tokens'],
            'output': convert_to_output_format(
                record_with_non_zeros['tokens'],
                record_with_non_zeros['ner_tags'],
                config['label_mapping']
            )
        }
    }
    

  0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
import json

# Writing annotations to a file
file_path = 'ner_examples_all_languages.json'

# Serialize before opening the file: opening with 'w' truncates it, so a
# serialization error raised while streaming with json.dump would leave a
# partial (or empty) file behind and clobber any previous good copy.
serialized_annotations = json.dumps(annotations, indent=4)

with open(file_path, 'w') as file:
    file.write(serialized_annotations)

Testing:

In [21]:
import os
import sys

# Resolve the query module directory to an absolute path (relative to the
# current working directory) so it can be added to the import search path.
# Presumably this is where query_gpt lives -- verify against the repo layout.
queries_module_path = os.path.abspath('../src/query')
# Avoid appending a duplicate entry when the cell is re-run
if queries_module_path not in sys.path:
    sys.path.append(queries_module_path)

In [22]:
from query_gpt import add_annotation_examples

In [23]:
# Print what add_annotation_examples (imported from query_gpt) produces for
# the 'Bambara' entry of the JSON file at file_path
print(add_annotation_examples(file_path, 'Bambara'))

Example 1:
Input: ['Nin', 'balimayataama', 'ka', 'taa', 'bɔ', 'Kunnafonidalaw', 'ka', 'Soba', 'la', 'laɲini', 'tun', 'ye', 'ka', 'kunnafonidilaw', 'ladɔnniya', 'furancɛlafanga', 'gɔfɛrɛnaman', 'ka', 'baara', 'kɛta', 'fɔlɔfɔlɔw', 'la', ',', 'minnu', 'ye', 'jamana', 'dugukolo', 'lakanani', ',', 'politikikow', 'ni', 'fangabulonkow', 'yɛlɛmaniw', ',', 'ani', 'hadamadenyasiraw', 'basigili', 'ye', '.']
Output: { 'output': [['Nin', 'O'], ['balimayataama', 'O'], ['ka', 'O'], ['taa', 'O'], ['bɔ', 'O'], ['Kunnafonidalaw', 'O'], ['ka', 'O'], ['Soba', 'O'], ['la', 'O'], ['laɲini', 'O'], ['tun', 'O'], ['ye', 'O'], ['ka', 'O'], ['kunnafonidilaw', 'O'], ['ladɔnniya', 'O'], ['furancɛlafanga', 'O'], ['gɔfɛrɛnaman', 'O'], ['ka', 'O'], ['baara', 'O'], ['kɛta', 'O'], ['fɔlɔfɔlɔw', 'O'], ['la', 'O'], [',', 'O'], ['minnu', 'O'], ['ye', 'O'], ['jamana', 'O'], ['dugukolo', 'O'], ['lakanani', 'O'], [',', 'O'], ['politikikow', 'O'], ['ni', 'O'], ['fangabulonkow', 'O'], ['yɛlɛmaniw', 'O'], [',', 'O'], ['ani', 'O