In [1]:
import pandas as pd
from datasets import Dataset

# Load your CoNLL formatted dataset
def load_conll_dataset(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence, labels = [], []
        for line in f:
            if line.strip():
                token, label = line.split()
                sentence.append(token)
                labels.append(label)
            else:
                data.append((sentence, labels))
                sentence, labels = [], []
    return data

data = load_conll_dataset(r'C:\Users\Admin.DESKTOP-M4R2VLU\ourweek5\labeled_telegram_product_price_contact.conll')

# Convert to DataFrame
df = pd.DataFrame(data, columns=['tokens', 'labels'])

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

ModuleNotFoundError: No module named 'datasets'

In [12]:
# Define a mapping of labels to IDs
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE']  # Add all your labels here
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for idx, label in enumerate(label_list)}

# Update the tokenize_and_align_labels function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Will return a list of word ids
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Default label is -100
        for j, word_id in enumerate(word_ids):
            if word_id is not None:  # Only label the first token of each word
                label_ids[j] = label_to_id[label[word_id]]  # Map to integer ID
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Note: Make sure to convert your labels in the dataset to the appropriate format

In [13]:
import numpy as np

# Convert labels to a list of lists
df['labels'] = df['labels'].apply(lambda x: [label_to_id[label] for label in x])  # Map labels to IDs

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [14]:
print(df.dtypes)  # Ensure labels are of type list
print(df.head())  # Check a sample of your DataFrame

tokens    object
labels    object
dtype: object
                                              tokens  \
0  [የ, 1, አመት, ሳይዝ, 1500, ብር, 0913870090, ለማዘዝ, @...   
1  [ዘናጭ, 2, ፒስ, ከ3-6, እና, ከ, 6-9, ወር, 1500, ብር, 0...   
2  [ዘናጭ, 2, ፒስ, ከ9, ወር-, 3, አመት, ሳይዝ, አለዉ, ለልደትም,...   
3  [3, ፒስ, 100%, ኮተን, የአራስ, ልብስ, ኮፍያ, የዳይፐር, ቲሸርት...   
4  [3, ፒስ, 100%, ኮተን, የአራስ, ልብስ, ኮፍያ, የዳይፐር, ቲሸርት...   

                                              labels  
0                        [1, 2, 2, 2, 4, 4, 0, 0, 0]  
1            [1, 2, 2, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0]  
2  [1, 2, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 4, 0, ...  
3  [1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 4, 4, 0, ...  
4  [1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 4, ...  


In [15]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the data and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Will return a list of word ids
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Default label is -100
        for j, word_id in enumerate(word_ids):
            if word_id is not None:  # Only label the first token of each word
                label_ids[j] = label[word_id]
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 1519/1519 [00:00<00:00, 2438.56 examples/s]


In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [17]:
from datasets import DatasetDict

# Assuming tokenized_dataset is your tokenized data
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)  # 80% train, 20% validation

# Create a DatasetDict to hold both training and validation sets
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
})

# Check the dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1215
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 304
    })
})


In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        truncation=True, 
        padding='max_length',  # Pad to max length
        max_length=512,  # Adjust as needed
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Initialize with -100

        for j, word_id in enumerate(word_ids):
            if word_id is not None:  # Only label the first token of each word
                if word_id < len(label):  # Ensure we don't go out of bounds
                    label_ids[j] = label_to_id[label[word_id]]  # Map to integer ID
                else:
                    print(f"Warning: word_id {word_id} is out of bounds for label length {len(label)}")

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [19]:
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-CONTACT', 'I-CONTACT']  # Add your labels
label_to_id = {label: idx for idx, label in enumerate(label_list)}

In [20]:
print(dataset)  # Check the structure of your dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1215
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 304
    })
})


In [21]:
# Print the first few examples to diagnose the structure
print(dataset['train'][0])  # Adjust this index based on your dataset

{'tokens': ['Mother’s', 'choice', '2', 'ፒስ', 'ለስጦታም', 'ሆነ', 'ለራስዎ', 'ከ', 'አራስ-6', 'ወር', '1450', 'ብር', '0913870090', '@KiDu_W'], 'labels': [-100, 1, 1, 1, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [22]:
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE']  # Your actual labels
label_to_id = {label: idx for idx, label in enumerate(label_list)}

In [23]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',  # Pad to max length
        max_length=512,  # Adjust max length as needed
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Initialize with -100

        print(f"Processing example {i}:")
        print(f"Tokens: {examples['tokens']}")
        print(f"Tokenized Input IDs: {tokenized_inputs['input_ids']}")
        print(f"Original Labels: {label}")

        for j, word_id in enumerate(word_ids):
            if word_id is not None:  # Only label the first token of each word
                if j < len(label):  # Check if j is within bounds
                    label_ids[j] = label[j]  # Assign the label directly
                else:
                    print(f"Index out of range for label with j={j} and label length={len(label)}")

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [24]:
# Tokenize the dataset again
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

Processing example 0:
Tokens: [['Mother’s', 'choice', '2', 'ፒስ', 'ለስጦታም', 'ሆነ', 'ለራስዎ', 'ከ', 'አራስ-6', 'ወር', '1450', 'ብር', '0913870090', '@KiDu_W'], ['5', 'ፍሬ', 'እጅጌ', 'ጉርድ', 'የዳይፐር', 'ቲሸርት', 'ከ', '0-3', 'ወር', '2000', 'ብር', '0913870090', 'ለማዘዝ', '@KiDu_W'], ['5', 'ፍሬ', 'እጅጌ', 'ሙሉ', 'የዳይፐር', 'ቲሸርት', 'ከ0-3', 'ወር', '2000', 'ብር', '0913870090', '@KiDu_W'], ['የአልጋ', 'ላይ', 'ማናፈሻ', 'እና', 'ዳይፐር', 'መቀየሪያ', 'ምቹ', 'እና', 'ለስላሳ', 'ፈሳሽ', 'የማያስተላልፍ', '800', 'ብር', '0913870090', 'ለማዘዝ', '@KiDu_W'], ['ዘመናዊ', 'የልጆች', 'ሶፋ', 'ልጆች', 'መቀመጥ', 'ከሚጀመሩበት', 'ጊዜ', 'ጀምሮ', 'ወገባቸውን', 'የሚደግፍ', 'እና', 'እንዲጠነክሩ', 'የሚያግዝ', 'ለልጅዎም', 'ሆነ', 'ለአራስ', 'ወዳጅዎ', 'የሚጠቅም', 'እቃ', '3200', 'ብር', '0913870090', '@KiDu_W'], ['3', 'ፍሬ', 'ጀምሱት', 'ከ0-3', 'ወር', '2000', 'ብር', '0913870090', '@KiDu_W'], ['3', 'ፍሬ', 'ሄድ', 'ባንድ', '500', 'ብር', '0913870090', '@KiDu_W'], ['ኳሊቲ', '3', 'ፒስ', 'የልጆች', 'ቦርሳ', '2000', 'ብር', '0913870090', 'ለማዘዝ', '@KiDu_W'], ['5', 'ፍሬ', '100%', 'ኮተን', 'ኮፍያ', 'ከ', '6', 'ወር', 'በላይ', '1000', 'ብር', '0913870090', '@KiDu_W'], ['3',