In [None]:
import pandas as pd
import ast

# Read the cleaned Telegram dataset into a DataFrame
file_path = "amharic_telegram_cleaned_data.csv"  # or your full path
df = pd.read_csv(file_path)


def _parse_tokens(value):
    """Evaluate a string cell as a Python literal; return any other value unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


# Work on an independent copy of the first 50 rows, keeping only the three
# columns used for labeling
subset = df[['cleaned_text', 'tokens', 'prices']].head(50).copy()

# Token cells that are strings are parsed back into Python objects
subset['tokens'] = subset['tokens'].apply(_parse_tokens)

# Define heuristic keywords
# NOTE: price_keywords is not consulted by label_tokens below (price spans are
# taken from the `prices` argument); it is kept so any other code that refers
# to it keeps working.
price_keywords = ['ብር', 'ዋጋ', 'በ']
location_keywords = ['አዲስ', 'አበባ', 'ቦሌ', 'አካባቢ', 'ማረፊያ', 'ከተማ', 'ቤት', 'መንገድ']
product_keywords = ['Kettle', 'Stopper', 'Protector', 'Sneaker', 'Bottle', 'Phone', 'Shoes', 
                    'ኬትል', 'ሳቺ', 'ስልክ', 'ጫማ']


# Labeling function
def label_tokens(tokens, text, prices):
    """Assign a heuristic NER tag to every token of one message.

    Tags are applied in priority order -- price, then product, then location --
    and a token that already carries a tag is never re-tagged by a later stage.

    Args:
        tokens: List of string tokens for the message.
        text: Message text. Accepted for call compatibility; not used here.
        prices: Comma-separated price phrases. Each phrase is split on
            whitespace and matched against consecutive tokens. Any non-string
            value (e.g. a missing value) is treated as "no prices".

    Returns:
        A list of ``(token, label)`` tuples, one per token and in token order.
        Labels are 'O', 'B-PRICE', 'I-PRICE', 'B-Product' or 'B-LOC'.
    """
    labels = ['O'] * len(tokens)

    # Label price
    if isinstance(prices, str):
        for price in prices.split(','):
            price_parts = price.split()
            if not price_parts:
                # An empty component (blank string, trailing or doubled comma)
                # would "match" at every position, tag every token as a price
                # and finally index one past the end of `labels`.
                continue
            span = len(price_parts)
            for start in range(len(tokens) - span + 1):
                if tokens[start:start + span] == price_parts:
                    labels[start] = 'B-PRICE'
                    labels[start + 1:start + span] = ['I-PRICE'] * (span - 1)

    # Lower-case the product keywords once per call rather than once per token.
    product_needles = [kw.lower() for kw in product_keywords]

    # Label product (case-insensitive substring match), then location
    # (case-sensitive substring match), only for tokens that are still 'O'.
    for i, token in enumerate(tokens):
        if labels[i] != 'O':
            continue
        lowered = token.lower()
        if any(needle in lowered for needle in product_needles):
            labels[i] = 'B-Product'
        elif any(kw in token for kw in location_keywords):
            labels[i] = 'B-LOC'

    return list(zip(tokens, labels))

# Run the heuristic labeler over every row of the subset
def _label_row(row):
    """Label one DataFrame row using its tokens, cleaned text and prices."""
    return label_tokens(row['tokens'], row['cleaned_text'], row['prices'])


subset['conll'] = subset.apply(_label_row, axis=1)

# Flatten into CoNLL-style lines: one "token label" pair per line,
# with an empty line closing each message
conll_lines = []
for message in subset['conll']:
    conll_lines.extend(f"{token} {label}" for token, label in message)
    conll_lines.append("")

# Write the lines out as UTF-8 text
output_path = "amharic_ner_conll_output.txt"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write("\n".join(conll_lines))

print(f"Labeled data saved to: {output_path}")
