In [None]:
import csv
import re
import string
from datetime import datetime
import pandas as pd

Preprocessing 

In [None]:
def normalize_amharic(text):
    """Map Ethiopic punctuation in ``text`` to ASCII equivalents or spaces.

    - ``፡`` (Ethiopic wordspace, U+1361) becomes a space.
    - ``።`` becomes ``.`` and ``፣`` becomes ``,``.
    - ``፤ ፥ ፦ ፧`` become a space instead of being deleted, so the words on
      either side stay separate even when no space was typed around the mark
      (e.g. ``ዋጋ፦1500``).

    Args:
        text: The message string to normalize.

    Returns:
        The normalized string with leading and trailing whitespace stripped.
    """
    # '፡' is U+1361, so no separate pass over '\u1361' is needed.
    text = text.replace('፡', ' ')
    text = text.replace('።', '.')
    text = text.replace('፣', ',')
    # Substitute a space (not the empty string) to avoid gluing words together.
    text = re.sub(r'[፤፥፦፧]', ' ', text)
    return text.strip()

def clean_text(text):
    """Remove URLs, @mentions, #hashtags and stray symbols from ``text``.

    Kept characters: word characters, whitespace, the Ethiopic block
    (U+1200-U+137F) and the ASCII ``.`` and ``,``. Everything else (emoji,
    brackets, slashes, ...) is dropped.

    Args:
        text: The message string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    # '.' and ',' must survive this filter: normalize_amharic has already
    # rewritten '።' / '፣' into them, tokenize splits on them, and dropping
    # them would also corrupt numbers ("1.5" would become "15").
    text = re.sub(r'[^\w\s፡።፣.,\u1200-\u137F]', '', text)  # Keep Amharic and basic punctuation
    return text.strip()

def tokenize(text):
    """Split ``text`` into tokens, treating ``።``, ``፣``, ``.`` and ``,`` as
    standalone tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when ``text`` has no non-whitespace content.
    """
    punctuation = re.compile(r'([።፣.,])')
    # Pad every punctuation mark with spaces so whitespace splitting isolates it.
    padded = punctuation.sub(r' \1 ', text)
    tokens = padded.split()
    return tokens

# Verbose preprocessing function (for inspection)

def preprocess_messages(messages):
    """Run normalize -> clean -> tokenize on each message, printing every stage.

    Args:
        messages: Iterable of message strings.

    Returns:
        A list with one token list per input message, in input order.
    """
    token_lists = []
    for message in messages:
        normalized = normalize_amharic(message)
        scrubbed = clean_text(normalized)
        message_tokens = tokenize(scrubbed)

        # Echo each intermediate result so the pipeline can be checked by eye.
        print(f"\n📨 Original: {message}")
        print(f"🧹 Normalized: {normalized}")
        print(f"🧼 Cleaned: {scrubbed}")
        print(f"🔠 Tokens: {message_tokens}")

        token_lists.append(message_tokens)
    return token_lists

Load and preprocess structured data

In [None]:
def load_and_preprocess_structured(csv_path):
    """Read a message CSV and return one preprocessed record per message.

    The CSV header must contain the columns ``Message``, ``Channel Title``,
    ``Channel Username``, ``ID`` and ``Date``. Rows whose ``Message`` is empty
    or missing are skipped.

    Args:
        csv_path: Path to the UTF-8 encoded CSV file.

    Returns:
        A list of dicts with the keys ``channel_title``, ``channel_username``,
        ``message_id`` (int), ``date`` (string exactly as read),
        ``cleaned_text`` and ``tokens``.

    Raises:
        KeyError: If an expected column is absent from the header.
        ValueError: If an ``ID`` value is not an integer string.
    """
    structured_data = []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # (multi-line) message fields are not read back faithfully.
    with open(csv_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            raw_msg = row['Message']
            if not raw_msg:
                continue

            norm_msg = normalize_amharic(raw_msg)
            cleaned_msg = clean_text(norm_msg)
            tokens = tokenize(cleaned_msg)

            structured_data.append({
                'channel_title': row['Channel Title'],
                'channel_username': row['Channel Username'],
                'message_id': int(row['ID']),
                'date': row['Date'],
                'cleaned_text': cleaned_msg,
                'tokens': tokens
            })

    return structured_data

In [None]:
# Location of the message CSV; change it to match your setup.
csv_path = '../scripts/telegram_data.csv'
structured_records = load_and_preprocess_structured(csv_path)

# Print the first five records as a quick sanity check of the structure.
preview_records = structured_records[:5]
for record in preview_records:
    print(record)

In [None]:
# Walk the first three non-empty RAW messages through the verbose pipeline.
# The raw text is re-read from the CSV because structured_records only keeps
# the cleaned text; passing cleaned text here would make the "Original" line
# show already-cleaned strings and hide what each stage actually changes.
with open(csv_path, 'r', encoding='utf-8', newline='') as raw_file:
    sample_texts = [row['Message'] for row in csv.DictReader(raw_file) if row['Message']][:3]
preprocess_messages(sample_texts)


CoNLL format labeling 

In [None]:
# Build a table with one row per structured record.
df = pd.DataFrame(structured_records)

# Write the table to CSV without the index column.
structured_csv_path = '../data/structured_telegram_data.csv'
df.to_csv(structured_csv_path, index=False, encoding='utf-8')

print("✅ Saved to structured_telegram_data.csv")

In [None]:
# Token lists of the first 40 structured records (fewer if fewer exist).
sampled_token_lists = [record['tokens'] for record in structured_records[:40]]

# Show the first message one token per line with a placeholder 'O' label,
# i.e. a CoNLL-like layout with every label defaulted to 'O'.
first_message_tokens = sampled_token_lists[0]
for token in first_message_tokens:
    print(f"{token}\tO")

In [None]:

# Manually labeled (token, label) pairs for one message, using BIO tags.
# NOTE(review): 'BARDEFU' and 'juicer' are kept as two separate one-token
# Product entities; if the whole phrase is the product name, the tokens in
# between should become I-Product — confirm with the annotator.
labeled_data = [
    ('BARDEFU', 'B-Product'),
    ('2', 'O'),
    ('IN', 'O'),
    ('1', 'O'),
    ('Multi', 'O'),
    ('purpose', 'O'),
    ('juicer', 'B-Product'),
    ('ኳሊቲ', 'O'),  # transliteration of "quality": a descriptor, not a location
    ('የሆነ', 'O'),
    ('የጁስ', 'O'),
    ('መፍጫ', 'O'),
    ('የጀርመን', 'O'),  # an I- tag may not follow 'O' in BIO; "German" is a descriptor here
    ('ቴክኖሎጂ', 'O'),
    ('የሆነ', 'O'),
    ('3', 'O'),
    ('ሌትር', 'O'),
    ('ጁስ', 'O'),
    ('የሚፈጭ', 'O'),
    ('ጆግ', 'O'),
    ('ያለው', 'O'),
    ('የብና', 'O'),
    ('እና', 'O'),
    ('የቅመማ', 'O'),
    ('ቅመም', 'O'),
    ('መፍጫ', 'O'),
    ('ያለው', 'O'),
    ('8000Watt', 'O'),
    ('የሆነ', 'O'),
    ('ምላጮቹ', 'O'),
    ('ጠንካራ', 'O')
]

Save labeled data

In [None]:
def save_to_conll_format(labeled_data, filepath):
    """Write labeled sentences to ``filepath`` in a CoNLL-style layout.

    Each token is written as ``token<TAB>label`` on its own line, and every
    sentence is followed by one blank line.

    Args:
        labeled_data: Iterable of sentences, each an iterable of
            ``(token, label)`` pairs.
        filepath: Destination path; the file is overwritten, UTF-8 encoded.
    """
    with open(filepath, 'w', encoding='utf-8') as conll_file:
        for sentence in labeled_data:
            conll_file.writelines(f"{token}\t{label}\n" for token, label in sentence)
            # Blank line marks the end of one message.
            conll_file.write("\n")

# Wrap the single labeled sentence in a list, because save_to_conll_format
# expects a list of sentences. The guard keeps this cell safe to re-run:
# wrapping unconditionally would nest the list one level deeper on every run,
# and the save would then fail while unpacking (token, label) pairs.
if not labeled_data or isinstance(labeled_data[0], tuple):
    labeled_data = [labeled_data]

# Save it
save_to_conll_format(labeled_data, '../amharic_ner_dataset.conll')

print("✅ Saved as amharic_ner_dataset.conll")