### Loaded raw Telegram data (CSV) from e-commerce channels

In [1]:
import pandas as pd
import re

csv_file_path = '../data/telegram_data.csv'
df = pd.read_csv(csv_file_path)
df['text'] = df['text'].fillna('')
df = df[df['text'].str.strip().astype(bool)]
df_backup = df.copy()

### Cleaned & normalized Amharic text

In [2]:
# Data Cleaning 

def clean_text(text):
    """
    Performs basic text cleaning:
    - Converts text to string (handles non-string types)
    - Removes URLs, emojis 
    - Removes special characters and numbers (keeping only letters and basic punctuation)
    - Removes extra whitespace
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+', '', text)
    text = re.sub(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+', '', text)
    text = re.sub(r'[^\u1200-\u137F\s\w.,!?-]+', '', text) 
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

In [3]:
# Normalize
def normalize_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip().casefold()
    text = text.replace('\u1369', '').replace('\u136A', '').replace('\u136B', '').replace('\u136C', '')
    text = text.replace('\u136D', '').replace('\u136E', '').replace('\u136F', '').replace('\u1370', '')
    text = text.replace('\u1371', '').replace('\u1372', '').replace('\u1373', '').replace('\u1374', '')
    text = text.replace('\u1375', '').replace('\u1376', '').replace('\u1377', '').replace('\u1378', '')
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df['normalized_text'] = df['cleaned_text'].apply(normalize_text)

### Tokenized text using regex

In [4]:
# Tokenization
def tokenize(text):
    if not isinstance(text, str):
        return []
    tokens = re.findall(r'[\u1200-\u137F\w]+|[.,!?-]', text)
    return [token for token in tokens if token]

df['tokens'] = df['normalized_text'].apply(tokenize)

### Sampled 50 messages

In [5]:
sample_df = df[['tokens']].dropna().sample(50, random_state=42).reset_index(drop=True)

# Remove dot and punctuation tokens
sample_df['tokens'] = sample_df['tokens'].apply(lambda toks: [t for t in toks if t not in ['.', '..', '...', ',', '!', '?']])

In [66]:
labeled_data = []
try:
    with open("../data/labeled_data.conll", "r", encoding="utf-8") as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line == "":
                if sentence:
                    labeled_data.append(sentence)
                    sentence = []
            else:
                parts = line.rsplit(" ", 1)
                if len(parts) == 2:
                    sentence.append((parts[0], parts[1]))
    print(f"Loaded {len(labeled_data)} labeled samples from file.")
except FileNotFoundError:
    print("No saved labels found. Starting from scratch.")
    labeled_data = []

Loaded 50 labeled samples from file.


In [67]:
def label_sentence(index):
    tokens = sample_df.loc[index, 'tokens']
    labeled_sentence = []

    print(f"\nSample {index + 1}/{len(sample_df)}:")

    for i, token in enumerate(tokens):
        print(f"{i}: {token}", end=" -> ")
        label = input("Label (O, B-Product, I-PRICE, etc.): ").strip()

        if " " in label:
            print(" Labels should not contain spaces. Try again.")
            label = input("Label (no space!): ").strip()

        if label == "":
            label = "O"

        labeled_sentence.append((token, label))

    labeled_data.append(labeled_sentence)
    print(f" Labeled {len(tokens)} tokens in Sample {index + 1}")

In [63]:
# sample_index = len(labeled_data)
label_sentence(49)


Sample 50/50:
0: ቴሌግራም -> 

Label (O, B-Product, I-PRICE, etc.):  O


1: t -> 

Label (O, B-Product, I-PRICE, etc.):  O


2: memodernshoppingcenter -> 

Label (O, B-Product, I-PRICE, etc.):  B-LOC


3: በአዲስ -> 

Label (O, B-Product, I-PRICE, etc.):  O


4: ነገረ -> 

Label (O, B-Product, I-PRICE, etc.):  O


5: ሁሌም -> 

Label (O, B-Product, I-PRICE, etc.):  O


6: ቀዳሚዏች -> 

Label (O, B-Product, I-PRICE, etc.):  O


7: ነን -> 

Label (O, B-Product, I-PRICE, etc.):  O


8: የጫማ -> 

Label (O, B-Product, I-PRICE, etc.):  B-Product


9: ማስቀመጫ -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


10: ፍሬሞቹ -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


11: የብረት -> 

Label (O, B-Product, I-PRICE, etc.):  O


12: ቱቦዎች -> 

Label (O, B-Product, I-PRICE, etc.):  O


13: የሆኑ -> 

Label (O, B-Product, I-PRICE, etc.):  O


14: መደርደሪያው -> 

Label (O, B-Product, I-PRICE, etc.):  B-Product


15: ብረት -> 

Label (O, B-Product, I-PRICE, etc.):  O


16: የሆነ -> 

Label (O, B-Product, I-PRICE, etc.):  O


17: መገጣጠሚያው -> 

Label (O, B-Product, I-PRICE, etc.):  O


18: ካርቦኔትድ -> 

Label (O, B-Product, I-PRICE, etc.):  O


19: የሆነ -> 

Label (O, B-Product, I-PRICE, etc.):  O


20: ጠንካራ -> 

Label (O, B-Product, I-PRICE, etc.):  O


21: ፕላሰሰቲክ -> 

Label (O, B-Product, I-PRICE, etc.):  O


22: ባለ -> 

Label (O, B-Product, I-PRICE, etc.):  O


23: 9 -> 

Label (O, B-Product, I-PRICE, etc.):  O


24: ደረጃ -> 

Label (O, B-Product, I-PRICE, etc.):  O


25: በሩ -> 

Label (O, B-Product, I-PRICE, etc.):  O


26: በዚፕ -> 

Label (O, B-Product, I-PRICE, etc.):  O


27: የሚዘጋ -> 

Label (O, B-Product, I-PRICE, etc.):  O


28: የካልስ -> 

Label (O, B-Product, I-PRICE, etc.):  O


29: ማስቀመጫ -> 

Label (O, B-Product, I-PRICE, etc.):  O


30: ኪስ -> 

Label (O, B-Product, I-PRICE, etc.):  O


31: ያለው -> 

Label (O, B-Product, I-PRICE, etc.):  O


32: 60cm30cm160m -> 

Label (O, B-Product, I-PRICE, etc.):  O


33: ልብሱ -> 

Label (O, B-Product, I-PRICE, etc.):  O


34: ሸራ -> 

Label (O, B-Product, I-PRICE, etc.):  O


35: የሆነ -> 

Label (O, B-Product, I-PRICE, etc.):  O


36: በተለያየ -> 

Label (O, B-Product, I-PRICE, etc.):  O


37: የከለር -> 

Label (O, B-Product, I-PRICE, etc.):  O


38: አማራጭ -> 

Label (O, B-Product, I-PRICE, etc.):  O


39: የቀረበ -> 

Label (O, B-Product, I-PRICE, etc.):  O


40: ለአጠቃቀም -> 

Label (O, B-Product, I-PRICE, etc.):  O


41: ቀላል -> 

Label (O, B-Product, I-PRICE, etc.):  O


42: በ -> 

Label (O, B-Product, I-PRICE, etc.):  O


43: 4200 -> 

Label (O, B-Product, I-PRICE, etc.):  O


44: 0924743736 -> 

Label (O, B-Product, I-PRICE, etc.):  O


45: 0974978584 -> 

Label (O, B-Product, I-PRICE, etc.):  O


46: በስራችን -> 

Label (O, B-Product, I-PRICE, etc.):  O


47: ላይ -> 

Label (O, B-Product, I-PRICE, etc.):  O


48: ቅሬታ -> 

Label (O, B-Product, I-PRICE, etc.):  O


49: ካለዎት -> 

Label (O, B-Product, I-PRICE, etc.):  O


50: ብቻ -> 

Label (O, B-Product, I-PRICE, etc.):  O


51: በዚህ -> 

Label (O, B-Product, I-PRICE, etc.):  O


52: ስልክ -> 

Label (O, B-Product, I-PRICE, etc.):  O


53: ደዉለዉ -> 

Label (O, B-Product, I-PRICE, etc.):  O


54: ያሳዉቁን። -> 

Label (O, B-Product, I-PRICE, etc.):  O


55: 0961373839 -> 

Label (O, B-Product, I-PRICE, etc.):  O


56: የመረጡትን -> 

Label (O, B-Product, I-PRICE, etc.):  O


57: እቃ -> 

Label (O, B-Product, I-PRICE, etc.):  O


58: ለማዘዝ -> 

Label (O, B-Product, I-PRICE, etc.):  O


59: ከታች -> 

Label (O, B-Product, I-PRICE, etc.):  O


60: ባለዉ -> 

Label (O, B-Product, I-PRICE, etc.):  O


61: የቴሌግራም -> 

Label (O, B-Product, I-PRICE, etc.):  O


62: አድራሻ -> 

Label (O, B-Product, I-PRICE, etc.):  O


63: ይላኩልን -> 

Label (O, B-Product, I-PRICE, etc.):  O


64: t -> 

Label (O, B-Product, I-PRICE, etc.):  O


65: memodernshopping1 -> 

Label (O, B-Product, I-PRICE, etc.):  O


66: t -> 

Label (O, B-Product, I-PRICE, etc.):  O


67: memodernshopping2 -> 

Label (O, B-Product, I-PRICE, etc.):  O


68: አዲስ -> 

Label (O, B-Product, I-PRICE, etc.):  B-LOC


69: አበባ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


70: ዉስጥ -> 

Label (O, B-Product, I-PRICE, etc.):  O


71: ከ150ብር -> 

Label (O, B-Product, I-PRICE, etc.):  B-PRICE


72: እስከ -> 

Label (O, B-Product, I-PRICE, etc.):  I-PRICE


73: 200ብር -> 

Label (O, B-Product, I-PRICE, etc.):  I-PRICE


74: ብቻ -> 

Label (O, B-Product, I-PRICE, etc.):  I-PRICE


75: በማስከፈል -> 

Label (O, B-Product, I-PRICE, etc.):  O


76: ያሉበት -> 

Label (O, B-Product, I-PRICE, etc.):  O


77: ድረስ -> 

Label (O, B-Product, I-PRICE, etc.):  O


78: በፈጣን -> 

Label (O, B-Product, I-PRICE, etc.):  O


79: ሞተረኞቻችን -> 

Label (O, B-Product, I-PRICE, etc.):  O


80: እንልክልዏታለን። -> 

Label (O, B-Product, I-PRICE, etc.):  O


81: ለክፍለ -> 

Label (O, B-Product, I-PRICE, etc.):  O


82: ሀገር -> 

Label (O, B-Product, I-PRICE, etc.):  O


83: ደንበኞቻችን -> 

Label (O, B-Product, I-PRICE, etc.):  O


84: በመነሐሪያ -> 

Label (O, B-Product, I-PRICE, etc.):  O


85: በኩል -> 

Label (O, B-Product, I-PRICE, etc.):  O


86: አድርገን -> 

Label (O, B-Product, I-PRICE, etc.):  O


87: በሹፌር -> 

Label (O, B-Product, I-PRICE, etc.):  O


88: እንልካለን። -> 

Label (O, B-Product, I-PRICE, etc.):  O


89: አድራሻ -> 

Label (O, B-Product, I-PRICE, etc.):  B-LOC


90: ቁጥር -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


91: 1 -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


92: ጉርድሾላ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


93: ከሴንቸሪ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


94: ሞል -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


95: ትንሽ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


96: ዝቅ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


97: እንዳሉ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


98: ሆሊሲቲ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


99: ሴንተር -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


100: ላይ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


101: እንደገቡ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


102: ፊትለፊት -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


103: ከሊፍቱ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


104: በግራ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


105: በኩል -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


106: ሚዛን -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


107: ላይ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


108: m06 -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


109: ቁጥር -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


110: 2 -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


111: ጀሞ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


112: መስታወት -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


113: ፋብሪካ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


114: ጭስ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


115: መዉጫዉ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


116: ፊትለፊት -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


117: ራሐ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


118: ሞል -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


119: ግራዉንድ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


120: ፍሎር -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


121: ከደረጃዉ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


122: ጎንትልቁ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


123: ሚና -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


124: ፈርኒቸር -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


125: ያለበት -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


126: ህንፃ -> 

Label (O, B-Product, I-PRICE, etc.):  I-LOC


127: በሞደርን -> 

Label (O, B-Product, I-PRICE, etc.):  O


128: እቃወዏች -> 

Label (O, B-Product, I-PRICE, etc.):  O


129: ሂወትዎን -> 

Label (O, B-Product, I-PRICE, etc.):  O


130: ሞደርናይዝ -> 

Label (O, B-Product, I-PRICE, etc.):  O


131: ያድርጉ -> 

Label (O, B-Product, I-PRICE, etc.):  O


✅ Labeled 132 tokens in Sample 50


### Save the labled data

In [68]:
with open("../data/labeled_data.conll", "w", encoding="utf-8") as f:
    for sentence in labeled_data:
        for token, label in sentence:
            f.write(f"{token} {label}\n")
        f.write("\n")
print(" Saved")


 Saved


In [69]:
# Read and print the content of labeled_data.conll
with open("../data/labeled_data.conll", "r", encoding="utf-8") as file:
    content = file.read()
    print(content)


skechers B-Product
archfit B-Product
size O
40 O
41 O
42 O
43 O
price B-PRICE
3400 I-PRICE
birr I-PRICE
አድራሻ B-LOC
- I-LOC
ሜክሲኮ I-LOC
ኮሜርስ I-LOC
ጀርባ I-LOC
መዚድ I-LOC
ፕላዛ I-LOC
የመጀመሪያ I-LOC
ደረጃ I-LOC
እንደወጡ I-LOC
101 I-LOC
የቢሮ I-LOC
ቁጥር I-LOC
ያገኙናል I-LOC
or O
call O
0920238243 O
ethiobrand O

magic O
silicone B-Product
dish B-Product
washing O
gloves B-Product
high O
quality O
ወፍራሙ O
ዕቃ B-Product
ለማጠብ O
ቤት O
ለማፅዳት O
መኪና O
ለማጠብ O
ለተለያየ O
አገልግሎት O
የሚውል O
ዋጋ፦ B-PRICE
350ብር I-PRICE
ውስን O
ፍሬ O
ነው O
ያለው O
አድራሻ B-LOC
መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ I-LOC
ቢሮ I-LOC
ቁ I-LOC
s05s06 I-LOC
0902660722 O
0928460606 O
በtelegram O
ለማዘዝ O
ይጠቀሙ O
ለተጨማሪ O
ማብራሪያ O
የቴሌግራም O
ገፃችን O

nike B-Product
just O
do O
it O
size O
39 O
40 O
41 O
42 O
43 O
price B-PRICE
3400 I-PRICE
birr I-PRICE
አድራሻ B-LOC
- I-LOC
ሜክሲኮ I-LOC
ኮሜርስ I-LOC
ጀርባ I-LOC
መዚድ I-LOC
ፕላዛ I-LOC
የመጀመሪያ I-LOC
ደረጃ I-LOC
እንደወጡ I-LOC
101 I-LOC
የቢሮ I-LOC
ቁጥር I-LOC
ያገኙናል I-LOC
or O
call O
0920238243 O
ethiobrand O

air O
jordan B-ProducT
stay O
loyal O
size O
40 O
