In [1]:
import pandas as pd
import re

# Load the scraped Telegram messages and keep only rows that carry text.
csv_file_path = '../data/telegram_data.csv'
df = pd.read_csv(csv_file_path)
# Missing text becomes '' so that the emptiness test below also covers it.
df['text'] = df['text'].fillna('')
has_text = df['text'].str.strip().astype(bool)
df = df[has_text]
# Snapshot of the filtered frame, taken before any derived columns are added.
df_backup = df.copy()

In [2]:
# Data Cleaning 

def clean_text(text):
    """
    Strip noise from a raw message and return the cleaned string.

    Steps, applied in this order:
    - Non-string input is not converted; it yields an empty string.
    - Removes runs that start with "http" or "www" up to the next whitespace.
    - Removes emoji that fall inside the four code point ranges listed below.
    - Removes @mentions and #hashtags made of ASCII letters, digits and "_".
    - Removes every character that is not in U+1200-U+137F, not whitespace,
      not a word character, and not one of . , ! ? -
      (digits are word characters, so numbers are kept).
    - Collapses whitespace runs to a single space and trims both ends.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, flags) pairs; each match is deleted, in the order given.
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+', 0),
        (r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+', 0),
        (r'[^\u1200-\u137F\s\w.,!?-]+', 0),
    )
    cleaned = text
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    return re.sub(r'\s+', ' ', cleaned).strip()

# Element-wise cleaning of every raw message into a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [3]:
# Normalize
def normalize_text(text):
    """
    Case-fold a string, delete code points U+1369-U+1378, and tidy whitespace.

    Non-string input yields an empty string. Otherwise the text is trimmed,
    case-folded, stripped of the sixteen code points U+1369 through U+1378,
    and finally has every whitespace run collapsed to one space.
    """
    if not isinstance(text, str):
        return ""

    # NOTE(review): U+1369-U+1378 appear to be Ethiopic numerals; the range
    # stops short of U+1379-U+137C, which are left in place - confirm intent.
    drop_table = {code_point: None for code_point in range(0x1369, 0x1379)}

    folded = text.strip().casefold()
    without_dropped = folded.translate(drop_table)
    return re.sub(r'\s+', ' ', without_dropped).strip()
# Element-wise normalization of the cleaned messages into a new column.
df['normalized_text'] = df['cleaned_text'].map(normalize_text)

In [4]:
# Tokenization
def tokenize(text):
    """
    Split a string into word tokens and single punctuation tokens.

    A word token is a maximal run of characters that are either in
    U+1200-U+137F or word characters; each of . , ! ? - becomes its own
    one-character token. Anything else is skipped. Non-string input
    returns an empty list.
    """
    if isinstance(text, str):
        # Every match is at least one character long, so none can be empty.
        return [found.group(0) for found in re.finditer(r'[\u1200-\u137F\w]+|[.,!?-]', text)]
    return []

# Element-wise tokenization of the normalized messages into a new column.
df['tokens'] = df['normalized_text'].map(tokenize)

In [5]:
# Draw a reproducible 50-row sample of token lists for manual labeling.
sample_df = (
    df[['tokens']]
    .dropna()
    .sample(50, random_state=42)
    .reset_index(drop=True)
)

# Remove dot and punctuation tokens.
# Note: '-' is not in this drop set, so hyphen tokens stay in the sample.
_punct_tokens = {'.', '..', '...', ',', '!', '?'}


def _without_punct(toks):
    """Return the tokens of one message minus those in `_punct_tokens`."""
    return [tok for tok in toks if tok not in _punct_tokens]


sample_df['tokens'] = sample_df['tokens'].apply(_without_punct)

In [6]:
def load_conll(path):
    """
    Read a CoNLL-style file into a list of labeled sentences.

    Each non-blank line is expected to be "<token> <label>", separated by
    whitespace; the split is taken from the right, so the label is the last
    whitespace-free field. Blank lines separate sentences. Lines without a
    separator are skipped.

    Args:
        path: Location of the file to read (opened as UTF-8).

    Returns:
        A list of sentences, each a list of (token, label) tuples.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    sentences = []
    current = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current:
                    sentences.append(current)
                    current = []
                continue
            # Split on any whitespace run so tab- or multi-space separated
            # files parse the same way as single-space ones.
            parts = line.rsplit(None, 1)
            if len(parts) == 2:
                current.append((parts[0], parts[1]))
    # A file that does not end with a blank line still has a pending
    # sentence here; without this flush it would be lost.
    if current:
        sentences.append(current)
    return sentences


labeled_data = []
try:
    labeled_data = load_conll("../data/labeled_data.conll")
    print(f"✅ Loaded {len(labeled_data)} labeled samples from file.")
except FileNotFoundError:
    print("⚠️ No saved labels found. Starting from scratch.")
    labeled_data = []

✅ Loaded 10 labeled samples from file.


In [7]:
# Choose one sentence to label
# sample_index = 0  # Change this to 1, 2, 3... later

# tokens = sample_df.loc[sample_index, 'tokens']

# print(f"Sample {sample_index + 1}/{len(sample_df)}:")
# for i, token in enumerate(tokens):
#     print(f"{i}: {token}")


In [8]:
def label_sentence(index, valid_labels=None):
    """
    Interactively label every token of one sampled message.

    Reads the token list at row `index` of the global `sample_df`, prompts
    for one label per token, and appends the finished list of
    (token, label) tuples to the global `labeled_data`. Nothing is appended
    until every token has been answered.

    An empty answer means "O". An answer containing whitespace is rejected
    and asked again until it is valid, because the labels are later stored
    one "<token> <label>" pair per line.

    Args:
        index: Row label in `sample_df` of the message to annotate.
        valid_labels: Optional collection of accepted labels. When given,
            any non-empty answer outside it is rejected and asked again
            (catches typos such as wrong capitalization). When None, any
            whitespace-free label is accepted.
    """
    tokens = sample_df.loc[index, 'tokens']
    labeled_sentence = []

    print(f"\nSample {index + 1}/{len(sample_df)}:")

    for i, token in enumerate(tokens):
        print(f"{i}: {token}", end=" -> ")
        label = input("Label (O, B-Product, I-PRICE, etc.): ").strip()

        # Keep asking until the answer is usable; a single retry is not
        # enough because the retry itself may be malformed.
        while True:
            if any(ch.isspace() for ch in label):
                print("❌ Labels should not contain spaces. Try again.")
                label = input("Label (no space!): ").strip()
            elif label and valid_labels is not None and label not in valid_labels:
                print(f"❌ Unknown label '{label}'. Try again.")
                label = input("Label (O, B-Product, I-PRICE, etc.): ").strip()
            else:
                break

        if label == "":
            label = "O"

        labeled_sentence.append((token, label))

    labeled_data.append(labeled_sentence)
    print(f"✅ Labeled {len(tokens)} tokens in Sample {index + 1}")

In [None]:
# Row of sample_df to annotate in this run (0-based; shown as index + 1).
# sample_index = len(labeled_data)
sample_index = 12
label_sentence(sample_index)


Sample 13/50:
0: vaccuum -> 

Label (O, B-Product, I-PRICE, etc.):  B-Product


1: flask -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


2: set -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


3: የፔርሙዝ -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


4: ማግ -> 

Label (O, B-Product, I-PRICE, etc.):  I-Product


5: 3 -> 

Label (O, B-Product, I-PRICE, etc.):  O


6: መጠጫ -> 

Label (O, B-Product, I-PRICE, etc.):  O


7: ኩባያዎች -> 

Label (O, B-Product, I-PRICE, etc.):  B-Product


8: ያሉት -> 

Label (O, B-Product, I-PRICE, etc.):  O


9: ለ -> 

Label (O, B-Product, I-PRICE, etc.):  O


10: 12 -> 

Label (O, B-Product, I-PRICE, etc.):  O


11: ሰአታት -> 

Label (O, B-Product, I-PRICE, etc.):  O


12: የሞቀዉን -> 

Label (O, B-Product, I-PRICE, etc.):  O


13: አሙቆ -> 

Label (O, B-Product, I-PRICE, etc.):  O


14: የቀዘቀዘዉን -> 

Label (O, B-Product, I-PRICE, etc.):  O


15: አቀዝቅዞ -> 

Label (O, B-Product, I-PRICE, etc.):  O


16: ሚያስቀምጥ -> 

Label (O, B-Product, I-PRICE, etc.):  O


17: ድንገት -> 

Label (O, B-Product, I-PRICE, etc.):  O


18: ቢወድቅ -> 

Label (O, B-Product, I-PRICE, etc.):  O


19: እንኳን -> 

Label (O, B-Product, I-PRICE, etc.):  O


20: ከማይሰበር -> 

Label (O, B-Product, I-PRICE, etc.):  O


21: ማቴሪያል -> 

Label (O, B-Product, I-PRICE, etc.):  O


22: የተሰራ -> 

Label (O, B-Product, I-PRICE, etc.):  O


23: 500ሚሊ -> 

Label (O, B-Product, I-PRICE, etc.):  O


24: ሊትር -> 

Label (O, B-Product, I-PRICE, etc.):  O


25: የመያዝ -> 

Label (O, B-Product, I-PRICE, etc.):  O


26: አቅም -> 

Label (O, B-Product, I-PRICE, etc.):  O


27: ያለዉ -> 

Label (O, B-Product, I-PRICE, etc.):  O


28: በተለያየ -> 

Label (O, B-Product, I-PRICE, etc.):  O


29: የከለር -> 

Label (O, B-Product, I-PRICE, etc.):  O


30: አማራጭ -> 

Label (O, B-Product, I-PRICE, etc.):  O


31: የቀረበ -> 

Label (O, B-Product, I-PRICE, etc.):  O


32: ለስጦታ -> 

Label (O, B-Product, I-PRICE, etc.):  O


33: የሚሆን -> 

Label (O, B-Product, I-PRICE, etc.):  O


34: የራሱ -> 

Label (O, B-Product, I-PRICE, etc.):  O


35: ማሸጊያና -> 

Label (O, B-Product, I-PRICE, etc.):  O


36: መያዣ -> 

Label (O, B-Product, I-PRICE, etc.):  O


37: ያለዉ -> 

Label (O, B-Product, I-PRICE, etc.):  O


38: ዋጋ፦ -> 

Label (O, B-Product, I-PRICE, etc.):  O


39: 1200 -> 

In [10]:
# Persist all labeled sentences: one "<token> <label>" line per token and a
# blank line after each sentence. The file is overwritten on every run.
with open("../data/labeled_data.conll", "w", encoding="utf-8") as conll_out:
    for labeled_sentence in labeled_data:
        sentence_lines = "".join(f"{token} {label}\n" for token, label in labeled_sentence)
        conll_out.write(sentence_lines)
        conll_out.write("\n")
print("✅ Saved labeled data.")


✅ Saved labeled data.


In [3]:
# Dump the saved CoNLL file to check what was written.
with open("../data/labeled_data.conll", "r", encoding="utf-8") as conll_in:
    print(conll_in.read())


skechers B-Product
archfit B-Product
size O
40 O
41 O
42 O
43 O
price B-PRICE
3400 I-PRICE
birr I-PRICE
አድራሻ B-LOC
- I-LOC
ሜክሲኮ I-LOC
ኮሜርስ I-LOC
ጀርባ I-LOC
መዚድ I-LOC
ፕላዛ I-LOC
የመጀመሪያ I-LOC
ደረጃ I-LOC
እንደወጡ I-LOC
101 I-LOC
የቢሮ I-LOC
ቁጥር I-LOC
ያገኙናል I-LOC
or O
call O
0920238243 O
ethiobrand O

magic O
silicone B-Product
dish B-Product
washing O
gloves B-Product
high O
quality O
ወፍራሙ O
ዕቃ B-Product
ለማጠብ O
ቤት O
ለማፅዳት O
መኪና O
ለማጠብ O
ለተለያየ O
አገልግሎት O
የሚውል O
ዋጋ፦ B-PRICE
350ብር I-PRICE
ውስን O
ፍሬ O
ነው O
ያለው O
አድራሻ B-LOC
መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ I-LOC
ቢሮ I-LOC
ቁ I-LOC
s05s06 I-LOC
0902660722 O
0928460606 O
በtelegram O
ለማዘዝ O
ይጠቀሙ O
ለተጨማሪ O
ማብራሪያ O
የቴሌግራም O
ገፃችን O

nike B-Product
just O
do O
it O
size O
39 O
40 O
41 O
42 O
43 O
price B-PRICE
3400 I-PRICE
birr I-PRICE
አድራሻ B-LOC
- I-LOC
ሜክሲኮ I-LOC
ኮሜርስ I-LOC
ጀርባ I-LOC
መዚድ I-LOC
ፕላዛ I-LOC
የመጀመሪያ I-LOC
ደረጃ I-LOC
እንደወጡ I-LOC
101 I-LOC
የቢሮ I-LOC
ቁጥር I-LOC
ያገኙናል I-LOC
or O
call O
0920238243 O
ethiobrand O

air O
jordan B-ProducT
stay O
loyal O
size O
40 O
