In [3]:
# Load the necessary libraries
from pathlib import Path

import pandas as pd

# Read the scraped Telegram messages (expects the scraper's CSV export at this relative path)
df = pd.read_csv(filepath_or_buffer='../data/raw/telegram_data.csv')

# Preview the top five rows to get a feel for the columns
df.head(5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,qnash.com - ቅናሽ ®️,@qnashcom,4408,⚡️Kemei ® Rechargeable Hair Clipper \nየፀጉር መቁረ...,2025-01-18 13:43:51+00:00,data/raw/photos/@qnashcom_4408.jpg
1,qnash.com - ቅናሽ ®️,@qnashcom,4407,⚡️Kemei ® Rechargeable Hair Clipper \nየፀጉር መቁረ...,2025-01-18 08:52:14+00:00,
2,qnash.com - ቅናሽ ®️,@qnashcom,4405,📣 Hair Steamer Cap\n🔼 High Quality \n\n➡️የፀጉር ...,2025-01-18 07:38:38+00:00,data/raw/photos/@qnashcom_4405.jpg
3,qnash.com - ቅናሽ ®️,@qnashcom,4404,📣 Hair Steamer Cap\n🔼 High Quality \n\n➡️የፀጉር ...,2025-01-17 16:41:32+00:00,
4,qnash.com - ቅናሽ ®️,@qnashcom,4403,📣 Hair Steamer Cap\n🔼 High Quality \n\n➡️የፀጉር ...,2025-01-17 11:35:59+00:00,


In [None]:
# Report how many rows lack a message before they are removed
print("Checking for NaN values in the 'Message' column:")
nan_count = df.loc[:, 'Message'].isna().sum()
print(f"Number of NaN values in 'Message' column: {nan_count}")

# Discard every row whose 'Message' is missing
df = df.dropna(axis=0, how='any', subset=['Message'])

# Report the dimensions that remain after the cleanup
print(f"Dataset shape after dropping NaN values in 'Message' column: {df.shape}")

In [None]:
# Function to remove emojis from the text
import re

# Compiled once when the cell runs rather than on every call made through .apply().
# Each entry is an explicit Unicode block (or single code point). The previous
# character class contained the range U+24C2-U+1F251, which spans most of the
# Basic Multilingual Plane and therefore also deleted CJK, kana, Hangul,
# box-drawing and full-width characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F2FF"  # tiles, playing cards, enclosed alphanumerics/ideographs, flags
    "\U00002600-\U000026FF"  # Miscellaneous Symbols (e.g. U+26A1 high voltage)
    "\U00002700-\U000027BF"  # Dingbats
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002900-\U0000297F"  # Supplemental Arrows-B
    "\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji sequences
    "\U000024C2"             # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "]+"
)


def remove_emojis(text):
    """
    Return ``text`` with emoji and pictographic symbols removed.

    Only the code points listed in ``_EMOJI_PATTERN`` are stripped; letters,
    digits, punctuation and whitespace of any script are left untouched.

    Parameters:
        text (str): the message to clean.

    Returns:
        str: ``text`` without the matched symbols. Surrounding whitespace is
        not trimmed, so a leading emoji followed by a space leaves that space.
    """
    return _EMOJI_PATTERN.sub('', text)

# Replace every message with its emoji-free version
df['Message'] = df['Message'].map(remove_emojis)

# Preview the cleaned rows
df.head(5)


In [None]:
# We will define a function to label the data for NER, focusing on product names, prices, and locations.

# Plain numbers such as "1500" or "99.50" are treated as price amounts.
# (A separate "ten or more digits" test used previously matched a subset of this.)
_PRICE_NUMBER = re.compile(r'^\d+(\.\d{1,2})?$')

# A token containing any of these currency markers is tagged as price.
# "\u1265\u122D" is the Amharic word ብር (birr); escapes keep the literal
# immune to file-encoding damage.
_PRICE_MARKERS = ("\u1265\u122D", "ETB", "$")

# Known locations, each written as the sequence of tokens it spans, so that a
# multi-word name can be matched against whitespace-split tokens.
# Add more locations as needed.
_LOCATION_PHRASES = (
    ("Addis", "Ababa"),
    ("\u1266\u120C",),  # ቦሌ (Bole)
)


def label_message_with_entities(message):
    """
    Tag every whitespace-separated token of ``message`` with exactly one label.

    Labels:
    - B-PRODUCT / I-PRODUCT: tokens on the first non-blank line of the message.
      This is a heuristic: the first line is assumed to carry the product name.
      B- marks the first token of each uninterrupted product run.
    - I-PRICE: plain numbers (e.g. 1000, 99.50) and tokens containing a
      currency marker (ብር, ETB, $).
    - I-LOC: tokens belonging to a known location (e.g. Addis Ababa, ቦሌ).
    - O: everything else.

    When several rules match one token, location wins over price, and price
    wins over the product/O default. Price and location only ever use the I-
    prefix; no B-PRICE or B-LOC tag is emitted.

    Parameters:
        message (str): the message text to label.

    Returns:
        str: one "token LABEL" pair per line, joined with newlines. An empty
        string is returned when the message has no tokens or is not a string
        (e.g. a NaN cell).
    """
    if not isinstance(message, str):
        return ""

    # Tokenise line by line so the tokens of the first non-blank line can
    # default to PRODUCT while all later tokens default to O.
    words = []
    labels = []
    title_line_done = False
    for line in message.splitlines():
        line_words = line.split()
        if not line_words:
            continue
        default_label = "O" if title_line_done else "PRODUCT"
        words.extend(line_words)
        labels.extend([default_label] * len(line_words))
        title_line_done = True

    # Price: a bare number, or any token containing a currency marker.
    for idx, word in enumerate(words):
        if _PRICE_NUMBER.match(word) or any(marker in word for marker in _PRICE_MARKERS):
            labels[idx] = "I-PRICE"

    # Location: slide each phrase over the tokens. Substring containment is
    # used so that attached punctuation ("Ababa,") still matches.
    for phrase in _LOCATION_PHRASES:
        width = len(phrase)
        for start in range(len(words) - width + 1):
            window = words[start:start + width]
            if all(part in word for part, word in zip(phrase, window)):
                labels[start:start + width] = ["I-LOC"] * width

    # Resolve the PRODUCT placeholder into B-/I- tags.
    inside_product = False
    for idx, label in enumerate(labels):
        if label == "PRODUCT":
            labels[idx] = "I-PRODUCT" if inside_product else "B-PRODUCT"
            inside_product = True
        else:
            inside_product = False

    return "\n".join(f"{word} {label}" for word, label in zip(words, labels))

# Produce the token/label annotation for every message
df['Labeled_Message'] = df['Message'].map(label_message_with_entities)

# Preview each message next to its annotation
df.loc[:, ['Message', 'Labeled_Message']].head(5)

In [None]:
# 4. Save Labeled Data

# Save the labeled dataset to a file in CoNLL format: one "token label" pair
# per line, with a blank line separating consecutive messages.
# The path is anchored at ../data so the output lands in the same data tree
# that the raw CSV is read from, not in a folder relative to the notebook.
labeled_data_path = '../data/labeled/labeled_telegram_product_price_location.txt'

# Create the target folder on first run so open() cannot fail on a missing directory
Path(labeled_data_path).parent.mkdir(parents=True, exist_ok=True)

with open(labeled_data_path, 'w', encoding='utf-8') as f:
    # Only the annotation column is needed, so iterate it directly
    for labeled_message in df['Labeled_Message']:
        # Skip messages with no tokens; they would otherwise be written as
        # empty "sentences" (runs of blank lines)
        if labeled_message:
            f.write(f"{labeled_message}\n\n")

print(f"Labeled data saved to {labeled_data_path}")