In [3]:
import spacy
import json
import pandas as pd
import os

In [4]:
# Load spaCy's pre-trained pipeline; its entity annotations (token.ent_type_)
# drive the automatic labeling further down in this notebook.
# NOTE(review): "en_core_web_sm" looks like an English pipeline, while the
# messages printed later in this notebook are mostly Amharic — the predicted
# entities may be unreliable for that text; confirm this is the intended model.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Load the processed dataset (from Task 1)
def load_processed_data(file_path):
    """
    Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON text.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [6]:
# Path to the processed dataset, relative to the notebook's working directory.
processed_data_path = '../data/processed/processed_messages.json'
# Decode the JSON file into Python objects; `messages` is turned into a
# DataFrame in the next cell.
messages = load_processed_data(processed_data_path)

In [7]:
# Convert to a DataFrame for easier manipulation.
# Later cells read the 'id', 'clean_message' and 'tokens' columns, so the loaded
# records are presumably dicts carrying those keys — verify against the Task 1 output.
df = pd.DataFrame(messages)

In [8]:
# Take the first 50 messages whose 'tokens' value is a non-empty list,
# keeping only the columns needed for labeling.
subset_df = (
    df[['id', 'clean_message', 'tokens']]
    .dropna(subset=['tokens'])
    .loc[lambda frame: frame['tokens'].apply(
        lambda toks: isinstance(toks, list) and len(toks) > 0
    )]
    .head(50)
)

In [9]:
# Renumber the rows 0..n-1 and discard the original index labels.
subset_df = subset_df.reset_index(drop=True)

In [10]:
# Function to automate labeling using spaCy
def automate_labeling(tokens):
    """
    Automatically label tokens using the pre-trained spaCy pipeline (`nlp`).

    The tokens are joined with single spaces, run through `nlp`, and every
    resulting spaCy token is tagged in BIO format: the first token of a
    recognized entity gets a ``B-`` tag and each following token of the same
    entity gets an ``I-`` tag, so multi-token entities stay distinguishable
    from several adjacent single-token entities.

    Entity mapping (spaCy type -> project label):
        GPE     -> LOC
        MONEY   -> PRICE
        PRODUCT -> Product
    Every other type (including PERSON and ORG) and every token outside an
    entity is labeled 'O'.

    Args:
        tokens: Sequence of token strings belonging to one message.

    Returns:
        list[tuple[str, str]]: One ``(token_text, label)`` pair per spaCy token.

    NOTE(review): spaCy tokenizes the joined string itself, so the returned
    tokens are not guaranteed to line up one-to-one with the input `tokens`.
    """
    # Only these spaCy entity types are kept; anything else falls back to 'O'.
    entity_names = {
        'GPE': 'LOC',
        'MONEY': 'PRICE',
        'PRODUCT': 'Product',
    }

    doc = nlp(" ".join(tokens))  # Create a spaCy document from the tokens
    labeled_tokens = []

    for token in doc:
        entity = entity_names.get(token.ent_type_)
        if entity is None:
            label = 'O'
        else:
            # ent_iob_ is 'B' on the first token of an entity and 'I' on the
            # tokens that continue it; previously every token was tagged 'B-'.
            prefix = 'I' if token.ent_iob_ == 'I' else 'B'
            label = f"{prefix}-{entity}"

        labeled_tokens.append((token.text, label))

    return labeled_tokens

In [11]:
# Apply automated labeling to the entire subset
def label_subset_automatically(df):
    """
    Auto-label every message in `df` and collect the results.

    For each row, the message id and cleaned text are printed, then the row's
    token list is passed to `automate_labeling`.

    Args:
        df: DataFrame with 'id', 'clean_message' and 'tokens' columns.

    Returns:
        list: One labeled-token list per row, in row order.
    """
    results = []
    per_message = zip(df['id'], df['clean_message'], df['tokens'])
    for message_id, message_text, token_list in per_message:
        print(f"\nAuto-labeling message ID: {message_id}\nMessage: {message_text}\n")
        results.append(automate_labeling(token_list))
    return results

In [12]:
# Function to save the labeled data in CoNLL format
def save_conll_format(labeled_data, output_path):
    """
    Save labeled data in CoNLL format.

    Each token is written as ``"<token> <label>"`` on its own line, and every
    message is followed by a blank line. Messages with no labeled tokens are
    skipped. If `labeled_data` is empty, a notice is printed and no file is
    written.

    Args:
        labeled_data: List of messages, each a list of ``(token, label)`` pairs.
        output_path: Destination file path. Missing parent directories are
            created; a bare filename (no directory part) is written to the
            current working directory.

    Returns:
        None
    """
    if not labeled_data:
        print("No labeled data to save.")
        return

    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for message in labeled_data:
            if not message:  # Ensure there's labeled data
                continue
            f.writelines(f"{token} {label}\n" for token, label in message)
            f.write("\n")  # Separate messages with a newline
    print(f"Labeled data saved to {output_path}")

In [13]:
# Automate the labeling for the subset of 50 messages.
# This prints the id and full cleaned text of every message as it is processed,
# so the cell output can be long.
labeled_data = label_subset_automatically(subset_df)


Auto-labeling message ID: 14898
Message: mama bag ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር Free delivery 0909003864 0905707448 ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉhttpstmesinayelj እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ sinasinaye httpstmesinayelj2 አድራሻ 1ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል 2ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል 3ቁጥር3 ብስራተ ገብርኤል ላፍቶ ሞል መግቢያው ፊት ለፊት የሚገኘው የብስራተ ገብርኤል ቤተ ክርስቲያን ህንፃ አንደኛ ፎቅ ላይ ደረጃ እንደወጣቹ በስተግራ በኩል ሱቅ ቁጥር FF09 ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ፕላዛ ላይ አራት ኪሎ ቅድስት ስላሴ እንዲሁም ብስራተ ገብሬል ያሉት ሱቆቻችን ሲመጡ አስተማማኝ ሰፊ ፓርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን


Auto-labeling message ID: 14897
Message: ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 0909003864 0905707448 እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ sinasinaye sinayalj2 አድራሻ 1ቁጥር1 ገርጂ ኢምፔሪያል ከሳሚ ህንፃ ጎን አልፎዝ ፕላዛ ግራውንድ ላይ እንደገቡ ያገኙናል 2ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስፓልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል 3ቁጥር3 ብስራተ ገብር

In [14]:
# Save the labeled data to a file (path is relative to the notebook's working directory).
output_path = '../data/labeled/automated_labeled_data.conll'
# Writes one "<token> <label>" line per token, with a blank line after each message.
save_conll_format(labeled_data, output_path)

Labeled data saved to ../data/labeled/automated_labeled_data.conll
