# **Task 2: Label a Subset of the Dataset in CoNLL Format**

In [1]:
# Import necessary libraries
import pandas as pd
import logging
import os, sys
import matplotlib.pyplot as plt
from matplotlib import font_manager
from collections import Counter
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
# Import data preprocessor class
from text_processor import AmharicTextPreprocessor
from labeler import AmharicNERLabeler

# Let pandas render up to 200 columns and 200 rows before truncating output
pd.set_option('display.max_columns', 200, 'display.max_rows', 200)

# Route INFO-and-above records through the root logger using a
# "timestamp - level - message" layout
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

logger.info("Imported libraries and configured logging.")

2024-10-11 10:43:35,510 - INFO - Imported libraries and configured logging.


# Load the data

In [10]:
# Load the scraped Telegram messages from the CSV export
data = pd.read_csv(filepath_or_buffer='../data/telegram_data.csv')
# Show the top five rows as a quick sanity check of the loaded columns
data.head(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14901,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14901.jpg
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14900,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14900.jpg
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14899,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14899.jpg
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14898,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14898.jpg
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14897,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,../data/photos/@sinayelj_14897.jpg


# Preprocess text data

In [11]:
# Preprocess the Amharic text in the raw 'Message' column.
#
# This runs at cell level (no `if __name__ == "__main__":` guard) because later
# cells read `tokens`; a guard would leave `tokens` undefined whenever this code
# is executed outside an interactive kernel.
preprocessor = AmharicTextPreprocessor()

# `tokens` is the DataFrame returned by preprocess_dataframe. In the rendered
# output it carries the original columns plus a 'preprocessed_message' column
# holding the cleaned text (NaN where 'Message' was empty).
# NOTE(review): the name suggests a token list, but the value is a DataFrame —
# it is kept as-is because later cells reference `tokens`.
tokens = preprocessor.preprocess_dataframe(data, 'Message')
display(tokens)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,preprocessed_message
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14901,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14901.jpg,
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14900,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14900.jpg,
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14899,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14899.jpg,
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14898,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14898.jpg,ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር 0909003864 090570...
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14897,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,../data/photos/@sinayelj_14897.jpg,ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 09090...
...,...,...,...,...,...,...,...
4651,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,599,,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_599.jpg,
4652,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,598,Baby potty\n0905707448\n0945097042,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_598.jpg,0905707448 0945097042
4653,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,197,,2020-11-26 18:30:52+00:00,,
4654,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,182,ውድ የሲና ኪድስ ደምበኞች በድጋሚ ገብቷል \nየመዋኛ ገንዳ ትልቅ ሳይዝ ...,2020-11-10 06:59:31+00:00,../data/photos/@sinayelj_182.jpg,ውድ የሲና ኪድስ ደምበኞች በድጋሚ ገብቷል የመዋኛ ገንዳ ትልቅ ሳይዝ የራ...


In [12]:
# Drop rows whose raw 'Message' value is NaN, modifying `data` in place.
# NOTE(review): `tokens` was derived from `data` before this call; whether the two
# names refer to the same object depends on preprocess_dataframe, which is not
# visible here — confirm before relying on `tokens` still containing the NaN rows.
data.dropna(subset='Message', inplace=True)

In [13]:
# Keep only the rows that actually have cleaned text, as a plain Python list
has_text = tokens['preprocessed_message'].notna()
preprocessed_texts = tokens.loc[has_text, 'preprocessed_message'].tolist()
# Rebuild `data` as a two-column frame: a 0..n-1 'index' column plus 'message'
data = pd.DataFrame({'message': preprocessed_texts}).reset_index()

In [14]:
# Preview the first rows of the rebuilt frame ('index' and 'message' columns)
data.head()

Unnamed: 0,index,message
0,0,ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር 0909003864 090570...
1,1,ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 09090...
2,2,ልጆች ዳዴ ማለት እንዲሉ የሚለማመዱበት በባትሪ የሚሰራ የልጆች መጫዎቻ 1...
3,3,የልጆችን ቀልብ የሚገዛ ውብ ምቹ የሆነ መነሳት የሚችል ስንጅ የሆነ መቀመ...
4,4,የልጆችን ቀልብ የሚገዛ ውብ ምቹ የሆነ መነሳት የሚችል ስንጅ የሆነ መቀመ...


# Label a Subset of the Dataset in CoNLL Format

In [16]:
# Initialize the labeler
labeler = AmharicNERLabeler()

# Rebuild the message table from `tokens` so this cell produces the same input
# on every run, independent of anything earlier cells did to `data`.
# reset_index() adds a 0..n-1 'index' column alongside 'message'.
preprocessed_texts = tokens['preprocessed_message'].dropna().tolist()
data = pd.Series(preprocessed_texts).reset_index(name='message')

# Whitespace-tokenize every message in one vectorized call; each cell of
# 'Tokenized' holds the list of tokens for that message.
data['Tokenized'] = data['message'].str.split()

# Label the tokens in the DataFrame
labeled_data = labeler.label_dataframe(data, 'Tokenized')

# Save to CoNLL format (written one directory above the notebook)
labeler.save_conll_format(labeled_data, '../telegram_labeled_data.conll')

In [17]:
# Remove the helper 'index' column from the labeled frame, in place.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
labeled_data.drop(columns=['index'], inplace=True, errors='ignore')

In [18]:
# Count rows whose 'message' text repeats an earlier row (first occurrence not counted)
labeled_data.duplicated(subset=['message']).sum()

np.int64(274)