# **Task 1: Data Ingestion and Data Preprocessing**

In [34]:
# Import necessary libraries
import pandas as pd
import logging
import os, sys
import matplotlib.pyplot as plt
from matplotlib import font_manager
from collections import Counter
# Add the 'scripts' directory to the Python path for module imports
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
# Import data preprocessor class
from text_processor import AmharicTextPreprocessor

# Let pandas render up to 200 columns / 200 rows before truncating the display
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Send INFO-and-above records through the root logger with a timestamped layout
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Emit a first record so the reader can see that logging is active
logger.info("Imported libraries and configured logging.")

2024-10-11 10:23:59,420 - INFO - Imported libraries and configured logging.


# Load the data

In [35]:
# Load the scraped Telegram messages from the project's data directory
data = pd.read_csv(filepath_or_buffer='../data/telegram_data.csv')
# Preview the first five rows
data.head(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14901,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14901.jpg
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14900,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14900.jpg
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14899,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14899.jpg
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14898,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14898.jpg
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14897,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,../data/photos/@sinayelj_14897.jpg


In [36]:
# Preview the final five rows of the raw frame
data.tail(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
4651,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,599,,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_599.jpg
4652,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,598,Baby potty\n0905707448\n0945097042,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_598.jpg
4653,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,197,,2020-11-26 18:30:52+00:00,
4654,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,182,ውድ የሲና ኪድስ ደምበኞች በድጋሚ ገብቷል \nየመዋኛ ገንዳ ትልቅ ሳይዝ ...,2020-11-10 06:59:31+00:00,../data/photos/@sinayelj_182.jpg
4655,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,1,,2020-07-24 10:50:43+00:00,


In [37]:
# (rows, columns) of the raw frame
data.shape

(4656, 6)

In [38]:
# Count the missing entries in each column
data.isna().sum()

Channel Title          0
Channel Username       0
ID                     0
Message             3290
Date                   0
Media Path           311
dtype: int64

# Preprocess text data

In [39]:
# Clean the raw `Message` column with the project's Amharic text preprocessor.
# The result is bound to `tokens` because later cells read that name; per the
# displayed output it is a DataFrame carrying a `preprocessed_message` column.
# NOTE(review): later cells also read `data['preprocessed_message']`, so
# `preprocess_dataframe` presumably adds the column to `data` itself — confirm.
preprocessor = AmharicTextPreprocessor()
tokens = preprocessor.preprocess_dataframe(data, 'Message')

# A bare last expression gives the rich (truncated) DataFrame display
tokens

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,preprocessed_message
0,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14901,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14901.jpg,
1,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14900,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14900.jpg,
2,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14899,,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14899.jpg,
3,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14898,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,../data/photos/@sinayelj_14898.jpg,ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር 0909003864 090570...
4,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,14897,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,../data/photos/@sinayelj_14897.jpg,ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 09090...
...,...,...,...,...,...,...,...
4651,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,599,,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_599.jpg,
4652,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,598,Baby potty\n0905707448\n0945097042,2021-04-16 18:12:10+00:00,../data/photos/@sinayelj_598.jpg,0905707448 0945097042
4653,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,197,,2020-11-26 18:30:52+00:00,,
4654,SINA KIDS/ሲና ኪድስⓇ,@sinayelj,182,ውድ የሲና ኪድስ ደምበኞች በድጋሚ ገብቷል \nየመዋኛ ገንዳ ትልቅ ሳይዝ ...,2020-11-10 06:59:31+00:00,../data/photos/@sinayelj_182.jpg,ውድ የሲና ኪድስ ደምበኞች በድጋሚ ገብቷል የመዋኛ ገንዳ ትልቅ ሳይዝ የራ...


In [40]:
# Remove, in place, every row whose `Message` is missing.
# NOTE(review): kept in place on purpose — if `tokens` refers to the same
# object as `data`, the following cells see these rows dropped as well.
data.dropna(axis='index', subset=['Message'], inplace=True)

In [41]:
# Preview a handful of cleaned messages rather than dumping the whole column,
# which floods the notebook output with every (long) message.
data['preprocessed_message'].head().tolist()

['ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር 0909003864 0905707448 ሊንኩን በመጫን ቴሌግራማችንን ይቀላቀሉ/// እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ ///2 አድራሻ 1ቁጥር1 ገርጂ ኢምሪያል ከሳሚ ህንፃ ጎን አልፎዝ ላዛ ግራውንድ ላይ እንደገቡ ያገኙናል 2ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል 3ቁጥር3 ብስራተ ገብርኤል ላፍቶ ሞል መግቢያው ፊት ለፊት የሚገኘው የብስራተ ገብርኤል ቤተ ክርስቲያን ህንፃ አንደኛ ፎቅ ላይ ደረጃ እንደወጣቹ በስተግራ በኩል ሱቅ ቁጥር 09 ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ላዛ ላይ አራት ኪሎ ቅድስት ስላሴ እንዲሁም ብስራተ ገብሬል ያሉት ሱቆቻችን ሲመጡ አስተማማኝ ሰፊ ርኪንግ ያላቸው መሆናቸውን በታላቅ ደስታ እናበስራለን',
 'ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 0909003864 0905707448 እቃ ለማዘዝ ከስር ያለውን ሊንኮች በመጫን ማዘዝ ትችላላቹ 2 አድራሻ 1ቁጥር1 ገርጂ ኢምሪያል ከሳሚ ህንፃ ጎን አልፎዝ ላዛ ግራውንድ ላይ እንደገቡ ያገኙናል 2ቁጥር2 4ኪሎ ቅድስት ስላሴ ህንፃ ማለትም ከብልፅግና ዋናፅፈት ቤት ህንፃ በስተ ቀኝ ባለው አስልት 20ሜትር ዝቅ እንዳሉ ሀበሻ ኮፊ የሚገኝበት ቀይ ሸክላ ህንፃ 2ተኛ ፎቅ ላይ ያገኙናል 3ቁጥር3 ብስራተ ገብርኤል ላፍቶ ሞል መግቢያው ፊት ለፊት የሚገኘው የብስራተ ገብርኤል ቤተ ክርስቲያን ህንፃ አንደኛ ፎቅ ላይ ደረጃ እንደወጣቹ በስተግራ በኩል ሱቅ ቁጥር 09 ክቡራን ደምበኞቻችን ገርጂ አልፎዝ ላዛ ላይ አራት ኪሎ ቅድስት ስላሴ እንዲሁም ብስራተ ገብሬል ያሉት ሱቆቻችን ሲ

In [42]:
# Gather every non-missing preprocessed message into a plain Python list
preprocessed_texts = list(tokens['preprocessed_message'].dropna())
# Rebind `data` to a two-column frame: a positional `index` column (0..n-1)
# followed by the cleaned text under `message`
data = pd.Series(preprocessed_texts, name='message').reset_index()

In [44]:
# Preview the first five rows of the cleaned frame
data.head(n=5)

Unnamed: 0,index,message
0,0,ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር 0909003864 090570...
1,1,ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 09090...
2,2,ልጆች ዳዴ ማለት እንዲሉ የሚለማመዱበት በባትሪ የሚሰራ የልጆች መጫዎቻ 1...
3,3,የልጆችን ቀልብ የሚገዛ ውብ ምቹ የሆነ መነሳት የሚችል ስንጅ የሆነ መቀመ...
4,4,የልጆችን ቀልብ የሚገዛ ውብ ምቹ የሆነ መነሳት የሚችል ስንጅ የሆነ መቀመ...


# Store preprocessed data

In [46]:
# Persist the cleaned messages next to the raw data, without the row index
data.to_csv(path_or_buf='../data/telegram_data_cleaned.csv', index=False)