In [1]:
import json
import re
import os
import pandas as pd


In [2]:
def load_data(file_path):
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Load the raw data from the file
raw_data_path = '../data/raw/@sinayelj_messages.json'
messages = load_data(raw_data_path)

# Display the first few messages to check the structure
pd.DataFrame(messages).head()


Unnamed: 0,id,sender_id,message,date,media
0,14901,-1001424695758,,2024-10-09 14:07:16+00:00,other
1,14900,-1001424695758,,2024-10-09 14:07:16+00:00,other
2,14899,-1001424695758,,2024-10-09 14:07:16+00:00,other
3,14898,-1001424695758,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,other
4,14897,-1001424695758,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,other


In [4]:
def clean_text(text):
   
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s።፥፡፣፤፦፨]', '', text)  # Remove punctuation except Amharic-specific ones
    return text.strip()

# Test the cleaning function on a sample message
sample_message = messages[0]['message']
cleaned_message = clean_text(sample_message)
print("Original:", sample_message)
print("Cleaned:", cleaned_message)


Original: 
Cleaned: 


In [5]:
def tokenize_message(message):
    
    tokens = message.split(' ')
    return tokens

# Test the tokenization on the cleaned sample message
tokens = tokenize_message(cleaned_message)
print("Tokens:", tokens)


Tokens: ['']


In [6]:
def preprocess_data(messages):
    
    for message in messages:
        if message['message']:
            message['clean_message'] = clean_text(message['message'])
            message['tokens'] = tokenize_message(message['clean_message'])
    return messages

# Preprocess all messages
processed_messages = preprocess_data(messages)

# Display the first few processed messages
pd.DataFrame(processed_messages).head()


Unnamed: 0,id,sender_id,message,date,media,clean_message,tokens
0,14901,-1001424695758,,2024-10-09 14:07:16+00:00,other,,
1,14900,-1001424695758,,2024-10-09 14:07:16+00:00,other,,
2,14899,-1001424695758,,2024-10-09 14:07:16+00:00,other,,
3,14898,-1001424695758,mama bag\nኦሪጅናል ማቴሪያል\nበሳይዙ ትልቅ\n 1600 ብር\nFre...,2024-10-09 14:07:16+00:00,other,mama bag ኦሪጅናል ማቴሪያል በሳይዙ ትልቅ 1600 ብር Free del...,"[mama, bag, ኦሪጅናል, ማቴሪያል, በሳይዙ, ትልቅ, 1600, ብር,..."
4,14897,-1001424695758,ኦሪጅናል ማቀፊያ\n1400 ብር\n0905707448\n0909003864\n\...,2024-10-09 14:01:33+00:00,other,ኦሪጅናል ማቀፊያ 1400 ብር 0905707448 0909003864 09090...,"[ኦሪጅናል, ማቀፊያ, 1400, ብር, 0905707448, 0909003864..."


In [7]:
def save_preprocessed_data(messages, filename):

    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(messages, f, ensure_ascii=False, indent=4)

# Save the preprocessed data
save_preprocessed_data(processed_messages, '../data/processed/processed_messages.json')
print("Preprocessing completed and data saved to ../data/processed/processed_messages.json")


Preprocessing completed and data saved to ../data/processed/processed_messages.json
