# In [None]:
import asyncio
import os
import re
from datetime import datetime

import nest_asyncio
import pandas as pd
from telethon import TelegramClient
from telethon.errors import RPCError

# Patch asyncio (via nest_asyncio) so an event loop can be entered while
# another is already running, which notebook environments need.
nest_asyncio.apply()

# === Telegram credentials ===
# SECURITY: API credentials should not live in source control. Supply your
# own through the TELEGRAM_API_ID / TELEGRAM_API_HASH environment variables.
# The literals below are kept only as a backward-compatible fallback and
# should be rotated and removed.
api_id = int(os.environ.get('TELEGRAM_API_ID', 28247546))
api_hash = os.environ.get('TELEGRAM_API_HASH', '836b5d7a2c2043123b9bf690117196e5')

# Telegram channels to fetch messages from
channels = [
    '@ZemenExpress',
    '@nevacomputer',
    '@meneshayeofficial',
    '@ethio_brand_collection',
    '@Shewabrand',
]

# === Text Cleaning Functions ===
# Compiled once at import time instead of on every call.
# Each range is a specific symbol/emoji block. A single catch-all range such
# as U+24C2..U+1F251 must not be used: it also covers real letters, e.g.
# Ethiopic Extended (U+2D80-U+2DDF), Ethiopic Extended-A (U+AB00-U+AB2F),
# CJK and Hangul.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U000024C2"             # circled latin capital M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis_symbols(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only code points inside the emoji/symbol blocks listed in
    ``_EMOJI_PATTERN`` are deleted; letters of any script are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched characters. Nothing is
        inserted in their place, so surrounding whitespace is preserved.
    """
    return _EMOJI_PATTERN.sub('', text)

def normalize_amharic(text):
    """Clean a raw message into plain Amharic/Latin text.

    The steps run in a fixed order: @mentions, #hashtags and http(s) links
    are dropped, emoji are removed via ``remove_emojis_symbols``, a set of
    decorative bullets/arrows is stripped, runs of two or more dots are
    deleted, every character outside digits, ASCII letters, the Ethiopic
    block (U+1200-U+137F) and whitespace is removed, and finally whitespace
    runs collapse to a single space.

    Args:
        text: Raw message text; may be empty or None.

    Returns:
        The cleaned, stripped string, or "" when ``text`` is falsy.
    """
    if not text:
        return ""

    # Mentions, hashtags and links go first, while their delimiters still exist.
    for noise in (r'@\w+', r'#\w+', r'https?://\S+'):
        text = re.sub(noise, '', text)

    text = remove_emojis_symbols(text)

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'[•▪️✔✅💥⭐️➡️→←➤📌🔖📲📍🔽↘️፦🏷🔺💧⚡🔥‼️]', ''),
        (r'[\.]{2,}', ''),
        (r'[^0-9a-zA-Z\u1200-\u137F፡። \s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

# Either "<number> ብር/birr" (any number of comma-separated digit groups,
# e.g. 1,500,000) or "price <number>". Matching is case-insensitive so
# "Birr", "BIRR" and "PRICE" are recognised as well.
_PRICE_PATTERN = re.compile(
    r'\b[0-9]+(?:,[0-9]+)*\s*(?:ብር|birr)|price\s*[0-9]+',
    flags=re.IGNORECASE,
)


def extract_prices(text):
    """Find price mentions in ``text``.

    Args:
        text: Text to search; may be empty or None.

    Returns:
        The matched price expressions joined by ", " in order of appearance,
        or None when nothing matches.
    """
    if not text:
        return None
    price_matches = _PRICE_PATTERN.findall(text)
    return ', '.join(price_matches) if price_matches else None

# Local format: a leading 0 followed by 8-9 digits (unchanged from before).
# International format: optional "+", country code 251, then 9 digits.
# ``(?<!\w)`` replaces the leading ``\b`` so the match may start at "+",
# while behaving exactly like ``\b`` in front of a digit.
_PHONE_PATTERN = re.compile(r'(?<!\w)(?:\+?251\d{9}|0\d{8,9})\b')


def extract_phone_numbers(text):
    """Find Ethiopian phone numbers in ``text``.

    Args:
        text: Text to search; may be empty or None.

    Returns:
        The matched numbers joined by ", " in order of appearance, or None
        when nothing matches.
    """
    if not text:
        return None
    phones = _PHONE_PATTERN.findall(text)
    return ', '.join(phones) if phones else None

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    words = text.split(None)
    return words

# === Main Async Function to Collect and Clean Data ===
# Column order of the output CSV. Passed to the DataFrame explicitly so that
# a run which collects no messages still writes a file with a header row.
_COLUMNS = [
    'channel',
    'message_id',
    'timestamp',
    'sender_id',
    'raw_text',
    'cleaned_text',
    'tokens',
    'phone_numbers',
    'prices',
    'has_image',
    'document_file_name',
]


def _document_file_name(message):
    """Return the file name of the document attached to ``message``, or None.

    Every attribute is inspected rather than only the first one, because the
    attribute carrying ``file_name`` is not necessarily at index 0 (a video
    document, for example, may list its video attribute first).
    """
    document = message.document
    if not document:
        return None
    for attribute in document.attributes or ():
        file_name = getattr(attribute, 'file_name', None)
        if file_name:
            return file_name
    return None


def _message_to_row(channel, message):
    """Build one output row (a dict keyed by ``_COLUMNS``) from a message."""
    raw_text = message.message
    cleaned = normalize_amharic(raw_text)
    return {
        'channel': channel,
        'message_id': message.id,
        'timestamp': message.date.strftime('%Y-%m-%d %H:%M:%S'),
        'sender_id': message.sender_id,
        'raw_text': raw_text,
        'cleaned_text': cleaned,
        'tokens': tokenize(cleaned),
        # Phones and prices are extracted from the cleaned text, as before.
        'phone_numbers': extract_phone_numbers(cleaned),
        'prices': extract_prices(cleaned),
        'has_image': bool(message.photo),
        'document_file_name': _document_file_name(message),
    }


async def main(limit=200, output_path="amharic_telegram_cleaned_data.csv"):
    """Fetch recent messages from every channel in ``channels`` and save them.

    Messages without text are skipped. A channel that cannot be resolved or
    read is reported and skipped, so the remaining channels are still
    collected and written.

    Args:
        limit: Maximum number of messages requested per channel.
        output_path: Destination of the CSV file.
    """
    rows = []
    async with TelegramClient('amharic_data_session', api_id, api_hash) as client:
        for channel in channels:
            print(f"Fetching from {channel}")
            try:
                # Only the network fetch is guarded; row building runs outside
                # the try so bugs in the cleaning code are not swallowed.
                messages = [
                    message
                    async for message in client.iter_messages(channel, limit=limit)
                ]
            except (ValueError, RPCError) as exc:
                print(f"Skipping {channel}: {exc}")
                continue
            rows.extend(
                _message_to_row(channel, message)
                for message in messages
                if message.message
            )

    df = pd.DataFrame(rows, columns=_COLUMNS)
    df.to_csv(output_path, index=False)
    print("✅ Data collected and saved. Total messages:", len(df))

# === Run it ===
if __name__ == "__main__":
    # A bare top-level ``await`` only works inside IPython. asyncio.run works
    # as a plain script and also in a notebook, because nest_asyncio.apply()
    # earlier in this file patches asyncio to allow re-entering a running loop.
    asyncio.run(main())
