In [None]:
from telethon.sync import TelegramClient
from telethon.tl.types import Message, DocumentAttributeFilename
import nest_asyncio
import pandas as pd
import os

# Patch asyncio so the already-running notebook event loop can be re-entered
# by the run_until_complete() call further down.
nest_asyncio.apply()

# Telegram API credentials are read from the environment rather than written
# into the notebook, so they cannot leak through a saved or shared .ipynb.
# Set TELEGRAM_API_ID and TELEGRAM_API_HASH before starting the kernel; a
# missing variable raises KeyError naming the variable.
api_id = int(os.environ["TELEGRAM_API_ID"])
api_hash = os.environ["TELEGRAM_API_HASH"]

# The first argument is the session name Telethon uses to persist the login.
client = TelegramClient('ethioner_session', api_id, api_hash)

# Public channel usernames to scrape, in the order they are processed.
channels = [
    "@ZemenExpress",
    "@meneshayeofficial",
    "@kuruwear",
    "@Shewabrand",
    "@Fashiontera"
]

# Directory (relative to the notebook's working directory) that receives
# downloaded photos and documents.
media_dir = "media"
os.makedirs(media_dir, exist_ok=True)

def is_video(document):
    """Return True when the document's MIME type marks it as a video.

    Args:
        document: Any object that may carry a ``mime_type`` attribute.

    Returns:
        bool: True if ``mime_type`` starts with ``"video"``; False when the
        attribute is missing, ``None``, empty, or names another type.
    """
    # `or ''` also covers a mime_type that exists but is None, which the
    # getattr default alone would not catch.
    mime_type = getattr(document, 'mime_type', None) or ''
    return mime_type.startswith("video")

async def scrape_telegram_channel(channel_username, limit=300):
    """Collect messages from one channel and download their photos/documents.

    Iterates over up to ``limit`` messages of the channel, saves any photo and
    any non-video document into ``media_dir``, and builds one record per
    message.

    Args:
        channel_username: Channel identifier passed to ``client.get_entity``
            (e.g. ``"@kuruwear"``); also used as a prefix for media file names.
        limit: Maximum number of messages requested from ``iter_messages``.

    Returns:
        list[dict]: One dict per message with the keys ``channel_title``,
        ``channel``, ``message_id``, ``text``, ``date``, ``photo_path`` and
        ``document_path``. The two path values are whatever
        ``client.download_media`` returned, or ``None`` when the message had
        no such media.
    """
    messages = []
    entity = await client.get_entity(channel_username)
    print(f"\nScraping channel: {entity.title} ({channel_username})")

    count = 0
    async for message in client.iter_messages(entity, limit=limit):
        count += 1
        if count % 100 == 0:
            print(f" {count} messages scraped...")

        media_photo_path = None
        media_doc_path = None
        # Channel + message id is unique per message, so every media file
        # derived from this prefix gets its own name.
        file_prefix = f"{channel_username}_{message.id}"

        if message.photo:
            photo_target = os.path.join(media_dir, f"{file_prefix}.jpg")
            # Record the path reported by download_media: it is the file that
            # was actually written and may differ from the requested target.
            media_photo_path = await client.download_media(message.photo, photo_target)

        if message.document and not is_video(message.document):
            file_name = file_prefix
            for attr in message.document.attributes:
                if isinstance(attr, DocumentAttributeFilename):
                    # The name comes from the sender: strip directory parts so
                    # it cannot point outside media_dir, and keep the unique
                    # prefix so equal names from different messages do not
                    # resolve to the same file.
                    original_name = os.path.basename(attr.file_name or "")
                    if original_name:
                        file_name = f"{file_prefix}_{original_name}"
            doc_target = os.path.join(media_dir, file_name)
            media_doc_path = await client.download_media(message.document, doc_target)

        messages.append({
            "channel_title": entity.title,
            "channel": channel_username,
            "message_id": message.id,
            "text": message.text,
            "date": message.date,
            "photo_path": media_photo_path,
            "document_path": media_doc_path
        })

    print(f"Finished scraping {channel_username}: {count} messages.\n")
    return messages

async def scrape_all_channels():
    """Scrape every channel in ``channels`` within one client session.

    Returns:
        list: The per-channel results of ``scrape_telegram_channel``,
        concatenated in the order the channels are listed.
    """
    collected = []
    # The client context is held open for the whole run.
    async with client:
        for channel_username in channels:
            collected += await scrape_telegram_channel(channel_username)
    return collected

# Run the scraping on the current event loop (nest_asyncio.apply() above is
# what makes this re-entrant call possible inside a notebook).
import asyncio
all_messages = asyncio.get_event_loop().run_until_complete(scrape_all_channels())

# Save to CSV. Create the target directory first so a fresh checkout does not
# fail on the write after the whole scrape has already completed.
os.makedirs("../raw", exist_ok=True)
df = pd.DataFrame(all_messages)
df.to_csv("../raw/telegram_messages.csv", index=False)
print("All data saved to ../raw/telegram_messages.csv")


Signed in successfully as 🌻ይሀሎኑ በዝ ሰማይ አምላከ እስራኤል አዶናይ🌻; remember to not break the ToS or you will risk an account ban!

Scraping channel: Zemen Express® (@ZemenExpress)
 100 messages scraped...
 200 messages scraped...
 300 messages scraped...
Finished scraping @ZemenExpress: 300 messages.


Scraping channel: መነሻዬ (@meneshayeofficial)
 100 messages scraped...
 200 messages scraped...
 300 messages scraped...
Finished scraping @meneshayeofficial: 300 messages.


Scraping channel: ኩሩ/kuru/ 🇪🇹 wear ®️ (@kuruwear)
 100 messages scraped...
 200 messages scraped...
 300 messages scraped...
Finished scraping @kuruwear: 300 messages.


Scraping channel: Shewa Brand (@Shewabrand)
 100 messages scraped...
 200 messages scraped...
 300 messages scraped...
Finished scraping @Shewabrand: 300 messages.


Scraping channel: Fashion tera (@Fashiontera)
 100 messages scraped...
 200 messages scraped...
 300 messages scraped...
Finished scraping @Fashiontera: 300 messages.

All data saved to ../raw/telegram_messages.csv

In [2]:
import pandas as pd
import re
import os

# Read the CSV written by the scraping cell back into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../raw/telegram_messages.csv")

# Step 1: Define Amharic text cleaning function
def clean_amharic_text(text):
    """Reduce a raw message to space-separated Ethiopic-script words.

    Steps, in order: Ethiopic punctuation becomes a space; URLs, hashtags and
    mentions are removed; every character outside the Ethiopic block
    (U+1200-U+137F) that is not whitespace is dropped; runs of whitespace
    collapse to a single space.

    Args:
        text: The message text. Missing values (``None``/NaN) are accepted;
            other non-string values are converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` for missing input or when nothing
        remains after cleaning.
    """
    if pd.isna(text):
        return ""
    # A stray non-string cell (e.g. a number) would make re.sub raise.
    if not isinstance(text, str):
        text = str(text)
    # Normalize Amharic punctuation: all Ethiopic punctuation marks
    # (U+1360-U+1368, including the wordspace and the full stop/comma) become
    # spaces. They lie inside the kept Ethiopic range below, so any mark not
    # replaced here would stay attached to the neighbouring token.
    text = re.sub(r"[\u1360-\u1368]", " ", text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove hashtags and mentions
    text = re.sub(r"[@#]\w+", "", text)
    # Remove non-Amharic and non-space characters
    text = re.sub(r"[^\u1200-\u137F\s]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Step 2: Add a cleaned copy of each message's text
df["clean_text"] = df["text"].apply(clean_amharic_text)

# Step 3: Tokenize by splitting the cleaned text on whitespace
df["tokens"] = df["clean_text"].apply(str.split)

# Step 4: Select the output columns, in their final order
final_columns = [
    "channel_title",
    "channel",
    "message_id",
    "date",
    "clean_text",
    "tokens",
    "photo_path",
    "document_path",
]

df_final = df.loc[:, final_columns]

# Step 5: Write the result into the processed folder, creating it if needed
os.makedirs("../processed", exist_ok=True)
df_final.to_csv("../processed/telegram_cleaned.csv", index=False)

print("Cleaned and structured data saved to '../processed/telegram_cleaned.csv'")


Cleaned and structured data saved to '../processed/telegram_cleaned.csv'
