In [2]:
# !pip install telethon pandas

In [5]:
# !pip install telethon pandas python-dotenv openpyxl

In [1]:
from telethon import TelegramClient, events
import os
from dotenv import load_dotenv
import pandas as pd
import json
import datetime
import asyncio

# Configuration: credentials come from the .env file one directory up.
load_dotenv('../.env')
api_id, api_hash, phone = (
    os.getenv(variable)
    for variable in ('TELEGRAM_API_ID', 'TELEGRAM_API_HASH', 'TELEGRAM_PHONE')
)

# Fail fast when any credential is missing or empty.
if not (api_id and api_hash and phone):
    raise ValueError("Telegram API credentials (TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_PHONE) not found in .env file.")

SESSION_NAME = 'ethio_mart_scraper'
RAW_DATA_FILE = '../data/raw_messages.jsonl'  # Output file for raw data

# Make sure the output locations exist before anything is written to them.
os.makedirs('../data/photos', exist_ok=True)
os.makedirs(os.path.dirname(RAW_DATA_FILE), exist_ok=True)

async def get_channels_from_excel(file_path='../data/channels_to_crawl.xlsx'):
    """Read Telegram channel usernames/links from an Excel file.

    The workbook must contain a ``ChannelLink`` column. Each non-missing cell
    is converted to ``str`` and stripped; cells that are empty after stripping
    are skipped, and repeated entries are kept only once (first occurrence
    wins) so the same channel is not scraped twice.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        A list of channel identifiers in sheet order. An empty list is
        returned (after printing a diagnostic) when the file is missing, the
        column is absent, or reading fails for any other reason.
    """
    print(f"Attempting to read channels from: {file_path}")
    try:
        df = pd.read_excel(file_path)
        if 'ChannelLink' not in df.columns:
            print(f"Error: 'ChannelLink' column not found in {file_path}. Please check the Excel file structure.")
            return []

        stripped = (str(c).strip() for c in df['ChannelLink'].tolist() if pd.notna(c))
        # dict.fromkeys removes duplicates while preserving sheet order; the
        # inner filter discards cells that contained only whitespace.
        channels = list(dict.fromkeys(link for link in stripped if link))
        print(f"Successfully loaded {len(channels)} channels from {file_path}.")
        return channels
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure the path is correct and the file exists.")
        return []
    except Exception as e:
        # Best-effort loader: any other read/parse problem is reported and
        # treated as "no channels" rather than aborting the caller.
        print(f"An unexpected error occurred while reading channels from Excel: {e}")
        return []

async def save_raw_message_to_file(message_data):
    """Append one message dictionary to the JSON Lines file ``RAW_DATA_FILE``.

    The record is serialized to a string before the file is opened, so a
    value that cannot be encoded as JSON never leaves a truncated,
    newline-less line behind in the output file.

    Args:
        message_data: JSON-serializable dictionary describing one message.

    Returns:
        None. Serialization and I/O errors are printed, not raised.
    """
    try:
        line = json.dumps(message_data, ensure_ascii=False)
        with open(RAW_DATA_FILE, 'a', encoding='utf-8') as f:
            # Single write of the complete record plus its line terminator.
            f.write(line + '\n')
    except Exception as e:
        print(f"Error saving message to raw_messages.jsonl: {e}")

async def scrape_channel_history(client_obj, channel_input, media_dir):
    """Scrape every historical message of a single channel.

    For each message, an attached photo or document is downloaded into
    ``media_dir`` and one record is appended to the raw-data file via
    ``save_raw_message_to_file``.

    Args:
        client_obj: Connected client exposing ``get_entity`` and
            ``iter_messages``.
        channel_input: Channel username or link to resolve.
        media_dir: Directory that receives downloaded photos and documents.

    Returns:
        None. Failures are printed; a failure for one channel does not
        propagate to the caller.
    """
    # Label used in error output. It is set before the try block because
    # resolving the entity can itself fail, and the except clause below must
    # still be able to name the channel.
    channel_username = str(channel_input)
    try:
        entity = await client_obj.get_entity(channel_input)
        # getattr: not every resolved entity is guaranteed to expose
        # ``title``/``username``; fall back instead of raising.
        channel_title = getattr(entity, 'title', None) or f"Channel {channel_input}"
        channel_username = getattr(entity, 'username', None) or str(entity.id)

        print(f"Starting historical scrape for: '{channel_title}' ({channel_username})...")

        # limit=None requests the complete message history.
        async for message in client_obj.iter_messages(entity, limit=None):
            media_paths = []
            document_paths = []

            if message.media:
                # A failed download is reported but must not prevent the
                # message record itself from being saved.
                try:
                    if message.photo:
                        filename = f"{channel_username}_{message.id}_photo.jpg"
                        file_path = os.path.join(media_dir, filename)
                        await message.download_media(file=file_path)
                        media_paths.append(file_path)
                    elif message.document:
                        # Extension: MIME subtype when a MIME type is set,
                        # otherwise the suffix of the first dotted file name
                        # attribute, otherwise 'bin'.
                        ext = 'bin'
                        if message.document.mime_type:
                            ext = message.document.mime_type.split('/')[-1]
                        elif message.document.attributes:
                            for attr in message.document.attributes:
                                if hasattr(attr, 'file_name') and '.' in attr.file_name:
                                    ext = attr.file_name.split('.')[-1]
                                    break
                        doc_filename = f"{channel_username}_{message.id}_doc.{ext}"
                        doc_path = os.path.join(media_dir, doc_filename)
                        await message.download_media(file=doc_path)
                        document_paths.append(doc_path)
                except Exception as e:
                    print(f"Error downloading media for message {message.id} in {channel_username}: {e}")

            message_data = {
                'message_id': message.id,
                'channel_username': channel_username,
                'channel_title': channel_title,
                'timestamp': message.date.isoformat() if message.date else None,
                'sender_id': message.sender_id,
                'text_content': message.message,
                'views': message.views,
                'image_paths': media_paths,
                'document_paths': document_paths
            }
            await save_raw_message_to_file(message_data)
        print(f"Finished scraping history from '{channel_title}'.")

    except Exception as e:
        print(f"Could not scrape history from {channel_username}: {e}")


@events.register(events.NewMessage)
async def real_time_message_handler(event):
    """Record a newly posted message, with its photo or document, as it arrives.

    The decorator registers this handler for ``events.NewMessage`` without a
    chat filter, so it is written to cope with chats whose entity is missing
    or lacks a ``title``/``username``; the numeric chat id is used as the
    label in those cases.

    Args:
        event: The new-message event; ``event.message``, ``event.chat`` and
            ``event.chat_id`` are read.

    Returns:
        None. The message is printed, any media is downloaded to
        ``../data/photos``, and a record is appended through
        ``save_raw_message_to_file``.
    """
    message = event.message
    channel_id = event.chat_id
    chat = event.chat
    # getattr with a fallback: ``chat`` may be None, and not every chat entity
    # carries ``title`` or ``username``. Reading those attributes directly
    # would raise and the message would be lost.
    channel_title = getattr(chat, 'title', None) or str(channel_id)
    channel_username = getattr(chat, 'username', None) or str(channel_id)

    # Guard the preview slice in case the message has no text.
    text_preview = (message.message or '')[:100]

    print(f"\n[REAL-TIME] New message detected in {channel_title} ({channel_username}):")
    print(f"  Text: {text_preview}...")
    print(f"  Date: {message.date}")
    print(f"  Views: {message.views}")

    media_paths = []
    document_paths = []
    if message.media:
        # A failed download is reported but does not stop the record from
        # being saved below.
        try:
            if message.photo:
                filename = f"{channel_username}_{message.id}_photo_realtime.jpg"
                file_path = os.path.join('../data/photos', filename)
                await message.download_media(file=file_path)
                media_paths.append(file_path)
            elif message.document:
                # Extension: MIME subtype when a MIME type is set, otherwise
                # the suffix of the first dotted file name attribute,
                # otherwise 'bin'.
                ext = 'bin'
                if message.document.mime_type:
                    ext = message.document.mime_type.split('/')[-1]
                elif message.document.attributes:
                    for attr in message.document.attributes:
                        if hasattr(attr, 'file_name') and '.' in attr.file_name:
                            ext = attr.file_name.split('.')[-1]
                            break
                doc_filename = f"{channel_username}_{message.id}_doc_realtime.{ext}"
                doc_path = os.path.join('../data/photos', doc_filename)
                await message.download_media(file=doc_path)
                document_paths.append(doc_path)
        except Exception as e:
            print(f"Error downloading real-time media for message {message.id}: {e}")

    message_data = {
        'message_id': message.id,
        'channel_username': channel_username,
        'channel_title': channel_title,
        'timestamp': message.date.isoformat() if message.date else None,
        'sender_id': message.sender_id,
        'text_content': message.message,
        'views': message.views,
        'image_paths': media_paths,
        'document_paths': document_paths
    }
    await save_raw_message_to_file(message_data)


async def main():
    """Run the full pipeline: log in, scrape history, then listen for new posts.

    Steps:
      1. Start the Telegram client, signing in with the configured phone
         number.
      2. Load the channel list from the Excel workbook; stop if it is empty.
      3. Scrape the history of every listed channel in order.
      4. Register the real-time handler and block until the client
         disconnects.

    The client is disconnected on every exit path, including the early
    return and any exception.
    """
    client = TelegramClient(SESSION_NAME, api_id, api_hash)
    try:
        # Pass the phone number loaded from .env. The configuration already
        # insists it is present, and without it the login step falls back to
        # asking for the number interactively.
        await client.start(phone=phone)
        print("Telegram client connected!")

        channels_to_scrape = await get_channels_from_excel()
        if not channels_to_scrape:
            print("No channels found to scrape. Please populate channels_to_crawl.xlsx.")
            return

        media_dir = '../data/photos'
        os.makedirs(media_dir, exist_ok=True)

        # Channels are scraped one after another, not concurrently.
        for channel in channels_to_scrape:
            await scrape_channel_history(client, channel, media_dir)
        print(f"Raw historical data saved to {RAW_DATA_FILE}")

        # Switch from historical scraping to live capture.
        client.add_event_handler(real_time_message_handler, events.NewMessage)
        await client.run_until_disconnected()
        print("Client disconnected.")
    finally:
        await client.disconnect()

In [None]:
# Start the scraper.
# NOTE(review): a bare top-level ``await`` presumably relies on the notebook
# kernel's already-running event loop; a plain script would need
# ``asyncio.run(main())`` instead — confirm before moving this out of a notebook.
await main()

Telegram client connected!
Attempting to read channels from: ../data/channels_to_crawl.xlsx
Successfully loaded 22 channels from ../data/channels_to_crawl.xlsx.
Starting historical scrape for: 'Zemen Express®' (ZemenExpress)...
