In [6]:
import os
import json
from datetime import datetime
from dotenv import load_dotenv
from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto
from telethon.errors import FloodWaitError
import asyncio


In [7]:
# Pull variables from a local .env file (if present) into the process
# environment so credentials never have to be written into the notebook.
load_dotenv()

# Fail early with a readable message: without this check a missing
# TELEGRAM_API_ID surfaces as an opaque `int(None)` TypeError.
_missing = [
    var for var in ("TELEGRAM_API_ID", "TELEGRAM_API_HASH") if not os.getenv(var)
]
if _missing:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing)}"
    )

api_id = int(os.getenv("TELEGRAM_API_ID"))
api_hash = os.getenv("TELEGRAM_API_HASH")

# Session name handed to TelegramClient when the client is created.
session_name = "notebook_scraper"


In [12]:
# Channels to scrape: short name -> public t.me link.
channels = dict(
    lobelia4cosmetics="https://t.me/lobelia4cosmetics",
    tikvahpharma="https://t.me/tikvahpharma",
)

# Raw output is partitioned by the date on which the notebook runs.
today = datetime.today().strftime("%Y-%m-%d")
msg_dir = f"../data/raw/telegram_messages/{today}"
img_dir = f"../data/raw/images/{today}"

# Make sure both output folders exist; re-running the cell is harmless.
for _out_dir in (msg_dir, img_dir):
    os.makedirs(_out_dir, exist_ok=True)


In [13]:
import os
import json
from datetime import datetime
from telethon.errors import FloodWaitError
from telethon.tl.types import MessageMediaPhoto

# Fallback serializer for values the json module cannot encode itself
def json_safe_converter(obj):
    """Turn a non-JSON-native value into a string for ``json.dump(default=...)``.

    * ``bytes``    -> text decoded with the default codec; bytes that cannot
      be decoded are silently dropped (lossy - switch to base64 if the raw
      bytes ever need to be recovered).
    * ``datetime`` -> ISO 8601 string via ``isoformat()``.

    Raises:
        TypeError: for any other type, as the ``default`` hook contract of
            the json module expects.
    """
    if isinstance(obj, bytes):
        return obj.decode(errors="ignore")
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# Main Scrape Function
async def scrape_channel(client, name, link, download_images=True, limit=200):
    """Collect up to ``limit`` messages from one channel and save them as JSON.

    Every message is converted with ``to_dict()``. When ``download_images``
    is true, photo attachments are downloaded to
    ``<img_dir>/<name>/<message id>.jpg`` and that path is stored in the
    message dict under ``"downloaded_image_path"``. The collected dicts are
    written to ``<msg_dir>/<name>.json``.

    Failures are reported with ``print`` instead of being raised, so one bad
    channel does not abort the caller's loop over the remaining channels.
    Messages gathered before an interruption are still written to disk, and
    a flood-wait error is honoured by actually sleeping for the requested
    time before returning.

    Args:
        client: Telegram client exposing ``iter_messages``.
        name: Short channel name; used for the JSON file and image folder.
        link: Channel link handed to ``client.iter_messages``.
        download_images: Download photo media when true.
        limit: Maximum number of messages to request.

    Relies on the module-level ``img_dir`` and ``msg_dir`` paths and on
    ``json_safe_converter``.
    """
    messages = []
    completed = False
    flood_wait_seconds = 0

    channel_img_dir = os.path.join(img_dir, name)
    os.makedirs(channel_img_dir, exist_ok=True)

    print(f"Scraping channel: {name} (limit={limit})")

    try:
        async for message in client.iter_messages(link, limit=limit):
            msg = message.to_dict()

            # isinstance() is already False for a missing (None) media field.
            if download_images and isinstance(message.media, MessageMediaPhoto):
                img_path = os.path.join(channel_img_dir, f"{message.id}.jpg")
                await message.download_media(file=img_path)
                msg["downloaded_image_path"] = img_path

            messages.append(msg)

            if len(messages) % 100 == 0:
                print(f"{len(messages)} messages collected...")

        completed = True
        print(f"Done. Total: {len(messages)} messages from {name}")

    except FloodWaitError as e:
        print(f"Flood wait error. Sleep for {e.seconds} seconds.")
        flood_wait_seconds = e.seconds
    except Exception as e:
        print(f"Error scraping {name}: {e}")

    # Persist outside the scraping try-block so an interruption does not
    # throw away what was already collected. A clean run always writes the
    # file (even if empty); an interrupted run writes only if it has data.
    if completed or messages:
        filepath = os.path.join(msg_dir, f"{name}.json")
        try:
            # Serialize first: if a value cannot be encoded, no truncated
            # JSON file is left behind.
            payload = json.dumps(
                messages, ensure_ascii=False, indent=2, default=json_safe_converter
            )
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(payload)
            if not completed:
                print(f"Saved {len(messages)} partial messages from {name}")
        except Exception as e:
            print(f"Error saving {name}: {e}")

    # Wait only after the data is safely on disk, so the next request does
    # not immediately run into the same flood limit.
    if flood_wait_seconds:
        await asyncio.sleep(flood_wait_seconds)


In [14]:
async def main():
    async with TelegramClient(session_name, api_id, api_hash) as client:
        for name, link in channels.items():
            await scrape_channel(client, name, link)

await main()


Scraping channel: lobelia4cosmetics (limit=200)
100 messages collected...
200 messages collected...
Done. Total: 200 messages from lobelia4cosmetics
Scraping channel: tikvahpharma (limit=200)
100 messages collected...
200 messages collected...
Done. Total: 200 messages from tikvahpharma


In [19]:
import json

# Read back the file written by this run. The path is derived from `msg_dir`
# rather than a hard-coded date so the check follows whatever day the
# notebook is executed on.
sample_path = os.path.join(msg_dir, "lobelia4cosmetics.json")
with open(sample_path, encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} messages from lobelia4cosmetics.json")
if data:
    print(f"First message keys: {list(data[0].keys())}")
else:
    # An empty file would otherwise fail with IndexError on data[0].
    print("No messages were saved for this channel.")


Loaded 200 messages from lobelia4cosmetics.json
First message keys: ['_', 'id', 'peer_id', 'date', 'message', 'out', 'mentioned', 'media_unread', 'silent', 'post', 'from_scheduled', 'legacy', 'edit_hide', 'pinned', 'noforwards', 'invert_media', 'offline', 'video_processing_pending', 'from_id', 'from_boosts_applied', 'saved_peer_id', 'fwd_from', 'via_bot_id', 'via_business_bot_id', 'reply_to', 'media', 'reply_markup', 'entities', 'views', 'forwards', 'replies', 'edit_date', 'post_author', 'grouped_id', 'reactions', 'restriction_reason', 'ttl_period', 'quick_reply_shortcut_id', 'effect', 'factcheck', 'report_delivery_until_date', 'paid_message_stars', 'downloaded_image_path']


In [20]:
# Sanity check: every loaded record needs an id and at least one text field.
for msg in data:
    assert "id" in msg
    assert any(field in msg for field in ("message", "text"))
