<a href="https://colab.research.google.com/github/noobie0149/ollama-python-playground/blob/main/train_vent_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install telethon

In [None]:
from google.colab import userdata
import os

# Read the credentials through google.colab.userdata so they are not written
# into the notebook source.
gemini_api = userdata.get('gemini_api')
telegram_id = userdata.get('telegram_id')
telegram_hash = userdata.get('telegram_hash')

# Never echo the secret values themselves: cell output is saved with the
# notebook and would leak the keys when it is shared or committed.
# Only report whether each secret came back non-empty.
for _secret_name, _secret_value in (
    ('gemini_api', gemini_api),
    ('telegram_id', telegram_id),
    ('telegram_hash', telegram_hash),
):
    print(f"{_secret_name}: {'loaded' if _secret_value else 'MISSING'}")

In [7]:
#python
# ─── 1. INSTALL DEPENDENCIES ────────────────────────────────────────────────
!pip install telethon nest_asyncio

# ─── 2. PATCH THE EVENT LOOP FOR JUPYTER/Colab ─────────────────────────────
import nest_asyncio
nest_asyncio.apply()  # Allows us to await coroutines directly in notebook cells

# ─── 3. IMPORT NECESSARY MODULES ───────────────────────────────────────────
from datetime import datetime, timedelta, timezone
from telethon import TelegramClient
from telethon.errors import FloodWaitError
import asyncio

# ─── 4. CONFIGURE YOUR TELEGRAM API CREDENTIALS ─────────────────────────────
#     Replace these with your own values from https://my.telegram.org/apps
api_id   = telegram_id                           # Your API ID (integer) from my.telegram.org
api_hash = telegram_hash # Your API Hash (string) from my.telegram.org

# ─── 5. INITIALIZE A TELETHON CLIENT ────────────────────────────────────────
#     This will create (or reuse) 'fetch_channel.session' locally
session_name = 'fetch_channel'
client = TelegramClient(session_name, api_id, api_hash)  #

# ─── 6. DEFINE THE ASYNC FUNCTION TO FETCH MESSAGES ────────────────────────
async def fetch_last_10_days(channel_username: str, days: int = 10, max_flood_retries: int = 3):
    """
    Collect every message posted to a channel within the last `days` days (UTC).

    Uses the module-level Telethon `client`: starts it, walks the channel
    history from newest to oldest, and always disconnects before returning.

    Args:
        channel_username: Channel username without the leading '@'.
        days: Size of the look-back window in days. Defaults to 10, which is
            the behaviour the function name describes.
        max_flood_retries: How many FloodWaitError pauses to sit through
            (sleeping, then resuming where the iteration stopped) before
            giving up and returning what has been collected so far.

    Returns:
        list: Message objects, newest first, whose `date` is not older than
        the cutoff. May be partial if the flood-wait retry budget runs out.
    """
    recent_messages = []

    try:
        # START/LOGIN (prompts on first run, reuses the session thereafter).
        # Inside the try so the client is disconnected even if login or
        # entity resolution fails.
        await client.start()

        # Resolve the username into the underlying channel entity.
        channel_entity = await client.get_entity(channel_username)

        # Anything dated before this instant is outside the window.
        cutoff = datetime.now(timezone.utc) - timedelta(days=days)

        # ID of the oldest message collected so far; 0 means "start from the
        # newest message". Passed as offset_id so a retry resumes below it
        # instead of re-fetching (and duplicating) earlier messages.
        resume_below_id = 0
        flood_waits = 0

        while True:
            try:
                async for message in client.iter_messages(
                    channel_entity, limit=None, offset_id=resume_below_id
                ):
                    # History arrives newest → oldest, so the first message
                    # older than the cutoff ends the scan.
                    if message.date < cutoff:
                        return recent_messages
                    recent_messages.append(message)
                    resume_below_id = message.id
                # Reached the beginning of the channel history.
                return recent_messages

            except FloodWaitError as e:
                # Telegram rate-limited the history request.
                flood_waits += 1
                print(f"FloodWaitError: must wait {e.seconds} seconds.")
                if flood_waits > max_flood_retries:
                    print(
                        f"Giving up after {max_flood_retries} flood-wait retries; "
                        f"returning {len(recent_messages)} messages collected so far."
                    )
                    return recent_messages
                # Wait out the penalty, then loop to resume from resume_below_id.
                await asyncio.sleep(e.seconds)

    finally:
        # DISCONNECT WHEN DONE (runs on every exit path, including errors).
        await client.disconnect()




In [8]:
#python
# ─── 1. SET YOUR TARGET CHANNEL USERNAME (no '@') ───────────────────────────
channel_username = 'TheReporterEthiopia'  # e.g., for https://t.me/python_engineers

# ─── 2. CALL THE COROUTINE DIRECTLY (because nest_asyncio is applied) ────────
messages = await fetch_last_10_days(channel_username)

# ─── 3. OPEN A TEXT FILE TO WRITE ALL MESSAGES ──────────────────────────────
#     Each line will have: [YYYY-MM-DD HH:MM:SS UTC] <full_text_or_[No text]>
output_filename = 'last_10_days.txt'
with open(output_filename, 'w', encoding='utf-8') as f:
    header = f"Messages from the last 10 days in @{channel_username} (total: {len(messages)})\n"
    f.write(header)
    f.write("=" * len(header) + "\n\n")

    for msg in messages:
        # Format timestamp
        ts = msg.date.strftime('%Y-%m-%d %H:%M:%S UTC')
        # Use the full text if available; otherwise indicate “[No text]”
        text_body = msg.text if msg.text else "[No text]"

        # Write to file
        f.write(f"[{ts}] {text_body}\n")
        f.write("-" * 80 + "\n")

# ─── 4. CONFIRMATION PRINTOUT ───────────────────────────────────────────────
print(f"Wrote {len(messages)} messages to '{output_filename}'.")


Wrote 49 messages to 'last_10_days.txt'.


In [14]:
# format_telegram_posts.py

from google import genai
import os

# ─── Step 1: CONFIGURE YOUR GEMINI CLIENT ───────────────────────────────────
# The key is the `gemini_api` secret loaded earlier in the notebook.
# Deliberately NOT named `client`: that global is the Telethon client that
# fetch_last_10_days() reads, and rebinding it here would break any later
# re-run of the Telegram fetch step.
gemini_client = genai.Client(api_key=gemini_api)

# ─── Step 2: CHOOSE A MODEL NAME ─────────────────────────────────────────────
# Swap this for any Gemini variant you have access to.
MODEL_NAME     = "gemini-2.5-flash-preview-04-17"

# ─── Step 3: DEFINE INPUT/OUTPUT FILES ───────────────────────────────────────
RAW_FILE       = "last_10_days.txt"                # Contains the raw, 10‐day channel dump
FORMATTED_FILE = "formatted_last_10_days.txt"      # Where Gemini’s output will be saved

def label_posts(raw_text: str) -> str:
    """
    Ask Gemini to reformat a raw channel dump into timestamped bullet points.

    Builds one prompt (formatting instructions followed by `raw_text`) and
    sends it to `MODEL_NAME` through the module-level `gemini_client`.

    Args:
        raw_text: The full text of the raw message dump.

    Returns:
        str: The text of Gemini's reply.

    Raises:
        RuntimeError: If the response carries no text, so the caller never
            receives None where a string is promised.
    """
    # ─── 3.1: INSTRUCTIONS TO GEMINI ────────────────────────────────────────────
    system_prompt = (
        "You are a helpful assistant.  Below are all Telegram‐channel posts from the last 10 days.  "
        "Please reformat each post into a bullet point, preserving the original UTC timestamp and text.  "
        "If a post has no text (e.g., only media), write “[No text]” instead.  Output plain text, "
        "one bullet per line, in this exact form:\n\n"
        "    - [YYYY-MM-DD HH:MM:SS UTC] <message text>\n\n"
        "Separate each bullet with a blank line."
    )
    # ─── 3.2: CONCATENATE INSTRUCTIONS + RAW TEXT ───────────────────────────────
    full_prompt = f"{system_prompt}\n\n─── RAW MESSAGES START BELOW ───\n\n{raw_text}"

    # ─── 3.3: CALL GEMINI VIA genai.Client ────────────────────────────────────
    response = gemini_client.models.generate_content(
        model=MODEL_NAME,
        contents=full_prompt
    )

    # `response.text` is None when the reply has no text parts; surface that
    # here with a clear message rather than failing later on file.write(None).
    formatted_text = response.text
    if formatted_text is None:
        raise RuntimeError(
            f"Gemini model '{MODEL_NAME}' returned no text for the formatting request."
        )
    return formatted_text


if __name__ == "__main__":
    # ─── Step 4: READ THE RAW FILE ─────────────────────────────────────────────
    # Open directly and translate a missing file into a descriptive error.
    # Raising (instead of calling exit()) stops only this cell and leaves the
    # notebook kernel usable; it also avoids the exists()-then-open() race.
    try:
        with open(RAW_FILE, 'r', encoding='utf-8') as infile:
            raw_data = infile.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"ERROR: Cannot find '{RAW_FILE}'. Make sure you run the Telegram‐fetch step first."
        ) from err

    print(f"Read {len(raw_data)} characters from '{RAW_FILE}'. Beginning Gemini call...")

    # ─── Step 5: SEND TO GEMINI FOR FORMATTING ─────────────────────────────────
    formatted_output = label_posts(raw_data)

    # ─── Step 6: WRITE GEMINI’S RESPONSE ───────────────────────────────────────
    with open(FORMATTED_FILE, 'w', encoding='utf-8') as outfile:
        outfile.write(formatted_output)

    print(f"Gemini’s formatted result has been written to '{FORMATTED_FILE}'.")


Read 32140 characters from 'last_10_days.txt'. Beginning Gemini call...
Gemini’s formatted result has been written to 'formatted_last_10_days.txt'.
