In [5]:
# Install required libraries
!pip install google-generativeai firebase-admin google-auth google-cloud-firestore



In [6]:
import google.generativeai as genai
import firebase_admin
from firebase_admin import credentials, firestore
import json
import os
from google.colab import userdata

In [8]:
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# IMPORTANT: AUTHENTICATION SETUP
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# 1. Firebase Authentication
# To run this, you need your Firebase service account key.
# Go to your Firebase Project -> Project Settings -> Service accounts,
# click "Generate new private key", and upload the downloaded JSON file to your Colab session.
# Then set SERVICE_ACCOUNT_KEY_PATH below to the path of the uploaded file.
# For example, a file uploaded directly to the session lives at '/content/<file name>.json'.
SERVICE_ACCOUNT_KEY_PATH = '/content/kairos-skr-firebase-adminsdk-fbsvc-accd57f171.json'
try:
    # Initialise the default Firebase app only once so this cell can be re-run safely.
    # get_app() raises ValueError when the default app has not been initialised yet.
    try:
        firebase_admin.get_app()
    except ValueError:
        cred = credentials.Certificate(SERVICE_ACCOUNT_KEY_PATH)
        firebase_admin.initialize_app(cred)
    db = firestore.client()
    print("Successfully connected to Firestore.")
except Exception as e:
    print(f"Error connecting to Firestore: {e}")
    print("Please make sure you have uploaded your service account key and updated the path.")
    # Re-raise so execution stops in this cell; otherwise `db` stays undefined and a
    # later cell fails with a much less helpful NameError.
    raise

Successfully connected to Firestore.


In [9]:
# 2. Google AI (Gemini) Authentication
# Use Colab's secret manager to securely store your API key.
# Click the "Key" icon on the left sidebar of Colab, add a new secret named "GOOGLE_API_KEY"
# (this must match the name read below), and paste your key there.
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
if not GEMINI_API_KEY:
    # Fail here rather than reporting success and erroring on the first model call.
    raise ValueError("Colab secret 'GOOGLE_API_KEY' is empty; store your Gemini API key in it.")
genai.configure(api_key=GEMINI_API_KEY)
print("Gemini API configured.")

Gemini API configured.


In [10]:
# Set up the "teacher" model that produces the analyses.
# Gemini 1.5 Flash is used here - it's fast, capable, and has a great free tier.
TEACHER_MODEL_NAME = 'gemini-1.5-flash-latest'
model = genai.GenerativeModel(model_name=TEACHER_MODEL_NAME)

# This is the "brain" of our teacher - a carefully engineered prompt template.
# It has a single `{article_text}` placeholder that generate_training_data() fills in
# via str.format(). Because str.format() is used, any literal `{` or `}` added to this
# template in the future must be doubled (`{{` / `}}`).
prompt_template = """
You are an expert financial analyst. Your task is to analyze a news article and transform it into a structured JSON object for training a machine learning model.

INSTRUCTION:
Analyze the following news article. Identify the core event, its direct first-order impacts, and hypothesize potential second-order effects on specific sectors or companies. Provide a confidence score for your hypotheses.

INPUT ARTICLE:
{article_text}

GUIDELINES FOR THE OUTPUT JSON:
1.  The entire output must be a single, valid JSON object.
2.  `event_summary`: A concise, one-sentence summary of the core event.
3.  `event_type`: Classify the event into one of the following categories: [Monetary Policy, Corporate Earnings, Geopolitical Tension, Market Trend, Regulation, Economic Report].
4.  `first_order_impacts`: An array of strings describing the immediate, obvious consequences.
5.  `second_order_hypotheses`: An array of JSON objects. Each object must contain:
    - `hypothesis`: A string describing a potential downstream effect.
    - `reasoning`: A string explaining the logic behind the hypothesis, connecting cause and effect.
    - `confidence_score`: A float between 0.0 and 1.0.
6.  The final output JSON should not contain any markdown formatting like ```json.

OUTPUT JSON:
"""

def generate_training_data(documents, filename="training_data.jsonl"):
    """Generate analysis examples via Gemini and save them to a JSON Lines file.

    For every document, an article text is built from its ``title`` and
    ``description`` fields, inserted into the module-level ``prompt_template``
    and sent to the module-level ``model``. Each successful response is written
    to ``filename`` as one JSON object per line with the keys ``instruction``,
    ``input`` and ``output``.

    Args:
        documents: Iterable of objects exposing ``.id`` and ``.to_dict()``
            (the caller in this notebook passes Firestore query results).
        filename: Path of the output file. It is opened in write mode, so an
            existing file is overwritten.

    Returns:
        None. Progress and a final summary are printed.

    Notes:
        Documents whose combined text is shorter than 100 characters are
        skipped. Any exception raised while handling a single document is
        printed and the loop continues with the next document.
    """
    instruction = (
        "Analyze the following news article. Identify the core event, its direct "
        "first-order impacts, and hypothesize potential second-order effects on "
        "specific sectors or companies. Provide a confidence score for your hypotheses."
    )
    count = 0
    with open(filename, 'w', encoding='utf-8') as f:
        for doc in documents:
            try:
                doc_data = doc.to_dict() or {}
                # Fields may be missing or stored as null; normalise both to '' so a
                # null title cannot render as "None" or break the slice in the log line.
                title = str(doc_data.get('title') or '')
                description = str(doc_data.get('description') or '')

                # We'll use title and description as the primary text.
                # Real newlines here: "\\n" would embed a literal backslash-n.
                article_text = f"Title: {title}\n\nDescription: {description}"

                # The length includes the "Title:"/"Description:" labels, as before.
                if len(article_text) < 100:
                    print(f"Skipping document {doc.id} due to insufficient text.")
                    continue

                # Format the prompt with the current article's text
                full_prompt = prompt_template.format(article_text=article_text)

                # Call the Gemini API
                response = model.generate_content(full_prompt)
                # Read .text before writing so a failure here leaves no partial entry.
                output_text = response.text

                # Create the JSONL entry
                jsonl_entry = {
                    "instruction": instruction,
                    "input": article_text,
                    "output": output_text,  # The raw JSON string from Gemini
                }

                # One JSON object per line: the separator must be a real newline,
                # otherwise the whole file collapses into a single invalid line.
                f.write(json.dumps(jsonl_entry) + "\n")
                count += 1
                print(f"Successfully processed document {count}: {title[:50]}...")

            except Exception as e:
                # Best-effort batch job: report the failure and move on.
                print(f"Could not process document {doc.id}. Error: {e}")

    print("\n--- DONE! ---")
    print(f"Successfully generated {count} training examples in '{filename}'.")

In [11]:
# --- SCRIPT EXECUTION ---
# Pull at most 100 raw documents out of the 'raw_documents' collection.
print("Fetching documents from Firestore...")
raw_documents_query = db.collection('raw_documents').limit(100)
docs_ref = raw_documents_query.stream()
# Materialise the stream so the documents can be counted before processing.
documents_to_process = [snapshot for snapshot in docs_ref]
print(f"Found {len(documents_to_process)} documents to process.")

# Hand the fetched documents to the generator defined above.
generate_training_data(documents_to_process)

Fetching documents from Firestore...
Found 77 documents to process.
Successfully processed document 1: Sabah looks beyond royalties to boost oil gains...
Successfully processed document 2: S&P 500 Gains and Losses Today: Centene Pulls Guid...
Successfully processed document 3: To L and Back Podcast: Ultimatum Queer Love Editio...
Successfully processed document 4: South Korea president says 'doing utmost' for trad...
Successfully processed document 5: A saturated job market is driving more entrepreneu...
Successfully processed document 6: Indonesia Free Meal Plan Stunted By Delays, Protes...
Successfully processed document 7: Amazon rolls out a stricter performance review pro...
Successfully processed document 8: NAPCO Security Technologies, Inc. (NASDAQ:NSSC) Sh...
Skipping document -3289438501650949891 due to insufficient text.
Successfully processed document 9: First Solana staking ETF hits $12M in ‘healthy’ fi...
Successfully processed document 10: Payment Hubs Rescue Banks From $1