In [4]:
import google.generativeai as genai
import pandas as pd
import json
import os
import time
import sys

# --- File Configuration ---
# We will now use a progress file to track our state.
INPUT_FILE = "clinton_emails.json"
OUTPUT_JSON_FILE = "clinton_analysis_results.json"
PROGRESS_FILE = "clinton_analysis_progress.txt"


def configure_api_and_test():
    """
    Configures the genai API key from Colab Secrets or env variables
    and runs a simple test prompt to confirm it's working.
    """
    API_KEY = None
    try:
        # Check if running in Google Colab and get the key from Colab's "Secrets"
        import google.colab.userdata
        API_KEY = google.colab.userdata.get('GEMINI_API_KEY')
        if not API_KEY:
            print("="*50)
            print("ERROR: 'GEMINI_API_KEY' not found in Colab Secrets.")
            print("Please click the 'Key' (ðŸ”‘) icon in the left sidebar and add a new secret named 'GEMINI_API_KEY'.")
            print("="*50)
            sys.exit(1)
        print("Successfully loaded API key from Colab Secrets.")

    except ImportError:
        # Not in Colab, fall back to standard environment variables
        try:
            API_KEY = os.environ["GEMINI_API_KEY"]
            print("Successfully loaded API key from environment variable.")
        except KeyError:
            print("="*50)
            print("ERROR: 'GEMINI_API_KEY' not found in environment variables.")
            print("If not in Colab, please set the environment variable (e.g., 'export GEMINI_API_KEY=YOUR_KEY')")
            print("="*50)
            sys.exit(1)
    except Exception as e:
        print(f"Error loading API key: {e}")
        sys.exit(1)

    # Configure the API
    try:
        genai.configure(api_key=API_KEY)
    except Exception as e:
        print(f"Error configuring Generative AI: {e}")
        sys.exit(1)



# --- AI Prompt Engineering ---

# This SYSTEM_PROMPT defines the AI's role.
SYSTEM_PROMPT = """
You are an AI assistant specializing in the analysis of government and diplomatic correspondence.
Your task is to analyze the text of a single email (which includes headers, body, and metadata) and extract two distinct categories of features:
1.  **Privacy Features**: Information that must be protected, identified, or redacted.
2.  **Utility Features**: The core, non-sensitive insights or purpose of the communication.
"""

# This USER_PROMPT_TEMPLATE is the detailed instruction for *each* email.
# It is tailored specifically to the clinton_emails.json dataset.
USER_PROMPT_TEMPLATE = """
Analyze the following email data (which includes metadata and the full body) and provide the output as a single, minified JSON object.
Use "N/A" if a specific feature is not found.

---
### **FEATURE DEFINITIONS**
---

#### 1. Privacy Features (Data to Protect/Identify)
* `Name`: Full names of people (e.g., "Hillary Clinton", "Lanny Davis").
* `Email Address`: All email addresses (e.g., "hrod17@clintonemail.com").
* `Employment Details`: Specific job titles linked to individuals (e.g., "Special counsel to President Bill Clinton").
* `Private Organization`: Names of private firms or non-governmental entities (e.g., "Lanny J. Davis & Associates LLC").
* `Personal URL`: Specific, non-public URLs.
* `Classification Marking`: Security headers (e.g., "UNCLASSIFIED", "RELEASE IN PART", "CONFIDENTIAL").
* `Redaction Code`: Exemption codes replacing text (e.g., "B6").
* `Case/Document Number`: Legal/archival identifiers (e.g., "Case No. F-2014-20439", "Doc No. C05775652").
* `Sender-Recipient Pair`: The primary sender and recipient as a string (e.g., "Hillary Clinton -> Jake Sullivan").
* `Timestamp`: The primary date and time of the email (e.g., "2010-06-09 04:06").

#### 2. Utility Features (Valuable Insights to Extract)
* `Topic`: The core subject or theme in a few words (e.g., "Gaza flotilla incident op-ed", "Memo on Colombia", "Greek debt crisis").
* `Summary`: A brief, 1-2 sentence non-private summary of the email's content.
* `Key Entities (Non-PII)`: Public figures, governments, or public organizations *being discussed* (e.g., "Turkish PM Erdogan", "Israel", "Washington Post").
* `Intent/Stance`: The purpose of the email (e.g., "Action-Required", "Forwarding", "Persuasive", "Informational").
* `Action Item`: Any specific, direct request or task (e.g., "Pls read and discuss", "Pls print 2 copies").

---
### **EXAMPLE**
---
**Example Input:**
Email Metadata:
From: Hillary Clinton
To: Jake Sullivan
Date: 2010-06-09 04:06
Subject: TO TURKISH PM ERDOGAN: TIME TO CONSIDER ALL THE FACTS

Email Body:
UNCLASSIFIED U.S. Department of State Case No. F-2014-20439 Doc No. C05775652...RELEASE IN PART B6...From: H <hrod17@clintonemail.com>...To: 'sullivanjj@state.gov'...Pls read and discuss...Original Message...From: Lanny Davis...Subject: To Turkish PM Erdogan: Time to Consider all the Facts...[Op-ed text follows]...Lanny J. Davis...Special counsel to President Bill Clinton...

**Example Output JSON:**
{{"Name": "Hillary Clinton, Jake Sullivan, Lanny Davis, Recep Tayyip Erdogan", "Email Address": "hrod17@clintonemail.com, sullivanjj@state.gov", "Employment Details": "Special counsel to President Bill Clinton", "Private Organization": "Lanny J. Davis & Associates LLC", "Personal URL": "http://community.icontact.com/p/wwwlannydavis", "Classification Marking": "UNCLASSIFIED, RELEASE IN PART", "Redaction Code": "B6", "Case/Document Number": "Case No. F-2014-20439, Doc No. C05775652", "Sender-Recipient Pair": "Hillary Clinton -> Jake Sullivan", "Timestamp": "2010-06-09 04:06", "Topic": "Op-ed on Gaza flotilla incident and Turkish-Israeli relations", "Summary": "A forwarded op-ed from Lanny Davis arguing that Turkish PM Erdogan should consider all facts regarding the Gaza flotilla incident before criticizing Israel, drawing parallels to Turkey's own history.", "Key Entities (Non-PII)": "Turkish PM Erdogan, Israel, Turkey, Washington Post", "Intent/Stance": "Action-Required, Forwarding, Persuasive", "Action Item": "Pls read and discuss."}}

---
### **EMAIL TO ANALYZE**
---
{email_context}
"""

def analyze_single_email(email_obj):
    """
    Analyzes a single email object using the Gemini API.
    """
    email_body = email_obj.get('body')
    email_id = email_obj.get('email_id', 'N/A')

    if not email_body:
        print(f"  > Skipped email_id: {email_id} (no body text)")
        return None

    # --- PROMPT SIZE FIX ---
    # Instead of dumping the *entire* JSON object, we build a clean
    # context string. This is *much* smaller and avoids token limits.
    context_string = f"""
    Email Metadata:
    From: {email_obj.get('from', 'N/A')}
    To: {email_obj.get('to', 'N/A')}
    Date: {email_obj.get('date', 'N/A')}
    Subject: {email_obj.get('subject', 'N/A')}

    Email Body:
    {email_body}
    """

    print(f"  > Analyzing email_id: {email_id}...")

    model = genai.GenerativeModel(
        model_name='gemini-2.5-flash',
        system_instruction=SYSTEM_PROMPT
    )

    full_prompt = USER_PROMPT_TEMPLATE.format(email_context=context_string)

    try:
        response = model.generate_content(full_prompt)
        response_text = response.text.strip()

        if response_text.startswith("```json"):
            response_text = response_text[7:-3].strip()

        json_object = json.loads(response_text)
        json_object['original_email_id'] = email_id
        json_object['original_text'] = email_body # Added original body text post-analysis

        print(f"  > Success for email_id: {email_id}")
        return json_object

    except json.JSONDecodeError:
        print(f"  > FAILED to decode JSON for email_id: {email_id}")
        print(f"  > Raw response: {response_text}")
        return None
    except Exception as e:
        print(f"  > ERROR during API call for email_id: {email_id}: {e}")
        # This will catch API errors, token limits, quota issues, etc.
        # Re-raise the exception to trigger the graceful stop
        raise e

def load_existing_results(json_path):
    """Loads existing analysis results from the JSON file if it exists."""
    if not os.path.exists(json_path):
        print("No existing results file found. Starting fresh.")
        return []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            results = json.load(f)
            print(f"Loaded {len(results)} existing results from '{json_path}'.")
            return results
    except (json.JSONDecodeError, Exception) as e:
        print(f"Error loading '{json_path}': {e}. Starting with a new list.")
        # If file is corrupt, start fresh to avoid losing data
        return []

def get_start_index(progress_file):
    """Reads the progress file to find where to resume."""
    if not os.path.exists(progress_file):
        print("No progress file found. Starting from index 0.")
        return 0

    try:
        with open(progress_file, 'r') as f:
            start_index = int(f.read().strip())
            print(f"Resuming analysis from index {start_index}.")
            return start_index
    except Exception as e:
        print(f"Error reading progress file '{progress_file}': {e}. Starting from 0.")
        return 0

def save_progress(progress_file, index):
    """Saves the index of the *next* email to process."""
    try:
        with open(progress_file, 'w') as f:
            f.write(str(index))
        print(f"Progress saved: Next run will start at index {index}.")
    except Exception as e:
        print(f"Error saving progress to '{progress_file}': {e}")

def save_final_results(all_results, output_json_path):
    """Saves the final, complete list of results to the JSON file."""
    if not all_results:
        print("No results to save.")
        return

    print(f"\nSaving {len(all_results)} total results to '{output_json_path}'...")

    # We *overwrite* the file with the full, updated list
    # This ensures the JSON is always valid
    try:
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=4, ensure_ascii=False)
        print(f"Successfully saved all results to '{output_json_path}'")
    except Exception as e:
        print(f"Error saving to JSON file: {e}")

def process_emails(input_json_path, output_json_path, progress_file, limit=None):
    """
    Reads a JSON file of emails, processes each one, and handles graceful saving.
    """
    print(f"Starting to process '{input_json_path}'...")

    try:
        with open(input_json_path, 'r', encoding='utf-8') as f:
            all_emails = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{input_json_path}' was not found.")
        print("Please make sure 'clinton_emails.json' is uploaded to your Colab environment.")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{input_json_path}'.")
        return

    # --- Resume Logic ---
    all_results = load_existing_results(output_json_path)
    start_index = get_start_index(progress_file)

    # This tracks the index of the last *successful* analysis in this run
    last_successful_index = -1

    # --- Slice emails to process ---
    # Apply sample limit *first* if it exists
    total_emails_to_process = all_emails
    if limit is not None:
        # Ensure limit doesn't go out of bounds, especially with start_index
        end_limit = min(start_index + limit, len(all_emails))
        total_emails_to_process = all_emails[:end_limit]
        print(f"--- SAMPLE MODE: Processing a max of {limit} emails, starting from {start_index}. ---")

    # Now, get the actual slice of emails we need to work on
    emails_to_process_slice = total_emails_to_process[start_index:]

    if not emails_to_process_slice:
        print("No new emails to process. All analysis may be complete.")

    try:
        # Use enumerate(start=start_index) to get the *correct global index*
        for global_index, email_obj in enumerate(total_emails_to_process[start_index:], start=start_index):

            print(f"\nProcessing email (Global Index {global_index}) of {len(all_emails)}...")

            analysis_result = analyze_single_email(email_obj)

            if analysis_result:
                all_results.append(analysis_result)
                # We record the *global index* of the last success
                last_successful_index = global_index

            time.sleep(1)

    except KeyboardInterrupt:
        print("\n="*50)
        print("STOP command (Ctrl+C) received.")
        print("Terminating analysis and saving results/progress...")
        print("="*50)
    except Exception as e:
        print("\n="*50)
        print(f"An unexpected error occurred: {e}")
        print("This may be an API token limit or quota issue.")
        print("Terminating analysis and saving partial results/progress...")
        print("="*50)

    finally:
        # --- Save Progress ---
        if last_successful_index != -1:
            # If we successfully processed email at index `X`,
            # the *next* run should start at index `X + 1`.
            next_index_to_process = last_successful_index + 1
            save_progress(progress_file, next_index_to_process)
        else:
            print("No new emails were successfully processed in this run. Progress file not updated.")

        # --- Save Full Results ---
        # This saves the combined list of (old + new) results
        save_final_results(all_results, output_json_path)
        print("Analysis finished.")

# --- How to run the script ---
if __name__ == "__main__":

    # --- 1. CONFIGURE AND TEST API ---
    # This will exit if the key is missing or the test prompt fails.
    configure_api_and_test()

    # --- 2. SET FILEPATHS AND LIMIT ---
    # Filepaths are defined at the top of the script

    # Set SAMPLE_LIMIT to 3 to test the first 3 emails (from the start_index)
    # Set SAMPLE_LIMIT to None to process all emails in the file (from the start_index)
    SAMPLE_LIMIT = None

    # --- 3. RUN ANALYSIS ---
    process_emails(
        input_json_path=INPUT_FILE,
        output_json_path=OUTPUT_JSON_FILE,
        progress_file=PROGRESS_FILE,
        limit=SAMPLE_LIMIT
    )

Successfully loaded API key from Colab Secrets.
Starting to process 'clinton_emails.json'...
Loaded 1004 existing results from 'clinton_analysis_results.json'.
Resuming analysis from index 1093.

Processing email (Global Index 1093) of 2050...
  > Analyzing email_id: 17336...


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 1668.63ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 2808.25ms



=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
STOP command (Ctrl+C) received.
Terminating analysis and saving results/progress...
No new emails were successfully processed in this run. Progress file not updated.

Saving 1004 total results to 'clinton_analysis_results.json'...
Successfully saved all results to 'clinton_analysis_results.json'
Analysis finished.
