# 2. LLM-Powered Feature Generation (Final, Robust Version)

### 1. Setup and Configuration

In [None]:
import pandas as pd
import google.generativeai as genai
import os
import time
from google.colab import drive, userdata
from tqdm import tqdm
import numpy as np
import pyarrow.parquet as pq
from google.api_core import retry

# --- Mount Google Drive ---
# The input and output Parquet files live on Drive, so it must be mounted
# before any path below can be resolved.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Drive mounted.")

# --- Configure API using Colab Secrets ---
print("Configuring Gemini API...")
try:
    # Only the secret lookup is guarded, so an unrelated failure inside
    # genai.configure() is never mistaken for a missing secret.
    api_key = userdata.get('GEMINI_API_KEY')
except userdata.SecretNotFoundError as err:
    # Chain the original error so the traceback shows the real cause.
    raise ValueError("API key not found. Please add 'GEMINI_API_KEY' to Colab Secrets (🔑 icon on the left).") from err
genai.configure(api_key=api_key)
print("Gemini API configured successfully.")

# --- Define File Paths ---
input_path = '/content/drive/MyDrive/Colab Notebooks/llm-feature-engineering/data/master_dataframe.parquet'
sample_output_path = '/content/drive/MyDrive/Colab Notebooks/llm-feature-engineering/data/sampled_df_with_llm_features.parquet'

# Fail fast: without the input file the processing cell cannot run, so do not
# report a successful setup.
if not os.path.exists(input_path):
    raise FileNotFoundError(f"Input file not found: {input_path}")

print("-" * 50)
print(f"Input file: {input_path}")
print(f"Sample output file: {sample_output_path}")
print("✅ Setup complete. Ready to process.")
print("-" * 50)

### 2. Prompt and Generation Function (with Exponential Backoff)

In [None]:
# Prompt and model shared by generate_llm_keywords (defined below).
#
# How generate_llm_keywords uses them:
# 1. It fills `prompt_template` with str.format(title=..., overview=...).
# 2. It passes a custom google.api_core Retry (exponential backoff) with the
#    request; the retry predicate only matches errors whose text contains "429".
#    NOTE(review): this presumably replaces the client library's default retry
#    policy for the call rather than stacking on top of it -- confirm.
# 3. It sleeps 5 seconds after each successful call to stay under the RPM limit.

# Few-shot prompt: role and instructions, two worked examples, then the
# `{title}` / `{overview}` placeholders for the movie to analyze. The model is
# asked to return ONLY a comma-separated list of 5-7 thematic keywords.
prompt_template = """
**Role**: You are a world-class film critic and movie analyst.
**Instruction**: Analyze the provided movie title and plot overview to identify its core underlying themes.
**Steps**: Distill these themes into a concise, comma-separated list of 5-7 thematic keywords. Focus on capturing the mood, central conflicts, and deeper meaning of the story. Avoid simply listing plot points.
**End Goal**: Return ONLY the comma-separated list of keywords.

---
**Movie Title**: The Matrix
**Plot Overview**: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.
**Thematic Keywords**: simulated reality, dystopian future, chosen one, rebellion, philosophical, cyberpunk
---
**Movie Title**: Forrest Gump
**Plot Overview**: The presidencies of Kennedy and Johnson, the Vietnam War, the Watergate scandal and other historical events unfold from the perspective of an Alabama man with an IQ of 75, whose only desire is to be reunited with his childhood sweetheart.
**Thematic Keywords**: historical epic, innocence, destiny, love, American history, serendipity
---

Now, generate the keywords for the following movie:
**Movie Title**: {title}
**Plot Overview**: {overview}
**Thematic Keywords**:
"""

# Single model handle created once and reused for every request.
model = genai.GenerativeModel('gemini-1.5-flash')

def generate_llm_keywords(title, overview, max_retries=5):
    """Ask the Gemini model for thematic keywords describing one movie.

    Args:
        title: Movie title inserted into the prompt.
        overview: Plot overview inserted into the prompt.
        max_retries: Approximate number of backoff retries allowed when the
            API answers with a 429. It sizes the total retry time budget
            (the sum of the first `max_retries` nominal backoff delays); the
            retry library may randomize individual waits, so the exact number
            of attempts can differ.

    Returns:
        The stripped response text on success. On failure one of two sentinel
        strings is returned instead of raising:
        "ERROR: Daily quota exceeded" when the error text mentions "quota",
        otherwise "ERROR: Unrecoverable API error".
    """
    # Nominal backoff schedule: 10s, 20s, 40s, ... capped at 300s per wait.
    initial_delay = 10.0
    max_delay = 300.0
    multiplier = 2.0

    # Only rate-limit responses (HTTP 429) are worth retrying.
    def is_retryable(e):
        return "429" in str(e)

    # Without an explicit deadline the Retry object falls back to the
    # library's default time budget, which cuts the schedule above short and
    # leaves `max_retries` with no effect. Derive the budget from it instead.
    retry_budget = sum(
        min(initial_delay * multiplier ** attempt, max_delay)
        for attempt in range(max_retries)
    )
    custom_retry = retry.Retry(
        predicate=is_retryable,
        initial=initial_delay,
        maximum=max_delay,
        multiplier=multiplier,
        deadline=retry_budget,
    )

    try:
        # Pass the custom retry object to the request.
        response = model.generate_content(
            prompt_template.format(title=title, overview=overview),
            request_options={'retry': custom_retry}
        )
        # On success, wait 5 seconds to respect the 15 RPM limit.
        time.sleep(5)
        return response.text.strip()
    except Exception as e:
        # Deliberate catch-all: one bad row must not abort a long batch run,
        # so failures are reported to the caller as sentinel strings.
        if "quota" in str(e).lower():
            print(f"DAILY QUOTA EXCEEDED. Cannot process '{title}'. Please wait for the quota to reset.")
            # If we hit a quota error, we must stop and return an error.
            return "ERROR: Daily quota exceeded"
        else:
            print(f"An unrecoverable error occurred for '{title}': {e}")
            return "ERROR: Unrecoverable API error"

print("✅ Robust keyword generation function defined.")

### 3. Sampling and Sequential Processing

In [None]:
# This cell draws a reproducible random sample from the full dataset, generates
# LLM keywords for each sampled row one at a time, and persists the results so
# that an interrupted or quota-limited run can be resumed later.

# --- Configuration ---
SAMPLE_SIZE = 10_000
# Number of newly processed rows between intermediate saves. A full run takes
# hours, so progress must not live only in memory.
CHECKPOINT_EVERY = 100
# Prefix of the sentinel string generate_llm_keywords returns when the API
# quota is exhausted. Matching on the prefix (rather than on the substring
# "quota") avoids stopping on a legitimate keyword that contains that word.
QUOTA_ERROR_PREFIX = "ERROR: Daily quota"


def _save_results(pending_df, keywords, previous_df, output_path):
    """Attach `keywords` to the first rows of `pending_df` and persist them.

    The new rows are written together with `previous_df` (results of earlier
    runs, or None) because the pyarrow Parquet writer cannot append to an
    existing file. The file is written to a temporary path and then moved into
    place so an interruption mid-write cannot corrupt earlier results.

    Returns the newly processed rows only (without `previous_df`).
    """
    new_rows = pending_df.iloc[:len(keywords)].copy()
    new_rows['llm_keywords'] = keywords
    if keywords:
        if previous_df is None or previous_df.empty:
            combined = new_rows
        else:
            combined = pd.concat([previous_df, new_rows], ignore_index=True)
        tmp_path = f"{output_path}.tmp"
        combined.to_parquet(tmp_path, engine='pyarrow', index=False)
        os.replace(tmp_path, output_path)
    return new_rows


# --- 1. Open a connection to the full Parquet file ---
print(f"Opening connection to full Parquet file: {input_path}")
parquet_file = pq.ParquetFile(input_path)
total_rows = parquet_file.metadata.num_rows
print(f"Total rows available in file: {total_rows:,}")

# --- 2. Create a random sample of the data ---
print(f"\nCreating a random sample of {SAMPLE_SIZE:,} rows...")

# Memory-efficient sampling by iterating through chunks.
# The fraction is clamped to 1.0 because DataFrame.sample rejects frac > 1
# (file smaller than SAMPLE_SIZE); an empty file yields an empty sample.
sampling_fraction = min(1.0, SAMPLE_SIZE / total_rows) if total_rows else 0.0
sample_chunks = []
for chunk in parquet_file.iter_batches(batch_size=200_000):
    chunk_df = chunk.to_pandas()
    # The fixed seed keeps the sample identical across runs, which the resume
    # logic below relies on to line up with rows that were already saved.
    sampled_chunk = chunk_df.sample(frac=sampling_fraction, random_state=42)
    sample_chunks.append(sampled_chunk)

if sample_chunks:
    sample_df = pd.concat(sample_chunks, ignore_index=True)
else:
    sample_df = pd.DataFrame()
sample_df = sample_df.head(SAMPLE_SIZE)

print(f"Successfully created a sample of {len(sample_df):,} rows.")

# --- 3. Check for and resume a previously stopped job ---
start_index = 0
processed_df = None
if os.path.exists(sample_output_path):
    processed_df = pd.read_parquet(sample_output_path)
    # Output written by an earlier version of this cell may end with the row
    # that hit the quota limit. That row was never actually processed, so drop
    # it and let it be retried.
    while len(processed_df) and str(processed_df['llm_keywords'].iloc[-1]).startswith(QUOTA_ERROR_PREFIX):
        processed_df = processed_df.iloc[:-1]
    start_index = len(processed_df)
    print(f"Resuming from index {start_index} of the sample.")
    # To resume, we need to align the already processed data with the sample
    sample_df = sample_df.iloc[start_index:].copy()
    print(f"Starting to process {len(sample_df):,} remaining rows.")
else:
    print("Starting a new processing job.")

# --- 4. Process the SAMPLE DataFrame using a sequential for loop ---
if not sample_df.empty:
    # Keywords for the rows of sample_df processed so far, in row order.
    llm_keywords_list = []

    try:
        # Use tqdm directly on the iterator to show progress
        for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Processing Rows"):
            # Call the function for each row, one by one
            keywords = generate_llm_keywords(row['title'], row['plot_overview'])

            # If we hit a hard quota error, stop the entire process. The
            # failed row is NOT recorded, so a resumed run retries it.
            if keywords.startswith(QUOTA_ERROR_PREFIX):
                print("Stopping execution due to daily quota limit.")
                break

            llm_keywords_list.append(keywords)

            if len(llm_keywords_list) % CHECKPOINT_EVERY == 0:
                _save_results(sample_df, llm_keywords_list, processed_df, sample_output_path)
    finally:
        # --- 5. Persist whatever was processed, even after an interrupt ---
        processed_chunk = _save_results(sample_df, llm_keywords_list, processed_df, sample_output_path)

print("-" * 50)
print("✅ Processing finished.")
print("-" * 50)