# LLM-Powered Feature Generation

### Setup and Configuration

In [None]:
import pandas as pd
import google.generativeai as genai
import os
import time
from google.colab import drive, userdata
from tqdm import tqdm
import numpy as np
import pyarrow.parquet as pq
from google.api_core import retry

# Mount Google Drive to access project files.
drive.mount('/content/drive')
print("Drive mounted.")

# Securely configure the Gemini API using Colab Secrets.
print("Configuring Gemini API...")
try:
    # Only the secret lookup can raise SecretNotFoundError, so keep the try
    # body limited to it; errors from genai.configure() surface unchanged.
    api_key = userdata.get('GEMINI_API_KEY')
except userdata.SecretNotFoundError as err:
    # Chain the original error so the traceback shows the root cause.
    raise ValueError("API key not found. Please add 'GEMINI_API_KEY' to Colab Secrets (🔑 icon on the left).") from err
genai.configure(api_key=api_key)
print("Gemini API configured successfully.")

# Define input and output file paths in Google Drive.
input_path = '/content/drive/MyDrive/Colab Notebooks/llm-feature-engineering/data/master_dataframe.parquet'
sample_output_path = '/content/drive/MyDrive/Colab Notebooks/llm-feature-engineering/data/sampled_df_with_llm_features.parquet'

print("-" * 50)
print(f"Input file: {input_path}")
print(f"Sample output file: {sample_output_path}")
print("✅ Setup complete. Ready to process.")
print("-" * 50)

### Prompt and Generation Function

In [None]:
# Few-shot prompt used for thematic keyword generation. It contains two worked
# examples followed by the target movie; the {title} and {overview}
# placeholders are filled in via str.format() by generate_llm_keywords().
prompt_template = """
**Role**: You are a world-class film critic and movie analyst.
**Instruction**: Analyze the provided movie title and plot overview to identify its core underlying themes.
**Steps**: Distill these themes into a concise, comma-separated list of 5-7 thematic keywords. Focus on capturing the mood, central conflicts, and deeper meaning of the story. Avoid simply listing plot points.
**End Goal**: Return ONLY the comma-separated list of keywords.

---
**Movie Title**: The Matrix
**Plot Overview**: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.
**Thematic Keywords**: simulated reality, dystopian future, chosen one, rebellion, philosophical, cyberpunk
---
**Movie Title**: Forrest Gump
**Plot Overview**: The presidencies of Kennedy and Johnson, the Vietnam War, the Watergate scandal and other historical events unfold from the perspective of an Alabama man with an IQ of 75, whose only desire is to be reunited with his childhood sweetheart.
**Thematic Keywords**: historical epic, innocence, destiny, love, American history, serendipity
---

Now, generate the keywords for the following movie:
**Movie Title**: {title}
**Plot Overview**: {overview}
**Thematic Keywords**:
"""

# Module-level Gemini model handle; generate_llm_keywords() sends every
# request through this single instance.
model = genai.GenerativeModel('gemini-1.5-flash')

def generate_llm_keywords(title, overview, max_retries=5):
    """Generate thematic keywords for one movie using the Gemini model.

    The prompt is built from ``prompt_template`` and sent through the
    module-level ``model``. Rate-limit failures (errors whose text contains
    "429") are retried with exponential backoff.

    Args:
        title: Movie title substituted into the prompt.
        overview: Plot overview substituted into the prompt.
        max_retries: Number of backoff steps to budget for rate-limit
            retries. google-api-core's ``Retry`` is bounded by elapsed time
            rather than by attempt count, so the retry timeout is set to the
            sum of the first ``max_retries`` backoff delays. Because the
            library applies jitter to each delay, this is an approximate cap
            on the number of attempts, not an exact one.

    Returns:
        The stripped response text on success. On failure, one of the
        sentinel strings ``"ERROR: Daily quota exceeded"`` or
        ``"ERROR: Unrecoverable API error"``; this function does not raise
        for API errors.
    """
    initial_delay = 10.0
    max_delay = 300.0
    multiplier = 2.0

    # Define a custom retry predicate for rate limit errors.
    def is_retryable(e):
        return isinstance(e, Exception) and "429" in str(e)

    # Without an explicit timeout the library default deadline applies, which
    # is far shorter than `max_delay` and silently ignores `max_retries`.
    retry_budget = sum(
        min(initial_delay * multiplier ** attempt, max_delay)
        for attempt in range(max(0, max_retries))
    )

    # Configure exponential backoff for retries.
    custom_retry = retry.Retry(
        predicate=is_retryable,
        initial=initial_delay,
        maximum=max_delay,
        multiplier=multiplier,
        timeout=retry_budget,
    )

    try:
        # Generate content with the custom retry configuration.
        response = model.generate_content(
            prompt_template.format(title=title, overview=overview),
            request_options={'retry': custom_retry}
        )
        # Pause to respect the API's rate limit.
        time.sleep(5)
        return response.text.strip()
    except Exception as e:
        # Errors are reported as sentinel strings so the caller's loop can
        # record the failure (or stop on quota exhaustion) without crashing.
        if "quota" in str(e).lower():
            print(f"DAILY QUOTA EXCEEDED. Cannot process '{title}'. Please wait for the quota to reset.")
            # Stop processing if the daily quota is exceeded.
            return "ERROR: Daily quota exceeded"
        else:
            print(f"An unrecoverable error occurred for '{title}': {e}")
            return "ERROR: Unrecoverable API error"

print("✅ Robust keyword generation function defined.")

### Sampling and Sequential Processing

In [None]:
# Creates a random sample and processes it sequentially, one row at a time.
# A standard for-loop guarantees one-at-a-time processing.

# --- Configuration ---
SAMPLE_SIZE = 10_000
# Rewrite the output file after this many newly processed rows so that an
# interrupted session loses at most one checkpoint interval of API calls.
CHECKPOINT_EVERY = 100


def _save_progress(previous_df, pending_df, keywords):
    """Write all rows processed so far to ``sample_output_path``.

    Args:
        previous_df: Rows loaded from an earlier run's output file, or None
            when starting a new job.
        pending_df: The rows being processed in this run, in processing order.
        keywords: Keywords generated so far; ``keywords[i]`` belongs to
            ``pending_df.iloc[i]``.

    The pyarrow engine cannot append to an existing Parquet file, so the
    earlier rows and the new rows are concatenated and the file is rewritten.
    Nothing is written when ``keywords`` is empty.
    """
    if not keywords:
        return
    newly_done = pending_df.iloc[:len(keywords)].copy()
    newly_done['llm_keywords'] = keywords
    frames = [newly_done] if previous_df is None else [previous_df, newly_done]
    pd.concat(frames, ignore_index=True).to_parquet(
        sample_output_path, engine='pyarrow', index=False
    )


# Open a connection to the full Parquet file.
print(f"Opening connection to full Parquet file: {input_path}")
parquet_file = pq.ParquetFile(input_path)
total_rows = parquet_file.metadata.num_rows
print(f"Total rows available in file: {total_rows:,}")

# Create a random sample of the data.
print(f"\nCreating a random sample of {SAMPLE_SIZE:,} rows...")

# Sample data efficiently by iterating through chunks.
# Clamp the fraction: DataFrame.sample rejects frac > 1 without replacement,
# and an empty file would otherwise divide by zero.
sampling_fraction = min(1.0, SAMPLE_SIZE / total_rows) if total_rows else 0.0
# The fixed random_state keeps the sample identical across runs, which the
# resume logic below relies on to line up with already-saved rows.
sample_chunks = [
    chunk.to_pandas().sample(frac=sampling_fraction, random_state=42)
    for chunk in parquet_file.iter_batches(batch_size=200_000)
]

if sample_chunks:
    sample_df = pd.concat(sample_chunks, ignore_index=True).head(SAMPLE_SIZE)
else:
    # pd.concat raises on an empty list; fall back to an empty sample.
    sample_df = pd.DataFrame()

print(f"Successfully created a sample of {len(sample_df):,} rows.")

# Check for and resume a previously stopped job.
start_index = 0
processed_df = None
if os.path.exists(sample_output_path):
    processed_df = pd.read_parquet(sample_output_path)
    start_index = len(processed_df)
    print(f"Resuming from index {start_index} of the sample.")
    # Align the sample with the already processed data to resume.
    sample_df = sample_df.iloc[start_index:].copy()
    print(f"Starting to process {len(sample_df):,} remaining rows.")
else:
    print("Starting a new processing job.")

# Process the sample sequentially using a for loop.
if not sample_df.empty:
    # Store the generated keywords in a list.
    llm_keywords_list = []
    row_inputs = zip(sample_df['title'], sample_df['plot_overview'])

    try:
        # Use tqdm for a progress bar.
        for title, overview in tqdm(row_inputs, total=len(sample_df), desc="Processing Rows"):
            # Generate keywords for each row.
            keywords = generate_llm_keywords(title, overview)

            # Stop processing if the daily quota is exceeded. The failed row
            # is deliberately NOT recorded, so a later resume retries it
            # instead of treating the error sentinel as a finished result.
            if keywords.startswith("ERROR:") and "quota" in keywords:
                print("Stopping execution due to daily quota limit.")
                break

            llm_keywords_list.append(keywords)

            if len(llm_keywords_list) % CHECKPOINT_EVERY == 0:
                _save_progress(processed_df, sample_df, llm_keywords_list)
    finally:
        # Persist whatever was completed, including when the loop is
        # interrupted by an exception or a manual stop.
        _save_progress(processed_df, sample_df, llm_keywords_list)

print("-" * 50)
print("✅ Processing finished.")
print("-" * 50)