In [None]:
import os
import pandas as pd
import random
import time
import requests
from IPython.display import display  # For displaying DataFrame as a Jupyter table

# Upper bound on the number of rows that receive a generated context in a
# single run, counted across all CSV files; adjust as needed.
MAX_PROCESSED_ROWS = 100_000

def generate_context_from_model(crime_type, last_outcome, location, i):
    """
    Generates a short retrospective context for a crime that has already occurred,
    using the Google Gemma model ("google/gemma-2-2b-it") via the Hugging Face
    Inference API.

    The request is retried up to five times when the API reports an error, the
    model is still loading, the network call fails, or the response body is not
    valid JSON. After generation, the echoed prompt (if present) is removed from
    the output, newlines are replaced by spaces, asterisks and underscores are
    dropped, and the result is printed.

    Parameters:
        crime_type (str): The type of crime that was committed.
        last_outcome (str): The "Last outcome category" field.
        location (str): The "Location" field.
        i (int): The current row number (for logging purposes).

    Returns:
        str: The generated context text, or "No additional details available."
        when every attempt failed or the response had an unexpected shape.

    Raises:
        EnvironmentError: If the HF_API_KEY environment variable is not set.
    """
    # Get the API key from the environment variable.
    HF_API_TOKEN = os.environ.get("HF_API_KEY")
    if not HF_API_TOKEN:
        raise EnvironmentError("HF_API_KEY not found in environment variables.")

    fallback = "No additional details available."

    # Specify the model ID (Google Gemma, instruction-tuned 2B variant).
    model_id = "google/gemma-2-2b-it"
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

    # The seed is only embedded in the prompt text to vary the output between
    # rows; it is not passed to the API as a sampling parameter.
    seed = random.randint(1, 100000)
    print(f"Generating row {i} via seed: {seed}")

    # Construct the prompt (sent in full, not trimmed by Python).
    # It asks for British English, professional and concise retrospective text
    # on a single line, under 150 characters, without bold or italic markup.
    prompt = (
        f"Seed {seed}: Use British English and professional language. Do not use bold or italic syntax."
        f" Generate realistic, concise retrospective context for a crime of. Write context in a few sentences on one line. Do not write over multiple lines."
        f" Type: {crime_type}, Last outcome: {last_outcome}, Location: {location}."
        f" Keep under 150 characters. Only include the context."
    )

    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

    max_retries = 5
    result = None
    for attempt in range(max_retries):
        try:
            # A timeout keeps one stalled connection from hanging the whole batch.
            response = requests.post(
                API_URL, headers=headers, json={"inputs": prompt}, timeout=60
            )
            result = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network failure, timeout, or a non-JSON body (e.g. an HTML error page).
            print(f"Attempt {attempt + 1}: Request failed: {exc}. Retrying in 2 seconds...")
            time.sleep(2)
            continue

        # Error payloads are JSON objects; successful generations are a list.
        if isinstance(result, dict) and "error" in result:
            error_message = str(result["error"])
            if "loading" in error_message.lower():
                wait_time = result.get("estimated_time", 5)
                print(f"Attempt {attempt + 1}: Model is loading. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Attempt {attempt + 1}: Error: {error_message}. Retrying in 2 seconds...")
                time.sleep(2)
        else:
            break
    else:
        # Every attempt ended in an error.
        return fallback

    # Process the result.
    if (
        isinstance(result, list)
        and result
        and isinstance(result[0], dict)
        and "generated_text" in result[0]
    ):
        output = str(result[0]["generated_text"])
        # Remove the prompt only when the model actually echoed it; slicing
        # unconditionally would cut into the generated text otherwise.
        if output.startswith(prompt):
            output = output[len(prompt):]
        generated_text = output.strip()
        # Filter out newline characters (\n), asterisks (*), and underscores (_).
        generated_text = generated_text.replace('\n', ' ').replace('*', '').replace('_', '').strip()
        # Print the generated context after generation.
        print(f"Generated context: {generated_text}\n")
        return generated_text

    return fallback

def process_csv_files(directory):
    """
    Walks `directory` recursively and fills in the 'Context' column of every CSV file found.

    A file is processed only if it contains all of the columns 'Crime type', 'Month',
    'Last outcome category', 'Location' and 'Context'; other files are skipped.
    For each row whose 'Context' is empty or null, a context is generated from the
    'Crime type', 'Last outcome category' and 'Location' values. At most
    MAX_PROCESSED_ROWS rows are updated in total across all files.

    NOTE: the 'Month' column is required to be present but is not used to filter rows.

    The file is written back every 20 generated responses (counted across files) as a
    backup, and again once the file is finished. If an exception interrupts processing,
    rows generated since the last backup are still written before the exception
    propagates. Files in which no row needed a context are left untouched.

    After processing, the updated rows are displayed as a Jupyter table.

    Parameters:
        directory (str): Root directory to search for .csv files.
    """
    total_processed = 0  # Total number of rows processed across all CSV files.
    processed_rows = []  # One DataFrame of updated rows per file that had updates.
    required_cols = {"Crime type", "Month", "Last outcome category", "Location", "Context"}

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                # Best effort: an unreadable file must not stop the remaining files.
                print(f"Error reading {file_path}: {e}")
                continue
            print(f"Total rows: {df.shape[0]}")

            # Ensure required columns exist.
            if not required_cols.issubset(df.columns):
                print(f"Skipping {file_path}, missing one or more required columns: {required_cols}")
                continue
            print("Required columns found.")

            # An entirely empty 'Context' column is read as float64; cast to object
            # so that assigning generated strings is a valid, warning-free operation.
            df["Context"] = df["Context"].astype(object)

            updated_idx = []   # Index labels updated in this file.
            unsaved = False    # True while the file on disk lags behind `df`.
            try:
                for idx in df.index:
                    if total_processed >= MAX_PROCESSED_ROWS:
                        break

                    # Only update Context if it is empty or null.
                    current_context = df.at[idx, "Context"]
                    if pd.notna(current_context) and str(current_context).strip():
                        continue

                    context = generate_context_from_model(
                        df.at[idx, "Crime type"],
                        df.at[idx, "Last outcome category"],
                        df.at[idx, "Location"],
                        idx,
                    )
                    df.at[idx, "Context"] = context
                    updated_idx.append(idx)
                    unsaved = True
                    total_processed += 1

                    # Save a backup every 20 responses.
                    if total_processed % 20 == 0:
                        df.to_csv(file_path, index=False)
                        unsaved = False
                        print(f"Backup saved after {total_processed} responses in file: {file_path}")
            finally:
                # Persist anything generated since the last backup, including when
                # the loop was interrupted by an exception or KeyboardInterrupt.
                if unsaved:
                    df.to_csv(file_path, index=False)

            if updated_idx:
                processed_rows.append(df.loc[updated_idx])
            print(f"Finished processing file: {file_path}")
            if total_processed >= MAX_PROCESSED_ROWS:
                break
        if total_processed >= MAX_PROCESSED_ROWS:
            break

    print(f"Total processed rows: {total_processed}")

    # Combine and display the processed rows as a Jupyter table.
    if processed_rows:
        all_processed = pd.concat(processed_rows, ignore_index=True)
        print("Processed rows:")
        display(all_processed)
    else:
        print("No rows were processed.")

# Example usage: runs only when this code is executed as the main module.
if __name__ == "__main__":
    # Replace "csv" with the path to the directory to scan; it is walked
    # recursively, so CSV files may sit directly inside it or in subfolders.
    process_csv_files("csv")


Processing file: csv\kc_crimes_part1.csv
Total rows: 4553
Required columns found.
Generating row 720 via seed: 63469
Generated context: Optimum character count: 80 characters. The incident took place near Optimum Car Sales, on the Sydenham Road, where a group of youth aggressively behaved and were disrespectful towards local shopkeepers.

Generating row 721 via seed: 560
Generated context: The burglary occurred at a residence that was being occupied by the elderly couple.  The couple's son, residing elsewhere, had noticed the house was ransacked and raising immediate concerns. The son called the police, who conducted a thorough investigation. No suspect was identified.

Generating row 722 via seed: 81207
Generated context: The investigation into a suspected arson and criminal damage incident near Bedford Place remains unsolved. Despite numerous leads, no suspect has been identified. The damage was extensive, with the incident reported at around 09:00 on 18th August.

Generating row 723