In [None]:
# CONFIGURATION - Edit this section for each report
import os

# Starting point for locating the project: this file when `__file__` is
# defined, otherwise the current working directory.
_anchor_path = os.path.abspath(__file__ if '__file__' in globals() else '.')

# BASE_DIR is that absolute path with os.path.dirname applied three times.
_one_up = os.path.dirname(_anchor_path)
_two_up = os.path.dirname(_one_up)
BASE_DIR = os.path.dirname(_two_up)
REPORTS_DIR = os.path.join(BASE_DIR, "reports")


def _report_files(input_name, output_name):
    """Return the input/output CSV paths (both inside REPORTS_DIR) for one report."""
    return {
        "input_csv": os.path.join(REPORTS_DIR, input_name),
        "output_csv": os.path.join(REPORTS_DIR, output_name),
    }


report_config = {
    # Which report to run. Options: ANNUAL_REPORT_2023, ART7_REPORT_2022,
    # CAMBODIA_CMR_2023, CAMBODIA_MINES_2023, IWP_2023
    "report_name": "ANNUAL_REPORT_2023",

    # Model configuration: model identifier -> name of the result column
    "models": {
        "mistral:instruct": "mistral-instruct",
        "vicuna": "vicuna",
        "llama3": "llama3",
    },

    # Processing settings
    "request_timeout": 2000,
    "use_groq": False,  # Set True for Groq API models
    "use_azure_openai": False,  # Set True for Azure OpenAI models
}

# Predefined per-report file locations, all resolved relative to REPORTS_DIR.
# NOTE(review): ANNUAL_REPORT_2023 reads from and writes to the same file,
# unlike every other entry -- confirm this is intended.
predefined_configs = {
    "ANNUAL_REPORT_2023": _report_files(
        "ANNUAL-REPORT-2023-output.csv",
        "ANNUAL-REPORT-2023-output.csv",
    ),
    "ART7_REPORT_2022": _report_files(
        "2023-Cambodia-Art7Report-for2022.csv",
        "2023-Cambodia-Art7Report-for2022-output.csv",
    ),
    "CAMBODIA_CMR_2023": _report_files(
        "CAMBODIA_CLEARING_CMR_2023.csv",
        "CAMBODIA_CLEARING_CMR_2023-output.csv",
    ),
    "CAMBODIA_MINES_2023": _report_files(
        "Cambodia_Clearing_the_Mines_2023.csv",
        "Cambodia_Clearing_the_Mines_2023-output.csv",
    ),
    "IWP_2023": _report_files(
        "IWP-2023.csv",
        "IWP-2023-output.csv",
    ),
}

# Merge the selected report's paths into report_config. A report name without
# a predefined entry leaves report_config untouched.
try:
    config = predefined_configs[report_config["report_name"]]
except KeyError:
    pass
else:
    report_config.update(config)

print(f"Processing report: {report_config['report_name']}")
print(f"Input CSV: {report_config.get('input_csv', 'Not specified')}")
print(f"Output CSV: {report_config.get('output_csv', 'Not specified')}")
print(f"Base directory: {BASE_DIR}")
print(f"Reports directory: {REPORTS_DIR}")


In [None]:
# IMPORTS
import time
import pandas as pd
import os
import torch
from llama_index.llms.ollama import Ollama

# Optional imports (uncomment if needed)
# import openparse  # For PDF parsing
# from openai import AzureOpenAI  # For Azure OpenAI
# from groq import Groq  # For Groq API

print("Libraries imported successfully")


In [None]:
# LOAD DATA
# Both paths are used in this cell: the input CSV is read, and the output
# CSV's folder is created. Fail early with a clear message if either is missing.
for required_key in ("input_csv", "output_csv"):
    if required_key not in report_config:
        raise ValueError(f"{required_key} must be specified in report_config")

prompts = pd.read_csv(report_config["input_csv"])
print(f"Loaded {len(prompts)} prompts from {report_config['input_csv']}")
print(f"Columns: {list(prompts.columns)}")

# The processing step reads each prompt from the 'Prompts' column, so report
# its absence here rather than after models have been initialized.
if "Prompts" not in prompts.columns:
    raise ValueError(
        f"Input CSV {report_config['input_csv']} has no 'Prompts' column; "
        f"found columns: {list(prompts.columns)}"
    )

# Ensure output directory exists. os.path.dirname returns "" for a bare file
# name, and os.makedirs("") raises FileNotFoundError, so only create a folder
# when the path actually has a directory component.
output_dir = os.path.dirname(report_config["output_csv"])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


In [None]:
# MODEL PROCESSING
def _make_default_llm(model_name, config):
    """Build the LLM client selected by the flags in ``config`` (Ollama by default)."""
    if config.get("use_groq", False):
        # Groq initialization is not wired up yet; configure a Groq client
        # here (read the API key from the environment, never hard-code it).
        raise NotImplementedError("Groq configuration needed")
    if config.get("use_azure_openai", False):
        # Azure OpenAI initialization is not wired up yet; configure an
        # AzureOpenAI client here (endpoint, key and API version).
        raise NotImplementedError("Azure OpenAI configuration needed")
    return Ollama(model=model_name, request_timeout=config.get("request_timeout", 2000))


def _ensure_result_columns(results_df, text_column, time_column):
    """Make sure both result columns exist and the text column can hold strings."""
    if text_column not in results_df.columns:
        results_df[text_column] = None
    elif results_df[text_column].dtype != object:
        # A column loaded from CSV with no values yet is float64 (all NaN);
        # writing a string into it is an incompatible-dtype assignment.
        results_df[text_column] = results_df[text_column].astype(object)
    if time_column not in results_df.columns:
        results_df[time_column] = float("nan")


def process_with_models(prompts_df, models_dict, config, llm_factory=None):
    """
    Process prompts with specified models.

    For every model, each row whose result column is missing or empty is sent
    to the model; the response text and the elapsed seconds are stored in the
    returned frame, which is written to ``config["output_csv"]`` after every
    completed prompt so an interrupted run keeps its progress.

    Args:
        prompts_df: DataFrame with a 'Prompts' column. It may already contain
            result columns; rows with a non-null result are skipped.
            Index labels are assumed to be unique.
        models_dict: Mapping of model identifier -> result column name.
        config: Settings dict. Keys read here: "output_csv", and (by the
            default backend) "use_groq", "use_azure_openai", "request_timeout".
        llm_factory: Optional callable ``(model_name, config) -> llm`` where
            ``llm.complete(prompt)`` returns the response. Defaults to the
            flag-driven backend selection (Ollama unless a flag is set).

    Returns:
        A copy of ``prompts_df`` with, per model, a response column and a
        'Processing Time (<column>)' column. ``prompts_df`` is not modified.

    Raises:
        ValueError: If ``prompts_df`` has no 'Prompts' column.
        NotImplementedError: If the default backend is used with
            "use_groq" or "use_azure_openai" enabled.

    Any other exception raised while working through one model's prompts is
    printed, the rest of that model's prompts are skipped, and processing
    continues with the next model.
    """
    if 'Prompts' not in prompts_df.columns:
        raise ValueError(
            f"prompts_df must contain a 'Prompts' column; found columns: {list(prompts_df.columns)}"
        )

    if llm_factory is None:
        llm_factory = _make_default_llm

    results_df = prompts_df.copy()
    total_prompts = len(results_df)

    for model_name, column_name in models_dict.items():
        print(f"\n=== Processing with model: {model_name} ===")

        # Backend construction stays outside the try block so configuration
        # errors (e.g. NotImplementedError) propagate to the caller.
        llm = llm_factory(model_name, config)
        time_column = f'Processing Time ({column_name})'

        # Process each prompt
        try:
            for position, row_label in enumerate(results_df.index, start=1):
                # Read the current state from the frame itself. Rows yielded by
                # iterrows() reflect the frame as it was when iteration began,
                # so they lack a result column created during this loop.
                if column_name in results_df.columns and pd.notna(results_df.at[row_label, column_name]):
                    print(f"Skipping prompt {position} for model {model_name} (already processed)")
                    continue

                prompt = results_df.at[row_label, 'Prompts']
                print(f"Processing prompt {position}/{total_prompts} with {model_name}")

                start_time = time.perf_counter()
                response_text = str(llm.complete(prompt))
                processing_time = time.perf_counter() - start_time

                # Update DataFrame (columns are created only once there is a
                # result to store, so a model that never succeeds adds none).
                _ensure_result_columns(results_df, column_name, time_column)
                results_df.at[row_label, column_name] = response_text
                results_df.at[row_label, time_column] = processing_time

                # Save progress
                results_df.to_csv(config["output_csv"], index=False)

                print(f"Completed in {processing_time:.2f}s")

        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"Error with model {model_name}: {e}")
            continue

        finally:
            # Release torch's cached CUDA memory in this process, if CUDA is available
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        print(f"Finished processing with {model_name}")

    return results_df

# Run every configured model over the loaded prompts; report_config doubles as
# the settings dict (it carries the output path and backend flags).
results = process_with_models(
    prompts_df=prompts,
    models_dict=report_config["models"],
    config=report_config,
)
print(f"\n=== Processing Complete ===")
print(f"Results saved to: {report_config['output_csv']}")


In [None]:
# SUMMARY
# Recap of the run: which report, how many prompts, which models, where the
# output went.
print("\n=== Processing Summary ===")
print(f"Report: {report_config['report_name']}")
print(f"Total prompts: {len(results)}")
print(f"Models processed: {list(report_config['models'].keys())}")
print(f"Output file: {report_config['output_csv']}")

# Per-model completion: count the non-null entries in each model's result
# column; a model whose column is absent from the results is reported as such.
for model_name, column_name in report_config["models"].items():
    if column_name not in results.columns:
        print(f"  {model_name}: Column not found")
        continue
    completed = results[column_name].notna().sum()
    print(f"  {model_name}: {completed}/{len(results)} prompts completed")

print("\n=== Done ===")
