In [1]:
import pandas as pd
import json
import os
import traceback

from pprint import pprint
from src.utils.passport_processing import postprocess
from src.utils.results_utils import ResultsAgent

In [2]:
def _output_field(column, prefix='outputs.'):
    """Return the field name encoded in an ``outputs.<field>`` column label.

    Everything after the prefix is kept, so ``outputs.name.first`` maps to
    ``name.first`` instead of being cut at the second dot (which would make
    ``outputs.name.first`` and ``outputs.name.last`` collide on ``name``).
    """
    return column[len(prefix):]


def _is_missing(value):
    """Return True when ``value`` is a scalar missing marker (None/NaN/NA).

    List-like values are reported as present: ``pd.isna`` answers
    element-wise for them, which cannot be used as a single truth value.
    """
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _row_to_output_json(row, output_cols):
    """Serialise the non-missing ``outputs.*`` values of one row as JSON."""
    fields = {
        _output_field(col): row[col]
        for col in output_cols
        if not _is_missing(row[col])
    }
    return json.dumps(fields)


def _build_output_column(frame, output_cols):
    """Return one JSON string per row of ``frame`` (empty list for no rows)."""
    if len(frame) == 0:
        # DataFrame.apply(axis=1) has no rows to call the function on here,
        # so produce the (empty) column directly.
        return []
    return frame.apply(lambda row: _row_to_output_json(row, output_cols), axis=1)


def process_file(file_path, results_dir="processed_results/"):
    """Post-process an aggregated results CSV and save the processed copy.

    A path ending in ``_full_results.csv`` is redirected to the matching
    ``_results.csv`` file. Each row's ``output`` JSON is passed through
    ``postprocess`` and the returned key/value pairs are written to
    ``outputs.<key>`` columns; finally the ``output`` column is rebuilt from
    all non-missing ``outputs.*`` columns and the frame is written to
    ``<results_dir>/<project_name>_processed_results.csv``.

    Rows with an empty ``output`` value, and rows whose processing raises,
    are kept unchanged (the error and traceback are printed).

    Args:
        file_path: Path to a ``*_results.csv`` or ``*_full_results.csv`` file.
        results_dir: Directory for the processed CSV; created if missing.

    Returns:
        Tuple ``(output_file, country)`` where ``country`` is the part of the
        file name before the first ``' - '``.

    Raises:
        FileNotFoundError: If the (possibly redirected) file does not exist.
        ValueError: If the CSV has neither an ``output`` column nor any
            ``outputs.*`` columns.
    """
    full_suffix = '_full_results.csv'
    results_suffix = '_results.csv'

    # Handle both _full_results.csv and _results.csv files. Only the trailing
    # suffix is rewritten so directory names are never touched.
    if file_path.endswith(full_suffix):
        file_path = file_path[:-len(full_suffix)] + results_suffix
        print(f"Switching to aggregated results file: {os.path.basename(file_path)}")

    base_name = os.path.basename(file_path)
    if base_name.endswith(results_suffix):
        project_name = base_name[:-len(results_suffix)]
    else:
        project_name = base_name
    country = project_name.split(' - ')[0].strip()
    print(f"Detected country/dataset: {country}")

    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Read the CSV file
    df = pd.read_csv(file_path)
    total_rows = len(df)
    print(f"Read {total_rows} rows from {base_name}")

    # Without an 'output' column, rebuild it from the individual outputs.* columns
    if 'output' not in df.columns:
        print("Warning: 'output' column not found. Available columns:", df.columns.tolist())
        output_cols = [col for col in df.columns if col.startswith('outputs.')]
        if not output_cols:
            raise ValueError("No output data found in the CSV file")
        print(f"Found {len(output_cols)} output columns, reconstructing output JSON...")
        df['output'] = _build_output_column(df, output_cols)

    # Apply postprocessing to each row
    processed_rows = []

    for i, (_, row) in enumerate(df.iterrows()):
        try:
            raw_output = row['output']
            if pd.isna(raw_output) or raw_output == '':
                print(f"\nWarning: Empty output for row {i}, skipping postprocessing")
                processed_rows.append(row)
                continue

            # postprocess is the module-level import from
            # src.utils.passport_processing; no need to re-import per row.
            processed = postprocess(json.loads(raw_output))

            # Copy so the source frame is untouched, then overwrite/add the
            # outputs.* values returned by postprocess.
            new_row = row.copy()
            for key, value in processed.items():
                new_row[f'outputs.{key}'] = value
            processed_rows.append(new_row)

            # Show progress
            if (i + 1) % 10 == 0 or i == total_rows - 1:
                print(f"Processed {i + 1}/{total_rows} rows", end='\r')

        except Exception as e:
            # Best effort: report the failure and keep the original row.
            print(f"\nError processing row {i}: {e}")
            traceback.print_exc()
            processed_rows.append(row)

    print("\nPostprocessing completed")

    # Create new dataframe with processed data. With zero rows, reuse the
    # (empty) source frame so the CSV header is preserved.
    if processed_rows:
        processed_df = pd.DataFrame(processed_rows)
    else:
        processed_df = df.copy()

    # Update the output column with processed values
    final_output_cols = [
        col for col in processed_df.columns
        if isinstance(col, str) and col.startswith('outputs.')
    ]
    processed_df['output'] = _build_output_column(processed_df, final_output_cols)

    # Create results directory if it doesn't exist
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    # Save the processed results
    output_file = os.path.join(results_dir, f"{project_name}_processed_results.csv")
    processed_df.to_csv(output_file, index=False)

    return output_file, country

def upload_to_sheets(output_file, res_agent, country):
    """Push a processed results file through ``res_agent``, best effort.

    Sets ``res_agent.country`` to ``country`` and then calls
    ``res_agent.upload_results(output_file)``. Any exception raised by either
    step is reported on stdout together with its traceback and is not
    re-raised, so the function always returns ``None``.
    """
    try:
        # The agent reads the country from its own attribute, so set it first.
        res_agent.country = country
        res_agent.upload_results(output_file)
    except Exception as upload_error:
        print(f"Error during upload: {upload_error}")
        traceback.print_exc()


In [3]:
# Agent handed to upload_to_sheets further down. The cell output shows a
# consolidated parquet file being loaded at this point — presumably by the
# ResultsAgent constructor (TODO confirm in src.utils.results_utils).
res_agent = ResultsAgent()

Loading consolidated data from ./static/consolidated_data.parquet...
Loaded 155168 records from consolidated file.


In [4]:
# Aggregated results file to post-process; process_file derives the country
# ("Ethiopia") from the part of the file name before the first " - ".
file_path = "results/Ethiopia - gemini-2.5-pro - 765_results.csv"

# Writes the post-processed CSV and returns its path plus the detected country.
output_file, country = process_file(file_path)

# Upload errors are printed inside upload_to_sheets rather than raised.
upload_to_sheets(output_file, res_agent, country)

Detected country/dataset: Ethiopia
Read 198 rows from Ethiopia - gemini-2.5-pro - 765_results.csv


Processed 198/198 rows
Postprocessing completed
224
