In [1]:
import pandas as pd
import json
import os
import traceback

# Import postprocessing functions
from src.utils.passport_processing import postprocess
from src.utils.results_utils import ResultsAgent

In [3]:
data = {
 "number": "P3704125B",
 "country": "PHL",
 "name": "JUDITH RICEZA",
 "surname": "GASPAR",
 "middle name": "",
 "gender": "F",
 "place of birth": "SINILOAN LAGUNA",
 "birth date": "27 JUN 1992",
 "issue date": "30 OCT 2019",
 "expiry date": "29 OCT 2029",
 "mother name": "",
 "father name": "",
 "spouse name": "",
 "place of issue": "DFA MANILA",
 "country of issue": "PHILIPPINES",
 "mrzLine1": "P<PHLGASPAR<<JUDITH<RICEZA<<<<<<<<<<<<<<<<<<<",
 "mrzLine2": "P3704125B4PHL9206274F2910297<<<<<<<<<<<<<<<08"
}

json_data = json.dumps(data)

out = postprocess(json_data)

print(out)

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [2]:
def process_file(file_path):
    project_name = os.path.basename(file_path).replace('_results.csv', '')
    country = project_name.split(' - ')[0].strip()
    print(f"Detected country/dataset: {country}")
    
    # Read the CSV file
    df = pd.read_csv(file_path)
    print(f"Read {len(df)} rows from {os.path.basename(file_path)}")
    
    # Apply postprocessing to each row
    processed_rows = []
    
    for i, (_, row) in enumerate(df.iterrows()):
        try:
            # Extract output data
            if 'output' in row:
                output_dict = json.loads(row['output'])
            else:
                # Create output dict from output.* columns
                output_dict = {}
                for col in row.index:
                    if col.startswith('outputs.'):
                        field_name = col.replace('outputs.', '')
                        output_dict[field_name] = row[col]
            
            # Apply postprocessing
            from src.utils.passport_processing import postprocess
            processed = postprocess(output_dict)
            
            # Create new row with processed values
            new_row = row.copy()
            
            # Update outputs with processed values
            for key, value in processed.items():
                col_name = f'outputs.{key}'
                new_row[col_name] = value
            
            processed_rows.append(new_row)
            
            # Show progress
            if (i + 1) % 10 == 0 or i == len(df) - 1:
                print(f"Processed {i + 1}/{len(df)} rows", end='\r')
        
        except Exception as e:
            print(f"\nError processing row {i}: {e}")
            processed_rows.append(row)  # Keep original row on error
    
    print("\nPostprocessing completed")
    
    # Create new dataframe with processed data
    processed_df = pd.DataFrame(processed_rows)
    
    # Update the output column with processed values
    processed_df['output'] = processed_df.apply(
        lambda row: json.dumps({key.split('.')[1]: row[key] for key in row.index if key.startswith("outputs.")}), 
        axis=1
    )
    
    # Save the processed results
    results_dir = "processed_results/"
    output_file = f"{results_dir}{project_name}_processed_results.csv"
    processed_df.to_csv(output_file, index=False)
    
    return output_file, country

def upload_to_sheets(output_file, res_agent, country):
    """Upload processed results to Google Sheets."""
    try:
        res_agent.country = country
        res_agent.upload_results(output_file)

    except Exception as e:
        print(f"Error during upload: {e}")
        traceback.print_exc()


In [3]:
res_agent = ResultsAgent()

In [7]:
file_path = "results/Kenya - gemini-2.5-pro - 347_results.csv"

output_file, country = process_file(file_path)

upload_to_sheets(output_file, res_agent, country)

Detected country/dataset: Kenya
Read 135 rows from Kenya - gemini-2.5-pro - 347_results.csv
Processed 135/135 rows
Postprocessing completed
142
