In [None]:
# CONFIGURATION - Edit this section for each processing task
import os
import pandas as pd
import re

# Locate the project directories.
# When __file__ is defined, BASE_DIR is the parent of the directory holding
# this file; otherwise it is two levels above the current working directory.
BASE_DIR = os.path.abspath(globals().get('__file__', '.'))
BASE_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
TRIPLE_DIR = os.path.join(BASE_DIR, "Triple_preprocessing")

# User-editable run settings.
processing_config = dict(
    # Choose processing target
    # Options: RANDOM_150, ANNUAL_2023, ART7_2022, CAMBODIA_CMR_2023, CAMBODIA_MINES_2023, IWP_2023, IWP_2024
    target="RANDOM_150",

    # Processing settings
    save_deleted_rows=True,
    save_empty_report=True,
    create_output_folder=True,
)

# Predefined target configurations.
# RANDOM_150 has its own input file and column names; every other preset
# reads All_triples_A1.csv and differs only in output folder and description,
# so those presets are generated from a (name, folder, description) table.
target_configs = {
    "RANDOM_150": {
        "input_csv": os.path.join(TRIPLE_DIR, "compute_metrics.csv"),
        "output_folder": TRIPLE_DIR,
        "output_prefix": "cleaned_output_random_150",
        "columns_to_clean": ["mistral:instruct-7b", "vicuna-7b", "llama3-8b", "GPT-4o", "llama3-70b"],
        "description": "Process random 150 sample data",
    },
    **{
        preset_name: {
            "input_csv": os.path.join(TRIPLE_DIR, "All_triples_A1.csv"),
            "output_folder": os.path.join(TRIPLE_DIR, folder_name),
            "output_prefix": "cleaned_output",
            # A fresh list per preset so the presets never share a list object.
            "columns_to_clean": ["mistral-instruct", "vicuna", "llama3", "GPT-4o", "llama3-70b"],
            "description": preset_description,
        }
        for preset_name, folder_name, preset_description in (
            ("ANNUAL_2023", "Annual_report_2023", "Process Annual Report 2023 data"),
            ("ART7_2022", "2023-Cambodia-Art7Report-for2022", "Process Art7 Report 2022 data"),
            ("CAMBODIA_CMR_2023", "CAMBODIA_CLEARING_CMR_2023", "Process Cambodia CMR 2023 data"),
            ("CAMBODIA_MINES_2023", "Cambodia_Clearing_the_Mines_2023", "Process Cambodia Mines 2023 data"),
            ("IWP_2023", "IWP-2023", "Process IWP 2023 data"),
            ("IWP_2024", "IWP-2024", "Process IWP 2024 data"),
        )
    },
}

# Merge the chosen preset into the run settings. A target that is not a known
# preset leaves processing_config untouched (custom processing).
if processing_config["target"] in target_configs:
    config = target_configs[processing_config["target"]]
    processing_config.update(config)

# Echo the effective configuration, one item per line.
print(
    f"Target: {processing_config['target']}",
    f"Description: {processing_config.get('description', 'Custom processing')}",
    f"Input CSV: {processing_config.get('input_csv', 'Not specified')}",
    f"Output folder: {processing_config.get('output_folder', 'Not specified')}",
    f"Columns to clean: {processing_config.get('columns_to_clean', [])}",
    f"Base directory: {BASE_DIR}",
    f"Triple directory: {TRIPLE_DIR}",
    sep="\n",
)


In [None]:
# UTILITY FUNCTIONS
def extract_triples(text):
    """
    Keep only the lines of ``text`` that look like triples and tidy them up.

    A line is kept when, after stripping surrounding whitespace, it ends with
    ``word(...)`` (at least one word character directly followed by a
    parenthesised part that closes the line). For every kept line, leading
    characters that are not letters (list numbering, bullets, punctuation,
    whitespace, digits, underscores) are removed. If the first kept line starts
    with ``Output:``, that prefix is removed as well, together with any
    numbering that followed it.

    Args:
        text: Raw cell value. Missing values (``None``/NaN) and the empty
            string yield ``''``; anything else is converted with ``str()``.

    Returns:
        str: The cleaned triples joined by newlines, or ``''`` when no line
        qualifies.
    """
    if pd.isna(text) or text == '':
        return ''

    # A triple line must end in "name(...)".
    triple_pattern = re.compile(r'\w+\(.*\)$')
    # Leading run of non-letters. Expressed as "non-word, digit or underscore"
    # rather than [^A-Za-z] so that predicates starting with a non-ASCII
    # letter are not truncated (\w is Unicode-aware for str patterns).
    leading_noise = re.compile(r'^[\W\d_]+')
    output_prefix = re.compile(r'^Output:\s*')

    triples = []
    for raw_line in str(text).strip().split("\n"):
        candidate = raw_line.strip()
        if triple_pattern.search(candidate):
            triples.append(leading_noise.sub('', candidate))

    # Only the first kept line is checked for an "Output:" prefix.
    if triples and triples[0].startswith("Output:"):
        without_prefix = output_prefix.sub('', triples[0])
        # Numbering may sit between the prefix and the triple
        # ("Output: 1. rel(a, b)"), so strip leading noise once more.
        triples[0] = leading_noise.sub('', without_prefix)

    return "\n".join(triples)

def create_output_paths(config):
    """
    Build the output file locations for a processing run.

    Reads ``config['output_folder']`` and ``config['output_prefix']``. Unless
    ``config['create_output_folder']`` is present and falsy, the output folder
    is created (including parents) if it does not already exist.

    Returns:
        dict: Paths inside the output folder under the keys ``'cleaned_csv'``
        (``<output_prefix>.csv``), ``'deleted_csv'`` (``deleted_rows.csv``) and
        ``'empty_report'`` (``empty_cells_report.txt``).
    """
    folder = config['output_folder']
    prefix = config['output_prefix']

    if config.get('create_output_folder', True):
        os.makedirs(folder, exist_ok=True)

    file_names = (
        ('cleaned_csv', f"{prefix}.csv"),
        ('deleted_csv', "deleted_rows.csv"),
        ('empty_report', "empty_cells_report.txt"),
    )
    return {key: os.path.join(folder, file_name) for key, file_name in file_names}

def process_triples_data(config):
    """
    Clean the triple columns of a CSV file and write the results to disk.

    The function loads ``config['input_csv']``, runs ``extract_triples`` over
    every cell of the requested columns, counts the cells that end up empty,
    and then drops every row in which at least one cleaned column is empty.

    Args:
        config (dict): Run settings. Required keys: ``'target'`` (used for the
            banner), ``'input_csv'``, ``'columns_to_clean'``,
            ``'output_folder'`` and ``'output_prefix'``. Optional keys, all
            defaulting to ``True``: ``'save_empty_report'``,
            ``'save_deleted_rows'`` and ``'create_output_folder'`` (the last
            one is consumed by ``create_output_paths``).

    Returns:
        tuple: ``(df_cleaned, empty_cells_count)`` where ``df_cleaned`` is the
        DataFrame of surviving rows and ``empty_cells_count`` maps each column
        that was actually cleaned to its number of empty cells.

    Side effects:
        Always writes the cleaned CSV; optionally writes the empty-cells report
        and the CSV of dropped rows. Progress is printed to stdout.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"PROCESSING: {config['target']}")
    print(f"{banner}")

    # Load CSV file
    print(f"Loading data from: {config['input_csv']}")
    df = pd.read_csv(config['input_csv'])
    print(f"Loaded {len(df)} rows")

    # Get columns to clean
    columns_to_clean = config['columns_to_clean']
    print(f"Processing columns: {columns_to_clean}")

    # Columns absent from the data are reported and skipped rather than
    # treated as an error.
    missing_columns = [col for col in columns_to_clean if col not in df.columns]
    if missing_columns:
        print(f"WARNING: Missing columns in data: {missing_columns}")
        columns_to_clean = [col for col in columns_to_clean if col in df.columns]
        print(f"Processing available columns: {columns_to_clean}")

    # Apply the extraction function to each cell in the specified columns
    print("Extracting and cleaning triples...")
    for column in columns_to_clean:
        df[column] = df[column].apply(extract_triples)

    # Count empty cells after cleaning
    empty_cells_count = {
        column: (df[column] == '').sum() for column in columns_to_clean
    }

    # Create output paths
    paths = create_output_paths(config)

    # Save empty cells report. The encoding is pinned to UTF-8 so the report
    # does not depend on the platform's locale (column names are written into
    # it) and matches the UTF-8 CSV files written below.
    if config.get('save_empty_report', True):
        with open(paths['empty_report'], 'w', encoding='utf-8') as report_file:
            for column, count in empty_cells_count.items():
                line = f"Column '{column}' has {count} empty cells after cleaning.\n"
                report_file.write(line)
                print(line.strip())
        print(f"Empty cells report saved to: {paths['empty_report']}")

    # True for every row that has an empty value in any cleaned column;
    # computed once and reused for both the "deleted" and the "kept" subsets.
    has_empty_cell = (df[columns_to_clean] == '').any(axis=1)

    # Identify and save rows to be deleted
    if config.get('save_deleted_rows', True):
        rows_to_delete = df[has_empty_cell]
        rows_to_delete.to_csv(paths['deleted_csv'], index=False)
        print(f"Deleted {len(rows_to_delete)} rows with empty cells")
        print(f"Deleted rows saved to: {paths['deleted_csv']}")

    # Remove rows where any of the specified columns are empty after cleaning
    df_cleaned = df[~has_empty_cell]

    # Save the cleaned DataFrame
    df_cleaned.to_csv(paths['cleaned_csv'], index=False)
    print(f"Cleaned data ({len(df_cleaned)} rows) saved to: {paths['cleaned_csv']}")

    return df_cleaned, empty_cells_count

# Confirmation printed when this cell has finished defining the helpers above.
print("Utility functions loaded successfully")


In [None]:
# EXECUTE TRIPLE PROCESSING
try:
    # Run the processing
    result_df, empty_counts = process_triples_data(processing_config)

    # Display summary statistics
    print(f"\n{'='*60}")
    print("PROCESSING SUMMARY")
    print(f"{'='*60}")
    print(f"Target: {processing_config['target']}")
    print(f"Final cleaned rows: {len(result_df)}")
    # empty_counts holds one entry per column that was actually cleaned, so
    # its length stays correct even when configured columns were missing from
    # the data and skipped by process_triples_data.
    print(f"Columns processed: {len(empty_counts)}")

    print("\nEmpty cells by column:")
    for column, count in empty_counts.items():
        print(f"  {column}: {count} empty cells")

    print("\nProcessing completed successfully!")

except Exception as e:
    # Report the failure with a full traceback instead of aborting the run.
    print(f"Error during processing: {str(e)}")
    import traceback
    traceback.print_exc()
