In [None]:
# CONFIGURATION - Edit this section for each construction task
import os
import pandas as pd
import re

# Resolve the project base directory: two levels above this file when run as a
# script, or two levels above the current working directory otherwise.
# NOTE(review): in a notebook `__file__` is undefined, so the fallback climbs two
# levels from the working directory itself (not from a file inside it) -- confirm
# that this lands on the intended project root.
if '__file__' in globals():
    BASE_DIR = os.path.abspath(__file__)
else:
    BASE_DIR = os.path.abspath('.')
BASE_DIR = os.path.dirname(os.path.dirname(BASE_DIR))

# User-editable settings for the current construction task
construction_config = {
    # Which construction strategy to run
    # Options: DOMAIN_PARAGRAPH, RANDOM_PARAGRAPH, DOMAIN_SENTENCE
    "strategy": "DOMAIN_PARAGRAPH",

    # Fixed example (sentence + expected output) used by the random-paragraph strategy
    "random_example": {
        "sentence": (
            "**Germany Vallon VMH3**: In June 2023, CMAC's R&D teams additionally conducted another important study on the use and maintenance of the Germany Vallon VMH3 mine detector machine. "
            "The study focused on testing the detector head (60 mm) to detect and signalize metal under the ground."
        ),
        "output": (
            "isSupportedByOrganization(study, CMAC's R&D teams)\n"
            "isPartOf(Germany Vallon VMH3, mine detector machine)\n"
            "CausedBy(infrastructure damage, old wartime munitions)"
        ),
    },

    # Processing settings
    "debug_output": True,
    "save_intermediate": True,
}

# Input/output locations and a description for each built-in strategy
strategy_configs = {
    "DOMAIN_PARAGRAPH": {
        "input_annotations": os.path.join(BASE_DIR, "eval", "Annotations.csv"),
        "output_csv": os.path.join(BASE_DIR, "prompts", "domain_paragraph_prompts.csv"),
        "description": "Create domain-specific paragraph examples using ontology-based matching",
    },
    "RANDOM_PARAGRAPH": {
        "input_prompts": os.path.join(BASE_DIR, "prompts", "prompts_with_onto_demo.csv"),
        "output_csv": os.path.join(BASE_DIR, "prompts", "random_paragraph_prompts.csv"),
        "description": "Replace examples with random paragraph text",
    },
    "DOMAIN_SENTENCE": {
        "input_annotations": os.path.join(BASE_DIR, "eval", "Annotations.csv"),
        "output_csv": os.path.join(BASE_DIR, "prompts", "domain_sentence_prompts.csv"),
        "description": "Create domain-specific sentence-level examples",
    },
}

# Merge the selected strategy's predefined settings into the task configuration;
# an unrecognised strategy name leaves the configuration untouched.
try:
    config = strategy_configs[construction_config["strategy"]]
except KeyError:
    pass
else:
    construction_config.update(config)

# Echo the effective configuration
print("\n".join([
    f"Strategy: {construction_config['strategy']}",
    f"Description: {construction_config.get('description', 'Custom strategy')}",
    f"Input Annotations: {construction_config.get('input_annotations', 'Not specified')}",
    f"Input Prompts: {construction_config.get('input_prompts', 'Not specified')}",
    f"Output CSV: {construction_config.get('output_csv', 'Not specified')}",
    f"Base directory: {BASE_DIR}",
]))


In [None]:
# UTILITY FUNCTIONS
def extract_entity_relation_types(prompt):
    """Extract the ``entity_types`` and ``relation_types`` values from a prompt.

    Each value is the text that follows the ``entity_types:`` /
    ``relation_types:`` label up to the end of that line, with surrounding
    whitespace stripped.

    Args:
        prompt: Prompt text. Non-string values (``None``, or the NaN pandas
            uses for empty CSV cells) are treated as containing no labels.

    Returns:
        A ``(entity_types, relation_types)`` tuple. Each element is a string,
        or ``None`` when the corresponding label is absent.
    """
    if not isinstance(prompt, str):
        return None, None

    # Without re.DOTALL, `.` stops at a newline, so `(.*)` captures the rest of
    # the label's line whether or not a trailing newline follows it.
    entity_types_match = re.search(r'entity_types:(.*)', prompt)
    relation_types_match = re.search(r'relation_types:(.*)', prompt)

    entity_types = entity_types_match.group(1).strip() if entity_types_match else None
    relation_types = relation_types_match.group(1).strip() if relation_types_match else None

    return entity_types, relation_types

def extract_sentence_from_prompt(prompt, sentence_number=2):
    """Return the text that follows the Nth ``Sentence:`` marker in a prompt.

    The prompt is cut at every ``Sentence:`` marker; piece 0 is whatever
    precedes the first marker, so ``sentence_number=1`` selects the text after
    the first marker and the default of 2 selects the text after the second.
    The selected piece is truncated at the next ``Output:`` marker and
    stripped of surrounding whitespace.

    Returns ``None`` when the prompt has no piece at that position.
    """
    pieces = re.split(r'Sentence:', prompt)

    if sentence_number >= len(pieces):
        return None

    selected = pieces[sentence_number].strip()
    # Keep only what comes before the next section marker
    before_marker = re.split(r'(Output:|Sentence:)', selected)[0]
    return before_marker.strip()

def get_annotation_for_prompt(df, prompt):
    """Look up the ``Annotation`` of the first row whose ``Prompts`` equals *prompt*.

    Returns ``None`` when no row in *df* carries that prompt text.
    """
    matches = df.loc[df['Prompts'] == prompt]
    if matches.empty:
        return None

    first_match = matches.iloc[0]
    return first_match['Annotation']

def replace_example_in_prompt(prompt, new_example_text):
    """Replace the example section in a prompt with new text.

    The example section is everything from ``Example:`` through the end of
    the line that contains the next ``Output:``. Every such section in the
    prompt is replaced.

    Args:
        prompt: The prompt text to rewrite.
        new_example_text: Text inserted verbatim in place of each example
            section.

    Returns:
        The prompt with its example section(s) replaced.
    """
    # A callable replacement makes re.sub insert the text literally. A plain
    # string would be parsed as a template, so backslashes in a sentence or
    # annotation (e.g. "\d", "\1", a literal "\n") would be rewritten or raise
    # re.error.
    return re.sub(
        r'Example:.*?Output:.*?(?=\n|$)',
        lambda _match: new_example_text,
        prompt,
        flags=re.DOTALL,
    )

def create_example_text(sentence, output):
    """Build the three-line example block: header, sentence line, output line.

    The result ends with a newline.
    """
    template = "Example: \nSentence: {}\nOutput: {}\n"
    return template.format(sentence, output)

# Confirmation message emitted once the utility function definitions above have executed
print("Utility functions loaded successfully")


In [None]:
# DOMAIN PARAGRAPH STRATEGY
def construct_domain_paragraph_prompts(config):
    """
    Create domain-specific paragraph examples using ontology-based matching.
    This replicates the functionality of Construct_Prompts_with_domain_Paragraph.ipynb

    For every row of the annotations CSV, the rows that share the same
    ``entity_types`` and ``relation_types`` strings are ranked by annotation
    length. The example section of the row's prompt is then replaced with the
    paragraph and annotation of the shortest-annotation match.

    Args:
        config: Mapping with ``input_annotations`` (path to a CSV that has
            ``Prompts`` and ``Annotation`` columns) and, optionally,
            ``debug_output`` (print per-row matching details).

    Returns:
        The loaded DataFrame with these columns appended, in order:
        ``entity_types``, ``relation_types``, ``Shortest_Matched_Prompt``,
        ``Second_shortest_Matched_Prompt``, ``Paragraph``,
        ``Paragraph_shortest``, ``Paragraph_second_shortest``,
        ``Annotation_shortest``, ``Annotation_second_shortest`` and
        ``prompts_with_ontology_demo``.
    """
    print("Starting domain paragraph construction...")

    # Load annotations
    df = pd.read_csv(config['input_annotations'])
    print(f"Loaded {len(df)} annotations from {config['input_annotations']}")

    # Extract entity and relation types. Empty CSV cells arrive as NaN, so only
    # real strings are parsed; building plain lists also works for an empty
    # file, where unpacking zip(*...) would fail.
    type_pairs = [
        extract_entity_relation_types(prompt) if isinstance(prompt, str) else (None, None)
        for prompt in df['Prompts']
    ]
    df['entity_types'] = [entity_types for entity_types, _ in type_pairs]
    df['relation_types'] = [relation_types for _, relation_types in type_pairs]

    debug = config.get('debug_output', False)

    # Rows with identical type strings share the same ranking, so compute it
    # once per distinct (entity_types, relation_types) pair.
    ranking_by_types = {}
    shortest_prompts, shortest_annotations = [], []
    second_prompts, second_annotations = [], []

    for index, entity_types, relation_types in zip(df.index, df['entity_types'], df['relation_types']):
        types_key = (entity_types, relation_types)
        if types_key not in ranking_by_types:
            ranking_by_types[types_key] = _rank_matches_by_annotation_length(
                df, entity_types, relation_types)
        match_count, top_matches = ranking_by_types[types_key]

        if debug:
            print(f"\nRow {index}: Found {match_count} matching prompts")
            print(f"Entity types: {entity_types}")
            print(f"Relation types: {relation_types}")

        # Pad so that a missing first/second match becomes (None, None)
        first, second = (top_matches + [(None, None), (None, None)])[:2]
        shortest_prompts.append(first[0])
        shortest_annotations.append(first[1])
        second_prompts.append(second[0])
        second_annotations.append(second[1])

    def paragraph_of(prompt):
        # Text after the second "Sentence:" marker, or None for a missing prompt
        return extract_sentence_from_prompt(prompt) if pd.notna(prompt) else None

    # Add matched prompts to dataframe
    df['Shortest_Matched_Prompt'] = shortest_prompts
    df['Second_shortest_Matched_Prompt'] = second_prompts

    # Extract paragraphs from prompts
    df['Paragraph'] = [paragraph_of(prompt) for prompt in df['Prompts']]
    df['Paragraph_shortest'] = [paragraph_of(prompt) for prompt in shortest_prompts]
    df['Paragraph_second_shortest'] = [paragraph_of(prompt) for prompt in second_prompts]

    # Annotations come straight from the matched rows, so a prompt whose text
    # occurs more than once is still paired with the annotation it was ranked by.
    df['Annotation_shortest'] = shortest_annotations
    df['Annotation_second_shortest'] = second_annotations

    # Create ontology-based prompts; rows without a usable match keep their prompt.
    # NOTE(review): a row always matches itself, so when it has the shortest
    # annotation in its group the example shown is its own paragraph and
    # annotation. The second-shortest columns are computed but not used here --
    # confirm whether they were meant to cover that case.
    df['prompts_with_ontology_demo'] = [
        replace_example_in_prompt(prompt, create_example_text(paragraph, annotation))
        if pd.notna(paragraph) and pd.notna(annotation)
        else prompt
        for prompt, paragraph, annotation in zip(
            df['Prompts'], df['Paragraph_shortest'], df['Annotation_shortest'])
    ]

    print(f"Domain paragraph construction completed. Generated {len(df)} prompts.")
    return df


def _rank_matches_by_annotation_length(df, entity_types, relation_types):
    """Return ``(match_count, top_two)`` for rows sharing the given type strings.

    ``top_two`` holds up to two ``(prompt, annotation)`` pairs, shortest
    annotation first. Ties keep file order because the sort is stable.
    """
    # A missing type string never matches anything
    if entity_types is None or relation_types is None:
        return 0, []

    same_types = df[(df['entity_types'] == entity_types) &
                    (df['relation_types'] == relation_types)]
    if same_types.empty:
        return 0, []

    ranked = same_types.sort_values(
        by='Annotation', key=lambda column: column.str.len(), kind='stable')
    top_two = list(zip(ranked['Prompts'].iloc[:2], ranked['Annotation'].iloc[:2]))
    return len(same_types), top_two

# Confirmation message emitted once the domain paragraph strategy definition above has executed
print("Domain paragraph strategy function loaded")


In [None]:
# RANDOM PARAGRAPH STRATEGY
def construct_random_paragraph_prompts(config):
    """
    Swap the example section of every prompt for one fixed, configured example.
    This replicates the functionality of Construct_Prompts_with_random_Paragraph.ipynb

    Reads the CSV at ``config['input_prompts']`` (which must provide the
    ``Prompts``, ``prompts_with_ontology_demo`` and ``Annotation`` columns) and
    builds the replacement example from ``config['random_example']``.

    Returns a new DataFrame with the columns ``prompts_with_random_sentence``
    (the original ``Prompts``), ``prompts_with_ontology_paragraph`` (the
    original ``prompts_with_ontology_demo``), ``prompt_with_random_paragraph``
    and ``Annotation``.
    """
    print("Starting random paragraph construction...")

    # Read the previously generated prompts
    source_path = config['input_prompts']
    df = pd.read_csv(source_path)
    print(f"Loaded {len(df)} prompts from {source_path}")

    # The same example block is inserted into every prompt
    example = config['random_example']
    new_example_text = create_example_text(example['sentence'], example['output'])

    def with_random_example(prompt):
        return replace_example_in_prompt(prompt, new_example_text)

    df['prompt_with_random_paragraph'] = df['Prompts'].apply(with_random_example)

    # Select the output columns and give them their final names
    selected_columns = ['Prompts', 'prompts_with_ontology_demo', 'prompt_with_random_paragraph', 'Annotation']
    final_names = {
        'prompts_with_ontology_demo': 'prompts_with_ontology_paragraph',
        'Prompts': 'prompts_with_random_sentence',
    }
    output_df = df[selected_columns].rename(columns=final_names)

    print(f"Random paragraph construction completed. Generated {len(output_df)} prompts.")
    return output_df

# Confirmation message emitted once the random paragraph strategy definition above has executed
print("Random paragraph strategy function loaded")


In [None]:
# DOMAIN SENTENCE STRATEGY
def construct_domain_sentence_prompts(config):
    """
    Create domain-specific sentence-level examples.
    This extends the minimal functionality from Construct_Prompts_with_domain_Sentence.ipynb

    For every row, the text after the first ``Sentence:`` marker of the prompt
    is paired with the row's ``Annotation`` and written back as the prompt's
    example section.

    Args:
        config: Mapping with ``input_annotations``, the path to a CSV that has
            ``Prompts`` and ``Annotation`` columns.

    Returns:
        The loaded DataFrame with ``entity_types``, ``relation_types``,
        ``sentence`` and ``prompts_with_sentence_demo`` columns appended.
    """
    print("Starting domain sentence construction...")

    # Load annotations
    df = pd.read_csv(config['input_annotations'])
    print(f"Loaded {len(df)} annotations from {config['input_annotations']}")

    # Extract entity and relation types. Empty CSV cells arrive as NaN, so only
    # real strings are parsed; building plain lists also works for an empty
    # file, where unpacking zip(*...) would fail.
    type_pairs = [
        extract_entity_relation_types(prompt) if isinstance(prompt, str) else (None, None)
        for prompt in df['Prompts']
    ]
    df['entity_types'] = [entity_types for entity_types, _ in type_pairs]
    df['relation_types'] = [relation_types for _, relation_types in type_pairs]

    # Extract first sentence (instead of paragraph) from each prompt
    df['sentence'] = [
        extract_sentence_from_prompt(prompt, 1) if pd.notna(prompt) else None
        for prompt in df['Prompts']
    ]

    # Create sentence-based prompts using first sentence as example; rows that
    # lack a sentence or an annotation keep their original prompt.
    # NOTE(review): the first "Sentence:" piece is paired with this row's own
    # Annotation -- confirm the annotation really describes that sentence.
    df['prompts_with_sentence_demo'] = [
        replace_example_in_prompt(prompt, create_example_text(sentence, annotation))
        if pd.notna(sentence) and pd.notna(annotation)
        else prompt
        for prompt, sentence, annotation in zip(df['Prompts'], df['sentence'], df['Annotation'])
    ]

    print(f"Domain sentence construction completed. Generated {len(df)} prompts.")
    return df

# Confirmation message emitted once the domain sentence strategy definition above has executed
print("Domain sentence strategy function loaded")


In [None]:
# MAIN EXECUTION ENGINE
def run_prompt_construction(config):
    """Main function to execute the selected prompt construction strategy.

    Args:
        config: Mapping with ``strategy`` (one of ``DOMAIN_PARAGRAPH``,
            ``RANDOM_PARAGRAPH``, ``DOMAIN_SENTENCE``) plus whatever the chosen
            strategy reads. Optional keys: ``output_csv`` (where to write the
            result) and ``save_intermediate`` (also write a copy next to
            ``output_csv`` with ``_intermediate`` inserted before the file
            extension).

    Returns:
        The DataFrame produced by the selected strategy.

    Raises:
        ValueError: If ``strategy`` is not one of the known names.
    """
    strategy = config['strategy']
    print(f"\n{'='*60}")
    print(f"EXECUTING STRATEGY: {strategy}")
    print(f"{'='*60}")

    # Execute the appropriate strategy
    if strategy == "DOMAIN_PARAGRAPH":
        result_df = construct_domain_paragraph_prompts(config)
    elif strategy == "RANDOM_PARAGRAPH":
        result_df = construct_random_paragraph_prompts(config)
    elif strategy == "DOMAIN_SENTENCE":
        result_df = construct_domain_sentence_prompts(config)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    output_csv = config.get('output_csv')
    save_intermediate = config.get('save_intermediate', False)

    if output_csv is None:
        # Nothing to derive an intermediate path from, so skip it rather than fail
        if save_intermediate:
            print("Intermediate results not saved: no output_csv configured")
        return result_df

    # Save results
    result_df.to_csv(output_csv, index=False)
    print(f"\nResults saved to: {output_csv}")

    # Save intermediate results if requested
    if save_intermediate:
        # Split off the real extension so that only the file name changes, even
        # when a directory name contains ".csv" or the file uses another suffix.
        stem, extension = os.path.splitext(output_csv)
        intermediate_path = f"{stem}_intermediate{extension}"
        result_df.to_csv(intermediate_path, index=False)
        print(f"Intermediate results saved to: {intermediate_path}")

    return result_df

# Confirmation message emitted once the execution engine definition above has executed
print("Main execution engine loaded")


In [None]:
# EXECUTE PROMPT CONSTRUCTION
try:
    # Build the prompts with whichever strategy the configuration selects
    result_df = run_prompt_construction(construction_config)

    # Report what was produced
    print(
        f"\n{'='*60}",
        "CONSTRUCTION SUMMARY",
        f"{'='*60}",
        f"Strategy: {construction_config['strategy']}",
        f"Total prompts generated: {len(result_df)}",
        f"Output columns: {list(result_df.columns)}",
        sep="\n",
    )

    # Optional peek at the first rows of the result
    if construction_config.get('debug_output', False):
        print("\nFirst 3 rows preview:")
        print(result_df.head(3))

    print("\nConstruction completed successfully!")

except Exception as exc:
    # Report the failure with its traceback instead of propagating it
    print(f"Error during construction: {str(exc)}")
    import traceback
    traceback.print_exc()
