In [12]:
import os
import pandas as pd

# Folder holding the synthetic sentence CSVs that will be checked
input_directory = "output"
# Folder that receives the validation report
output_directory = "output/validation_issues"

# Create the report folder up front; a pre-existing folder is not an error
os.makedirs(name=output_directory, exist_ok=True)

# Columns every synthetic-sentence file must provide; each one is checked for the target term
_REQUIRED_COLUMNS = ('baseline', 'positive_variation', 'negative_variation')


def _collect_problematic_rows(df, target_term, epoch):
    """Return the rows of *df* in which a required column lacks *target_term*.

    Each row is reported once, tagged with the first required column (in
    ``_REQUIRED_COLUMNS`` order) that fails the check. Three columns are added
    to the returned frame: ``epoch``, ``row_number`` (the row's index in *df*)
    and ``problem_column``. An empty DataFrame is returned when every row passes.
    """
    seen_rows = set()  # Index labels already reported for an earlier column
    pieces = []

    for column in _REQUIRED_COLUMNS:
        values = df[column]
        # regex=False: the term comes from a file name and must be matched
        # literally (a term such as "c++" is not a valid regular expression).
        # astype(str) keeps the .str accessor usable on columns pandas parsed as
        # non-string (e.g. entirely empty); missing cells never count as a match.
        has_term = values.notna() & values.astype(str).str.contains(
            target_term, case=False, na=False, regex=False
        )

        missing = df[~has_term]
        missing = missing[~missing.index.isin(seen_rows)].copy()
        if missing.empty:
            continue

        # Add epoch and row number for identification
        missing['epoch'] = epoch
        missing['row_number'] = missing.index
        missing['problem_column'] = column

        seen_rows.update(missing.index)
        pieces.append(missing)

    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces)


def validate_target_in_files(directory, output_dir=None):
    """Check that every sentence in each CSV of *directory* mentions its target term.

    Files are expected to be named ``<term>_<epoch>...csv``: the text before the
    first underscore is the target term, and the text between the first
    underscore and the next ``_`` or ``.`` is the epoch. For each file, rows
    whose 'baseline', 'positive_variation' or 'negative_variation' cell does not
    contain the term (case-insensitive, literal match) are collected.

    When any such rows exist they are de-duplicated and written to
    ``validation_issues.csv`` inside *output_dir*, and a per-file count is
    printed; otherwise a "no issues" message is printed. Files that cannot be
    read or processed are reported on stdout and skipped.

    Args:
        directory: Folder containing the CSV files to validate (not searched
            recursively).
        output_dir: Folder that receives ``validation_issues.csv``. Defaults to
            the module-level ``output_directory`` setting. Created if needed.

    Returns:
        None.
    """
    issues_summary = []  # (file name, number of problematic rows) per affected file
    per_file_rows = []   # One DataFrame of problematic rows per affected file

    # Sorted so the report and the printed summary have a stable order
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".csv"):
            continue

        # Extract the target term and epoch from the filename
        target_term, separator, remainder = file_name.partition("_")
        if not separator:
            print(f"File {file_name} does not follow the '<term>_<epoch>' naming pattern.")
            continue
        epoch = remainder.split("_")[0].split(".")[0]

        file_path = os.path.join(directory, file_name)
        try:
            df = pd.read_csv(file_path)

            if not set(_REQUIRED_COLUMNS).issubset(df.columns):
                print(f"File {file_name} is missing required columns.")
                continue

            problematic = _collect_problematic_rows(df, target_term, epoch)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run
            print(f"Error processing file {file_name}: {e}")
            continue

        if not problematic.empty:
            per_file_rows.append(problematic)
            issues_summary.append((file_name, len(problematic)))

    if not per_file_rows:
        print("No validation issues found.")
        return

    # Remove duplicates across the entire combined DataFrame
    combined = pd.concat(per_file_rows).drop_duplicates(
        subset=['baseline', 'positive_variation', 'negative_variation', 'row_number']
    )

    if output_dir is None:
        # Fall back to the notebook-level setting
        output_dir = output_directory
    os.makedirs(output_dir, exist_ok=True)

    # Save the unique problematic rows to a new file
    output_file = os.path.join(output_dir, "validation_issues.csv")
    combined.to_csv(output_file, index=False)

    # Print summary of issues
    print("The following files have issues:")
    for file, count in issues_summary:
        print(f"File: {file}, Number of problematic rows: {count}")

# Run the validation over the CSV files in the input directory; the function
# prints a per-file summary and writes any flagged rows to validation_issues.csv
validate_target_in_files(input_directory)

The following files have issues:
File: abuse_1975-1979.synthetic_sentences.csv, Number of problematic rows: 3
File: abuse_1980-1984.synthetic_sentences.csv, Number of problematic rows: 2
File: abuse_1985-1989.synthetic_sentences.csv, Number of problematic rows: 5
File: abuse_1990-1994.synthetic_sentences.csv, Number of problematic rows: 1
File: abuse_1995-1999.synthetic_sentences.csv, Number of problematic rows: 1
File: abuse_2000-2004.synthetic_sentences.csv, Number of problematic rows: 3
File: abuse_2005-2009.synthetic_sentences.csv, Number of problematic rows: 7
