In [None]:
import os
import re
import csv

# Regex fragments for the "M./Mme le/la président(e)" title variants; a row
# containing any of them is excluded by filter_csv.
patterns = [
    r"Mme la présidente",
    r"Mme le présidente",
    r"M\. le président",
    r"Mme le président",
    r"Mme la président",
]

# One alternation regex built from the fragments: a hit on any single
# fragment counts as a match.
combined_pattern = re.compile(str.join("|", patterns))

# Function to filter lines in a CSV file and remove the 'Date_seances' column
def filter_csv(input_file, output_file):
    """Copy a CSV file, dropping matching rows and the 'Date_seances' column.

    A row is dropped when the space-joined text of its fields matches the
    module-level ``combined_pattern``. Every other row is written to
    ``output_file`` without its 'Date_seances' field. Rows shorter than the
    header are kept, with the missing fields written as empty strings.

    Args:
        input_file: Path of the UTF-8 CSV file to read; its first row is
            used as the header.
        output_file: Path of the UTF-8 CSV file to write. It is opened in
            'w' mode, so any existing content is replaced.

    Raises:
        ValueError: If a kept row has more fields than the header (raised
            by ``csv.DictWriter`` for the overflow entry).
    """
    # newline='' on both files lets the csv module handle line endings itself
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        if reader.fieldnames is None:
            # Empty input: there is no header to copy, leave the output empty
            return

        # Remove 'Date_seances' from fieldnames if it exists
        fieldnames = [field for field in reader.fieldnames if field != 'Date_seances']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # DictReader fills the missing fields of a short row with None,
            # which str.join rejects, so only real text is searched.
            row_text = " ".join(value for value in row.values() if isinstance(value, str))
            if combined_pattern.search(row_text):
                continue

            # Safely remove the column if it exists
            row.pop('Date_seances', None)
            writer.writerow(row)

# Function to process all CSV files in a folder
def process_csv_files(input_folder, output_folder):
    """Run ``filter_csv`` on every ``.csv`` file directly inside a folder.

    Each result is written to ``output_folder`` under the same file name.
    Files are handled in sorted name order. Problems with the folders are
    reported with ``print`` and end the call early; nothing is raised for
    them.

    Args:
        input_folder: Folder holding the CSV files to filter.
        output_folder: Folder receiving the filtered files; created if it
            does not exist. It must not be the input folder.

    Returns:
        None.
    """
    # Validate the input first, so a wrong path does not leave an empty
    # output folder behind. isdir also rejects a path that is a plain file.
    if not os.path.isdir(input_folder):
        print(f"Error: Input folder '{input_folder}' not found.")
        return  # Exit the function if the folder doesn't exist

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # filter_csv opens its output in 'w' mode before reading the input, so
    # filtering a folder onto itself would truncate every source file.
    if os.path.samefile(input_folder, output_folder):
        print(f"Error: Output folder '{output_folder}' is the input folder; refusing to overwrite the source files.")
        return

    for filename in sorted(os.listdir(input_folder)):
        input_file_path = os.path.join(input_folder, filename)
        # Process only CSV files (skip directories that merely end in .csv)
        if not (filename.endswith(".csv") and os.path.isfile(input_file_path)):
            continue

        output_file_path = os.path.join(output_folder, filename)

        print(f"Processing file: {input_file_path}")
        filter_csv(input_file_path, output_file_path)
        print(f"Filtered file saved to: {output_file_path}")

# Source and destination folders for this run
input_folder = 'compte_intervenants_et_pages'
output_folder = 'output_cleaned'

# Filter every CSV of the source folder into the destination folder
process_csv_files(input_folder=input_folder, output_folder=output_folder)

print("All CSV files have been processed and saved to the output folder.")


Processing file: compte_intervenants_et_pages/s20220222.csv
Filtered file saved to: output_cleaned/s20220222.csv
All CSV files have been processed and saved to the output folder.


In [None]:
import os
import re
import csv

# Regex fragments for the "M./Mme le/la président(e)" title variants; a row
# containing any of them is excluded by filter_csv.
patterns = [
    r"Mme la présidente",
    r"Mme le présidente",
    r"M\. le président",
    r"Mme le président",
    r"Mme la président",
]

# One alternation regex built from the fragments: a hit on any single
# fragment counts as a match.
combined_pattern = re.compile(str.join("|", patterns))

# Function to capitalize only the first letter of each word in the name
def format_name(name):
    """Strip a leading 'M.' / 'Mme' title and capitalize each part of a name.

    Words are separated on whitespace and re-joined with single spaces. Each
    hyphen-separated part of a word is capitalized on its own, so
    'JEAN-PIERRE' becomes 'Jean-Pierre' rather than 'Jean-pierre'.

    Args:
        name: The raw name, optionally prefixed with 'M.' or 'Mme'.
            ``None`` or an empty string is accepted.

    Returns:
        The formatted name, or '' when ``name`` is empty or ``None``.
    """
    if not name:
        return ""

    # Remove a leading title like "M." or "Mme". A trailing \b after "M\."
    # cannot match before a space ('.' and ' ' are both non-word characters),
    # so "M. Dupont" would keep its title; only "Mme" needs the boundary, to
    # avoid eating the start of a longer word.
    name = re.sub(r'^\s*(?:Mme\b|M\.)\s*', '', name)

    # str.capitalize lowercases everything after the first character, so
    # hyphenated names are capitalized part by part.
    return " ".join(
        "-".join(part.capitalize() for part in word.split("-"))
        for word in name.split()
    )

# Function to filter lines in a CSV file and modify 'Nom_intervenants' column
def filter_csv(input_file, output_file):
    """Copy a CSV file, dropping matching rows and reformatting speaker names.

    A row is dropped when the space-joined text of its fields matches the
    module-level ``combined_pattern``. Every other row is written to
    ``output_file`` without its 'Date_seances' field and with its
    'Nom_intervenants' value passed through ``format_name``. Rows shorter
    than the header are kept, with the missing fields written as empty
    strings.

    Args:
        input_file: Path of the UTF-8 CSV file to read; its first row is
            used as the header.
        output_file: Path of the UTF-8 CSV file to write. It is opened in
            'w' mode, so any existing content is replaced.

    Raises:
        ValueError: If a kept row has more fields than the header (raised
            by ``csv.DictWriter`` for the overflow entry).
    """
    # newline='' on both files lets the csv module handle line endings itself
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)
        if reader.fieldnames is None:
            # Empty input: there is no header to copy, leave the output empty
            return

        # Remove 'Date_seances' from fieldnames if it exists
        fieldnames = [field for field in reader.fieldnames if field != 'Date_seances']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # DictReader fills the missing fields of a short row with None,
            # which str.join rejects, so only real text is searched.
            row_text = " ".join(value for value in row.values() if isinstance(value, str))
            if combined_pattern.search(row_text):
                continue

            # Safely remove the column if it exists
            row.pop('Date_seances', None)

            # Apply name formatting to 'Nom_intervenants'; the value is None
            # when a short row stops before that column, so it is left alone.
            speaker = row.get('Nom_intervenants')
            if isinstance(speaker, str):
                row['Nom_intervenants'] = format_name(speaker)

            writer.writerow(row)

# Function to process all CSV files in a folder
def process_csv_files(input_folder, output_folder):
    """Run ``filter_csv`` on every ``.csv`` file directly inside a folder.

    Each result is written to ``output_folder`` under the same file name.
    Files are handled in sorted name order. Problems with the folders are
    reported with ``print`` and end the call early; nothing is raised for
    them.

    Args:
        input_folder: Folder holding the CSV files to filter.
        output_folder: Folder receiving the filtered files; created if it
            does not exist. It must not be the input folder.

    Returns:
        None.
    """
    # Validate the input first, so a wrong path does not leave an empty
    # output folder behind. isdir also rejects a path that is a plain file.
    if not os.path.isdir(input_folder):
        print(f"Error: Input folder '{input_folder}' not found.")
        return  # Exit the function if the folder doesn't exist

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # filter_csv opens its output in 'w' mode before reading the input, so
    # filtering a folder onto itself would truncate every source file.
    if os.path.samefile(input_folder, output_folder):
        print(f"Error: Output folder '{output_folder}' is the input folder; refusing to overwrite the source files.")
        return

    for filename in sorted(os.listdir(input_folder)):
        input_file_path = os.path.join(input_folder, filename)
        # Process only CSV files (skip directories that merely end in .csv)
        if not (filename.endswith(".csv") and os.path.isfile(input_file_path)):
            continue

        output_file_path = os.path.join(output_folder, filename)

        print(f"Processing file: {input_file_path}")
        filter_csv(input_file_path, output_file_path)
        print(f"Filtered file saved to: {output_file_path}")

# Source and destination folders for this run
input_folder = 'compte_intervenants_et_pages'
output_folder = 'output_cleaned'

# Filter every CSV of the source folder into the destination folder
process_csv_files(input_folder=input_folder, output_folder=output_folder)

print("All CSV files have been processed and saved to the output folder.")


Processing file: compte_intervenants_et_pages/s20220222.csv
Filtered file saved to: output_cleaned/s20220222.csv
All CSV files have been processed and saved to the output folder.
