In [3]:
import pandas as pd
import json
import os
import time
import logging
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
# Initialize logging: records at ERROR level and above are appended to
# "pii_detection_errors.log" (path is relative to the current working directory).
# Note: basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(filename="pii_detection_errors.log", level=logging.ERROR,
                    format="%(asctime)s - %(levelname)s - %(message)s")

# Add PII system path so that `pii.py` can be imported below.
pii_script_path = r"C:\Users\ousse\Desktop\pii\pii_protection"
# Guarded so that re-running this cell does not keep appending duplicate entries.
if pii_script_path not in sys.path:
    sys.path.append(pii_script_path)

# Import the PII detection system
from pii import PIIProtectionLayer


In [4]:
# Rebinds the name `tqdm` from the class imported earlier (`from tqdm import tqdm`)
# to the module itself; process_texts relies on this when it calls `tqdm.tqdm(...)`.
import tqdm

# Import guard: print a readable hint about where pii.py is expected, then
# re-raise the original exception so the cell still fails.
try:
    from pii import PIIProtectionLayer
except ModuleNotFoundError as import_error:
    print(f"❌ Error: Unable to import PII module. Check if 'pii.py' exists in: {pii_script_path}")
    raise import_error

# File Paths (raw strings; same values as the escaped form)
file_path = r"C:\Users\ousse\Desktop\pii\augmented_data_partial.csv"
output_json = r"C:\Users\ousse\Desktop\pii\pii_detection_results.json"
output_csv = r"C:\Users\ousse\Desktop\pii\pii_detection_results.csv"

# Load dataset
def load_data(file_path):
    """Read the input CSV and check that it has an 'augmented_text' column.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when reading fails or the required
        column is absent. In the failure case the reason is printed.
    """
    required_column = 'augmented_text'
    try:
        frame = pd.read_csv(file_path)
        if required_column in frame.columns:
            return frame
        # Raised inside the try on purpose: it is reported by the handler below.
        raise ValueError("Missing 'augmented_text' column in CSV")
    except Exception as exc:
        print(f"❌ Error loading CSV: {exc!s}")
        return None

def process_texts(df, detector):
    """Run PII detection over every value of ``df['augmented_text']``.

    Args:
        df: DataFrame containing an 'augmented_text' column.
        detector: Object exposing ``analyze_text(text, language='en')``.

    Returns:
        A ``(results, errors)`` tuple. ``results`` holds the detector output for
        each row that was analysed successfully, in row order. ``errors`` holds
        one message per row that was skipped or whose analysis raised.

    Notes:
        Expects the module-level name ``tqdm`` to be the tqdm *module*
        (``import tqdm``), since the progress bar is created via ``tqdm.tqdm``.
        Each failure is printed and also sent to ``logging.error`` so it reaches
        the error log configured for this notebook.
    """
    results = []
    errors = []

    def _record_error(index, reason, exc_info=False):
        # Single place that prints, logs and collects a per-row failure.
        error_msg = f"Error processing row {index}: {reason}"
        print(f"❌ {error_msg}")
        logging.error(error_msg, exc_info=exc_info)
        errors.append(error_msg)

    print("🔍 Starting PII detection...\n")
    # Iterate over the single needed column instead of building a Series per row.
    for index, text in tqdm.tqdm(df['augmented_text'].items(), total=len(df),
                                 desc="Processing Texts", unit="text"):
        # Empty CSV cells arrive as NaN (a float); report them explicitly
        # instead of handing a non-string value to the detector.
        if text is None or text is pd.NA or (isinstance(text, float) and pd.isna(text)):
            _record_error(index, "missing text")
            continue
        try:
            results.append(detector.analyze_text(text, language='en'))
        except Exception as e:
            # Best effort: one bad row must not abort the whole run.
            _record_error(index, str(e), exc_info=True)

    return results, errors

# Fix JSON serialization issue
def save_results(results, output_json, output_csv):
    """Write detection results to a JSON file and a flattened CSV file.

    Args:
        results: List of result dicts. Each must provide 'original_text',
            'redacted_text' and 'detected_entities'; every entity must provide
            'text', 'type', 'score', 'method', 'start' and 'end'.
        output_json: Destination path for the full results (UTF-8 JSON).
        output_csv: Destination path for the per-entity table (UTF-8 CSV).

    Raises:
        TypeError: If a value in ``results`` is not JSON serializable and is
            not a set or frozenset.
        KeyError: If a result or entity lacks one of the keys listed above.
    """
    # Fixed column order; also guarantees a header row when nothing was detected.
    csv_columns = [
        "original_text", "redacted_text", "entity_text", "entity_type",
        "confidence", "method", "start", "end",
    ]

    def convert_set(obj):
        # json cannot encode sets; emit them as lists.
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        raise TypeError(f"Type {type(obj)} not serializable")

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False, default=convert_set)

    # One CSV row per detected entity, repeating the text it was found in.
    flat_results = [
        {
            "original_text": res["original_text"],
            "redacted_text": res["redacted_text"],
            "entity_text": entity["text"],
            "entity_type": entity["type"],
            "confidence": entity["score"],
            "method": entity["method"],
            "start": entity["start"],
            "end": entity["end"],
        }
        for res in results
        for entity in res['detected_entities']
    ]

    # Passing `columns` keeps the CSV readable (header only) when flat_results is empty.
    df = pd.DataFrame(flat_results, columns=csv_columns)
    df.to_csv(output_csv, index=False, encoding='utf-8')

    print(f"✅ Results saved to {output_json} and {output_csv}")

def generate_summary(results):
    """Print a short report of the detections and return the totals.

    Args:
        results: List of result dicts, each with a 'detected_entities' list
            whose items carry a 'method' key.

    Returns:
        A ``(entity_count, method_counts)`` tuple: the total number of detected
        entities and a dict mapping each detection method to its count, keyed
        in order of first appearance.
    """
    # Flatten to one method name per detected entity.
    methods_seen = [
        entity["method"]
        for res in results
        for entity in res["detected_entities"]
    ]
    entity_count = len(methods_seen)

    method_counts = {}
    for name in methods_seen:
        method_counts.setdefault(name, 0)
        method_counts[name] += 1

    report_lines = [
        "\n📊 Summary of Detected PII:",
        f"Total Texts Processed: {len(results)}",
        f"Total Entities Detected: {entity_count}",
        "Detection Methods Used:",
    ]
    report_lines.extend(
        f"  - {method}: {count} detections"
        for method, count in method_counts.items()
    )
    print("\n".join(report_lines))
    return entity_count, method_counts

# Main execution: load the CSV, run detection, save the results, print a summary.
df = load_data(file_path)

if df is None:
    # load_data has already printed the underlying reason.
    print("❌ Could not process texts due to errors in loading data.")
else:
    detector = PIIProtectionLayer()
    results, errors = process_texts(df, detector)

    save_results(results, output_json, output_csv)
    generate_summary(results)

    # Echo at most the first five error messages, followed by the overall count.
    if len(errors) > 0:
        print("\n⚠️ Errors Encountered:")
        for error in errors[:5]:
            print(f"  - {error}")
        print(f"  (Total Errors: {len(errors)})")

Both `device` and `device_map` are specified. `device` will override `device_map`. You will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`.


🔍 Starting PII detection...



Processing Texts: 100%|██████████| 126/126 [05:32<00:00,  2.64s/text]

✅ Results saved to C:\Users\ousse\Desktop\pii\pii_detection_results.json and C:\Users\ousse\Desktop\pii\pii_detection_results.csv

📊 Summary of Detected PII:
Total Texts Processed: 126
Total Entities Detected: 86
Detection Methods Used:
  - spacy: 65 detections
  - regex: 21 detections



