# Automated Anonymization of Job Ads

## Libraries and settings

In [1]:
# Settings
import warnings
# Installs a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides library usage/deprecation warnings that can
# point at real problems - consider restricting the filter to specific
# categories.
warnings.filterwarnings("ignore")

# Libraries
import sys
import os
import re
import gzip
import time
import json
import spacy
import spacy.cli
import boto3
import ollama
import random
import subprocess
import pandas as pd
from io import BytesIO
from openai import OpenAI
from pprint import pprint
from langdetect import detect
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import clear_output

# Load spacy models
def load_model(model_name):
    """Load a spaCy pipeline, downloading it first when it is not installed.

    Parameters
    ----------
    model_name : str
        Name of the spaCy pipeline package (e.g. "en_core_web_sm").

    Returns
    -------
    The object returned by ``spacy.load`` or ``None`` when the pipeline could
    neither be loaded nor installed (a message is printed in that case).
    """
    # First attempt: the pipeline is already installed
    try:
        return spacy.load(model_name)
    except OSError:
        # spacy.load signals a missing package with OSError -> try to install
        pass

    # Second attempt: download with the interpreter running this kernel,
    # then load again
    try:
        print(f"Model {model_name} not found. Attempting to install...")
        download_cmd = [sys.executable, "-m", "spacy", "download", model_name]
        subprocess.run(download_cmd, check=True)
        return spacy.load(model_name)
    except Exception as e:
        print(f"Failed to install or load {model_name}: {e}")
        return None

# Language code -> name of the spaCy pipeline to use for that language
_spacy_model_names = {
    "de": "de_core_news_sm",
    "en": "en_core_web_sm",
    "fr": "fr_core_news_sm",
    "it": "it_core_news_sm",
}

# Load every pipeline once; an entry is None when load_model() failed
nlp_models = {
    lang_code: load_model(model_name)
    for lang_code, model_name in _spacy_model_names.items()
}

# Display current working directory
print(os.getcwd())


/home/ec2-user/SageMaker/skill_framework/notebooks/explorative


## Read job ads from S3 bucket

In [2]:
# Define bucket and key
bucket_name = 'jc-innosuisse'
object_key = 'ad_data/full_datapool_100000.json'

# Initialize S3 client and fetch the object
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=bucket_name, Key=object_key)

# Obtain the raw JSON bytes; the key suffix decides whether the payload
# has to be decompressed first
if object_key.endswith('.json.gz'):
    # Gzipped JSON: read the body and decompress it
    with gzip.GzipFile(fileobj=BytesIO(response['Body'].read())) as gz:
        json_bytes = gz.read()
elif object_key.endswith('.json'):
    # Regular JSON: read the body directly
    json_bytes = response['Body'].read()
else:
    raise ValueError(
        f"Unsupported file format: {object_key}. Expected .json or .json.gz")

# Parse the UTF-8 encoded JSON document
data = json.loads(json_bytes.decode('utf-8'))

# Create DataFrame
df = pd.DataFrame(data)

# Show shape
print(df.shape)


(10000, 27)


## Create subset of columns

In [3]:
# Create subset of columns.
# The explicit .copy() makes df_sub an independent frame: later cells add
# columns to it, and assigning to a plain column selection of `df` triggers
# pandas' SettingWithCopyWarning.
df_sub = df[['id', 'title', 'leadText', 'html',
             'profession', 'updatedAt']].copy()

# Show shape
print(df_sub.shape)

# Show data
df_sub.head()


(10000, 6)


Unnamed: 0,id,title,leadText,html,profession,updatedAt
0,5fed0746-a0d0-4559-8062-01df1ef952a0,Supply Chain Manager,<b>Sie sind ein geschickter Verhandlungspartne...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",Supply Chain Manager,2018-08-28T13:59:11+02:00
1,5104eb99-c228-4891-b147-4cc0269e4957,Solution Manager (w/m) mit Ausprägung Projekt-...,<p></p><p>Für unseren Bereich IT-Services in B...,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n <...",Solution Manager (w/m) mit Ausprägung Projekt-...,2018-08-28T13:59:12+02:00
2,e3ee1b00-5560-4f3d-9fb6-ead9fac78a7f,PMO Assistant,For our customer in Zurich we are looking for ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",PMO Assistant,2018-08-28T13:59:12+02:00
3,e62ad9c3-a045-455f-bb08-22cf448fbef3,Développeur z/OS,Développeur z/OS,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",Développeur z/OS,2018-08-28T13:59:14+02:00
4,ee06443e-a30e-4405-9d9a-af0370cfd43a,Marketing Manager Konsumgüter,<p>Unser Kunde ist ein etabliertes innovatives...,"<html>\n <head>\n <meta http-equiv=""content-t...",Marketing Manager Konsumgüter,2018-08-28T13:59:15+02:00


## Parse HTML with beautifulsoup

### Parsing function and function call

In [4]:
# Safe text extraction function
def extract_text_from_html(html):
    """Return the visible text of an HTML document.

    Missing values (as judged by ``pd.isna``) yield an empty string. Any other
    value is converted with ``str`` and parsed with BeautifulSoup's
    'html.parser'; the text is returned with stripped fragments joined by a
    single space.
    """
    if not pd.isna(html):
        parsed = BeautifulSoup(str(html), 'html.parser')
        return parsed.get_text(separator=' ', strip=True)
    return ""

# Extract the plain text of every ad (missing HTML becomes an empty string)
df_sub['text'] = df_sub['html'].map(extract_text_from_html)

### Show extracted text

In [5]:
# Show the text extracted from the HTML of the row labelled 0
df_sub.loc[0, 'text']


'Supply Chain Manager Sie sind ein geschickter Verhandlungspartner!? Unsere Mandantin ist ein weltweit marktfrendes Unternehmen der High Tech-Industrie. Supply Chain Manager Ihre Aufgaben Mitglied im interdisziplinen Projektteam f ProduktentwicklungenErstellen verschiedener KonzepteAkquisition von neuen, internationalen LieferantenVom Prototyp bis zur Serienreife einkaufen der benigten MaterialienBerksichtigen der Termine, Kosten + QualitKennzahlen ermitteln + erwachenAushandeln und Unterzeichnen von Vertren + Vereinbarungen Ihr Anforderungsprofil Techn. Oder kaufm. Lehre in einem mit der Technik verbundenen BetriebHF-AbschlussMehrjrige Projekt- und Einkaufserfahrung (operativ/strategisch)Deutsch und EnglischEDV-Kenntnisse MS-Office, SAP-System R/3 Das Angebot lebhaftes, spannendes Arbeitsumfeld Roger Heggli freut sich auf Ihre Kontaktnahme. Bitte senden Sie Ihre vollstdigen Bewerbungsunterlagen per E-Mail. > > > Weitere Stellen - Angebote < < < ZIWALIG AG Ri Herr Roger Heggli Ferrachs

## Function to anonymize 'text' and 'html' entries using language detection

In [6]:
# Detect language from text only
def detect_language(text):
    """Return the language code detected for ``text``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The value returned by ``detect(text)``, or "en" as fallback when the
        input is not a non-blank string or detection fails.
    """
    # Nothing to detect on missing / blank input -> use the fallback directly
    if not isinstance(text, str) or not text.strip():
        return "en"
    try:
        return detect(text)
    except Exception:
        # Catch detection errors only; unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit through, so a long run can be stopped
        return "en"

# Redaction patterns, compiled once instead of on every call
_PHONE_PATTERNS = tuple(re.compile(p) for p in (
    r'(\+|00)?41[\s\-]?\d{2}([\s\-]?\d{2}){3}',
    r'\b\d{3}[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}\b',
    r'(?:(?:\+|00)41\s?|0)\d{2}\s?\d{3}\s?\d{2}\s?\d{2}',
    r'(?:(?:\+|00)\s*41\s*)?\(?0?\d{2}\)?[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}',
))
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')


# Anonymize a given string using language model
def anonymize_with_lang(text, lang_code):
    """Redact person names, e-mail addresses and phone numbers in ``text``.

    Parameters
    ----------
    text : str
        Text (or HTML source) to anonymize. Non-string input yields "".
    lang_code : str
        Key into ``nlp_models`` selecting the NER model. When there is no
        usable model for that key, the "en" model is used instead.

    Returns
    -------
    str
        The text with entities labelled "PER"/"PERSON" replaced by
        "[REDACTED NAME]", e-mail addresses by "[REDACTED EMAIL]" and phone
        numbers by "[REDACTED PHONE]".

    Notes
    -----
    If no model is available or the model raises, name redaction is skipped
    but the regex-based e-mail and phone redaction is still applied, so the
    function never returns the input completely unredacted.
    """
    if not isinstance(text, str):
        return ""

    # A model that failed to load is stored as None, so the key exists and
    # `.get(lang_code, default)` would hand back None -> check explicitly
    nlp = nlp_models.get(lang_code)
    if nlp is None:
        nlp = nlp_models.get("en")

    # Personal names: collect character spans of person entities
    name_spans = []
    if nlp is not None:
        try:
            doc = nlp(text)
        except Exception:
            # Best effort: a failing model must not prevent the regex
            # redaction below from running
            doc = None
        if doc is not None:
            name_spans = [
                (ent.start_char, ent.end_char)
                for ent in doc.ents
                if ent.label_ in {"PER", "PERSON"}
            ]

    # Replace from the end of the text so earlier offsets stay valid
    redacted_text = text
    for start, end in sorted(name_spans, reverse=True):
        redacted_text = (
            redacted_text[:start] + "[REDACTED NAME]" + redacted_text[end:]
        )

    # E-mail redaction runs BEFORE phone redaction: a digit run inside the
    # local part would otherwise be replaced by "[REDACTED PHONE]", after
    # which the e-mail pattern no longer matches and the domain leaks
    redacted_text = _EMAIL_PATTERN.sub("[REDACTED EMAIL]", redacted_text)

    # Phone redaction
    for pattern in _PHONE_PATTERNS:
        redacted_text = pattern.sub("[REDACTED PHONE]", redacted_text)

    return redacted_text


## Apply anonymization using language detection

In [7]:
%%time

# Limit to first 100 rows
df_sub = df_sub.iloc[:100].copy()

# Detect language from 'text' → save in 'text_lang'
df_sub['text_lang'] = df_sub['text'].apply(
    lambda value: "en" if pd.isnull(value) else detect_language(value)
)


def _redact_column(frame, column):
    """Anonymize ``frame[column]`` row by row using each row's 'text_lang'.

    Missing values are mapped to an empty string.
    """
    def _redact_row(row):
        value = row[column]
        if pd.isnull(value):
            return ""
        return anonymize_with_lang(value, row['text_lang'])

    return frame.apply(_redact_row, axis=1)


# Anonymize 'text' using detected 'text_lang'
df_sub['text_redacted'] = _redact_column(df_sub, 'text')

# Anonymize 'html' using same 'text_lang'
df_sub['html_redacted'] = _redact_column(df_sub, 'html')


CPU times: user 18.2 s, sys: 64.1 ms, total: 18.2 s
Wall time: 18.2 s


## Show result

In [8]:
# Original and redacted columns side by side for the first rows
result_columns = ['text', 'text_lang', 'text_redacted',
                  'html', 'html_redacted']
df_sub.head()[result_columns]


Unnamed: 0,text,text_lang,text_redacted,html,html_redacted
0,Supply Chain Manager Sie sind ein geschickter ...,de,[REDACTED NAME] Manager Sie sind ein geschickt...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...","<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
1,Solution Manager (w/m) mit Ausprägung Projekt-...,de,Solution Manager (w/m) mit Ausprägung Projekt-...,"<!DOCTYPE html>\n<html lang=""de"">\n<head>\n <...","<!DOCTYPE html>\n<html lang=""de"">\n<head>\n <..."
2,PMO Assistant Die Harvey Nash AG ist einer der...,en,PMO Assistant Die Harvey Nash AG ist einer der...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...","<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
3,Développeur z/OS Find an Office | Contact Us |...,fr,Développeur z/OS Find an Office | Contact Us |...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...","<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
4,Marketing Manager Konsumger Unser Kunde ist ei...,de,[REDACTED NAME] Manager Konsumger Unser Kunde ...,"<html>\n <head>\n <meta http-equiv=""content-t...","<html>\n <head>\n <meta http-equiv=""content-t..."


## Manually check for remaining personally identifiable information (PII)

In [9]:
# Define indices: draw up to 10 distinct row positions.
# range() starts at 0 so the first row can be sampled too, and the sample
# size is capped because random.sample() raises ValueError when asked for
# more items than the population holds.
indices = random.sample(range(len(df_sub)), min(10, len(df_sub)))

# Loops through indices
for i in indices:
    # Positional access keeps this working even if the index is not 0..n-1
    row = df_sub.iloc[i]
    print("Index:", df_sub.index[i], "Sprache:", row['text_lang'].upper())
    print(row['text_redacted'])
    # Leave the ad on screen for 5 seconds, then clear it for the next one
    time.sleep(5)
    clear_output(wait=True)


Index: 74 Sprache: DE
[REDACTED NAME] Officer 510185 Interim positions for Banking - Finance - Sales & HR Unser Kunde  ist ein international tÃ¤tiger und namhafter Hersteller von  KonsumgÃ¼tern. In seinem Auftrag suchen wir fÃ¼r den Standort Glattbrugg ab sofort fÃ¼r ca. ein Jahr eine/einen Interim [REDACTED NAME] 510185 Ihre Aufgaben In  dieser Funktion sind Sie verantwortlich fÃ¼r die gesamten  Lohnabrechnungen der Mitarbeitenden zustÃ¤ndig. ZusÃ¤tzlich gelten Sie als  kompetente Ansprechperson in allen Sozial- und Versicherungsrechtlichen  Fragen und unterstÃ¼tzen das HR Team bei der allgemeinen  SalÃ¤radministration. Â Ihr Profil FÃ¼r  diese vielseitige Aufgabe verfÃ¼gen Sie Ã¼ber eine kaufmÃ¤nnische  Grundausbildung von Vorteil mit einer Weiterbildung im Bereich  SalÃ¤rwesen. Sie kommunizieren stilsicher in Deutsch und verstÃ¤ndigen  sich ohne MÃ¼he auf Englisch. Abgerundet wird Ihr Profil durch versierte  Kenntnisse der MS-Office Palette. Erste Erfahrungen mit SAP sind von  Vorte

## Use a random sample of the data to check for remaining personally identifiable information (PII)

### LLM settings (local LLM used via Ollama, secure but slow :-))

In [10]:
# Load environment variables from .env (optional, only needed if the OpenAI API is used)
load_dotenv()

# Function to check for personally identifiable information (PII)
def check_for_remaining_pii(text_block, label, model_name="gemma3:1b"):
    """Ask an Ollama chat model whether redacted content still contains PII.

    Parameters
    ----------
    text_block : str
        The redacted content to review.
    label : str
        Name of the content (inserted into the user prompt).
    model_name : str, default "gemma3:1b"
        Model passed to ``ollama.chat``.

    Returns
    -------
    str
        The stripped message content of the model's reply, or
        "Error: <message>" when the call raises.
    """
    auditor_instructions = (
        "You are a data privacy auditor. Review the following redacted "
        "content and report if any personally identifiable information "
        "(PII) such as names, emails, phone numbers, addresses, locations, "
        "or company names remain."
    )
    review_request = (
        f"Here is the {label} content to review:\n\n{text_block}\n\n"
        "Does it contain any remaining personal data? If yes, specify "
        "what and where. If not, say 'No PII found.'"
    )
    conversation = [
        {"role": "system", "content": auditor_instructions},
        {"role": "user", "content": review_request},
    ]

    try:
        reply = ollama.chat(model=model_name, messages=conversation)
        answer = reply['message']['content'].strip()
    except Exception as err:
        # Errors are reported as text so a loop over many rows keeps running
        return "Error: " + str(err)
    return answer


### Run LLM check on sampled rows

In [11]:
# Sample rows (include e.g. 'random_state=42' to set a seed)
checked_columns = ['text_redacted', 'html_redacted']
sample_df = df_sub.sample(n=5)[checked_columns]

# Run LLM check on every sampled row, one redacted column after the other
for idx, row in sample_df.iterrows():
    for column in checked_columns:
        print(f"\n Row {idx} - {column}:\n{'-'*60}")
        print(check_for_remaining_pii(row[column], column))



 Row 56 - text_redacted:
------------------------------------------------------------
No PII found.

Here’s a breakdown of why:

*   **Names:** "Claudia Reist" – This is a name, but not a specific identifier.
*   **Emails:** “[REDACTED EMAIL]” – The email address is redacted, but it's present.
*   **Phone Numbers:** Not present in the text.
*   **Addresses:** Not present in the text.
*   **Locations:** Not present in the text.
*   **Company Names:** "REVIVAL Consulting AG" – This is a company name, but it's not a PII element.

 Row 56 - html_redacted:
------------------------------------------------------------
No PII found.

Here’s a breakdown of why:

* **Meta Content:** The `Meta` tag clearly states it's a “CI Layout” and includes a “ISO-8859-1” encoding. This is technical metadata and not personal data.
* **Styling:** The CSS styles are for branding and are not identifiable information.
* **Table Data:** The table contains information about employees – names, titles, addresses, an

## Write anonymized data to json file

In [12]:
# File path
file_path = '../../data/jobcloud/full_datapool_100000_anonymized.json'

# Make sure the target directory exists; to_json() does not create missing
# directories and would fail on a fresh checkout
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# NOTE(review): 'html' and 'text' are the unredacted source columns, so the
# file written here still contains the original content next to the redacted
# versions. The column list is left unchanged in case consumers rely on it -
# confirm whether the raw columns should be part of the "anonymized" export.
output_columns = ['id', 'title', 'leadText', 'html', 'profession',
                  'updatedAt', 'text', 'text_lang', 'text_redacted',
                  'html_redacted']

# Write to json (one JSON record per line, non-ASCII characters kept as-is)
df_sub[output_columns].to_json(file_path,
                               orient='records',
                               lines=True,
                               force_ascii=False)
