In [1]:
import json
import pandas as pd
import matplotlib as mp
# for language detection
from presidio_analyzer import AnalyzerEngine, RecognizerResult
import pandas as pd
import spacy
import re

In [2]:
filepath = "../../data/cleaned/cleaned_csv_test.JSONL"
data = []

with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            continue

df = pd.DataFrame(data)
print(df)

        doc_id                                               text  \
0            0  bernadine brings to the agency over twenty yea...   
1            1  a cute denim skirt by dickies!\npair this jean...   
2            2  see the latest foothills homes february 2016 r...   
3            4  shape of the world is stu maxwell's passion pr...   
4            5  function dunfile(startcoord, rawpillardata, du...   
...        ...                                                ...   
197785  220743  and it was not "free to threaten". it was unde...   
197786  220744  catch up on business tips at bizbarcelona as t...   
197787  220745  problem i have is trying to figure out whether...   
197788  220746  i don't know why anybody's gonna get their pan...   
197789  220748  what is a tool or technique that can be used t...   

                                                      url  element_count  \
0       https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...              0   
1       http://www.

In [3]:
df

Unnamed: 0,doc_id,text,url,element_count,non-utf8_count,language
0,0,bernadine brings to the agency over twenty yea...,https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...,0,0,en
1,1,a cute denim skirt by dickies!\npair this jean...,http://www.uk-beaders.co.uk/dickies-denim-skir...,0,0,en
2,2,see the latest foothills homes february 2016 r...,https://www.premiertucsonhomes.com/foothills-h...,0,0,en
3,4,shape of the world is stu maxwell's passion pr...,https://killscreen.com/articles/shape-world-wa...,0,0,en
4,5,"function dunfile(startcoord, rawpillardata, du...",https://github.com/doggan/diablo-file-formats/...,0,0,en
...,...,...,...,...,...,...
197785,220743,"and it was not ""free to threaten"". it was unde...",,0,0,en
197786,220744,catch up on business tips at bizbarcelona as t...,https://www.barcelona-metropolitan.com/events/...,0,0,en
197787,220745,problem i have is trying to figure out whether...,,0,0,en
197788,220746,i don't know why anybody's gonna get their pan...,,0,0,en


In [3]:
# REduce df size for analysis
df = df[:2500]

# PII counts

In [4]:
analyzer = AnalyzerEngine()

In [5]:
def count_pii(text):
    """
    Count PII entities in a single text string using Presidio.
    Returns (total_hits, entity_counts_dict)
    """
    if not isinstance(text, str) or not text.strip():
        return 0, {}

    # Analyze text for common PII entities
    results = analyzer.analyze(
        text=text,
        entities=["EMAIL_ADDRESS", "PHONE_NUMBER"],
        language="en"
    )

    # Count occurrences
    total_hits = len(results)
    entity_counts = {}
    for r in results:
        entity_counts[r.entity_type] = entity_counts.get(r.entity_type, 0) + 1

    return total_hits, entity_counts

In [6]:
df["pii_results"] = df["text"].apply(count_pii)
df["pii_count"] = df["pii_results"].apply(lambda x: x[0])
df["pii_breakdown"] = df["pii_results"].apply(lambda x: x[1])
df = df.drop(columns="pii_results")

In [9]:
df.head(50)

Unnamed: 0,doc_id,text,url,element_count,non-utf8_count,language,pii_count,pii_breakdown
0,0,bernadine brings to the agency over twenty yea...,https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...,0,0,en,0,{}
1,1,a cute denim skirt by dickies!\npair this jean...,http://www.uk-beaders.co.uk/dickies-denim-skir...,0,0,en,0,{}
2,2,see the latest foothills homes february 2016 r...,https://www.premiertucsonhomes.com/foothills-h...,0,0,en,0,{}
3,4,shape of the world is stu maxwell's passion pr...,https://killscreen.com/articles/shape-world-wa...,0,0,en,0,{}
4,5,"function dunfile(startcoord, rawpillardata, du...",https://github.com/doggan/diablo-file-formats/...,0,0,en,0,{}
5,6,we offer a number of products.\nwhen purchasin...,http://lendsense.com/mortgage-products/,0,0,en,0,{}
6,7,"def example_lab_to_rgb():\n """"""\n conver...",https://github.com/gtaylor/python-colormath/bl...,0,0,en,0,{}
7,8,nowadays the most important protection element...,http://cybersecurity.mk/services/consulting,0,0,en,0,{}
8,10,"def select_host(self, metric):\n """"""\n ...",https://github.com/brutasse/graphite-api/blob/...,0,0,en,0,{}
9,11,welcome to my travel blog. my name is james fr...,https://www.jamesfrazermann.co.uk/,0,0,en,0,{}


# spacey

In [4]:
nlp = spacy.load("en_core_web_lg")

In [5]:
def count_pii_spacy(text):
    doc = nlp(text)
    
    entity_counts = {}
    total_hits = 0

    # Named Entity Recognition (NER) from spaCy
    for ent in doc.ents:
        if ent.label_ in ["PERSON"]:  # Common PII-related entities
            entity_counts[ent.label_] = entity_counts.get(ent.label_, 0) + 1
            total_hits += 1

    # Regex patterns for things spaCy misses (emails, phones, SSNs)
    patterns = {
        "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        "PHONE": r"\b(?:\+?61|0)[2-478](?:[ -]?\d){8}\b",
    }

    for label, pattern in patterns.items():
        matches = re.findall(pattern, text)
        if matches:
            entity_counts[label] = len(matches)
            total_hits += len(matches)
    
    return total_hits, entity_counts

In [6]:
df["pii_results"] = df["text"].apply(count_pii_spacy)
df["pii_count"] = df["pii_results"].apply(lambda x: x[0])
df["pii_breakdown"] = df["pii_results"].apply(lambda x: x[1])
df = df.drop(columns="pii_results")


In [12]:
df.iloc[0]

doc_id                                                            0
text              bernadine brings to the agency over twenty yea...
url               https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...
element_count                                                     0
non-utf8_count                                                    0
language                                                         en
pii_count                                                        33
pii_breakdown        {'PERSON': 8, 'ORG': 12, 'GPE': 12, 'NORP': 1}
Name: 0, dtype: object

# Regex PII

In [15]:
def mask_text(text):
    phone = re.compile(r'\b(?:\+?61|0)[2-478](?:[ -]?\d){8}\b')
    email = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    tfn = re.compile(r'\b\d{3}\s?\d{3}\s?\d{3}\b')

    masked_items = []

    if re.search(email, text):
        text = re.sub(email, "[EMAIL_MASKED]", text)
        masked_items.append("email")

    if re.search(phone, text):
        text = re.sub(phone, "[PHONE_MASKED]", text)
        masked_items.append("phone")

    if re.search(tfn, text):
        text = re.sub(tfn, "[TFN_MASKED]", text)
        masked_items.append("tfn")

    return text, ", ".join(masked_items) if masked_items else None

In [16]:
df[['text', 'masked_items']] = df['text'].apply(lambda x: pd.Series(mask_text(x)))

In [22]:
df['masked_items'].value_counts()

masked_items
email           86
tfn             16
phone            3
email, phone     1
Name: count, dtype: int64

In [41]:
emails = df[df['masked_items'] == 'email']
emails['text'].iloc[0]

'is there a way to create a direct link to a file on an internal network share on mac os x. in windows you can do this with file://filepath, but i can\'t find a solution that works on mac\nthank you\ngmail has a very nice ability to check for your pop accounts and send email through your own smtp settings. it will however always show something like "from:[EMAIL_MASKED] as [EMAIL_MASKED]" in the header of some more advanced email clients (like outlook).\nif you can handle setting up your own email server, go for it, but it will be hard and costly to achieve the same uptime and quality of service gmail does. if you have your own domain, i highly recommend using the google apps service so you get the quality of gmail, but with your own email address.'

# Toxic

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from multiprocessing import Pool, cpu_count

# --- Model setup ---
model_name = "unitary/toxic-bert"  # original Toxic-BERT
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = "cpu"  # CPU-only
model.to(device)
model.eval()  # important for inference

# --- Prediction function for a batch of texts ---
def predict_batch(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
    return probs[:, 1].tolist()  # probability of toxic class

# --- Function to process batches (used in multiprocessing) ---
def process_batch(batch_texts):
    return predict_batch(batch_texts)

# --- Split data into batches ---
batch_size = 64  # adjust based on RAM
texts = df['text'].astype(str).tolist()
batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]

# --- Use multiprocessing to process batches in parallel ---
num_cores = min(cpu_count(), len(batches))
with Pool(num_cores) as pool:
    results_list = pool.map(process_batch, batches)

# --- Flatten results ---
toxicity_scores = [score for batch in results_list for score in batch]
df['toxicity_score'] = toxicity_scores
df['is_toxic'] = df['toxicity_score'] >= 0.5

print("Toxic rows flagged:", df['is_toxic'].sum())


In [13]:
df

Unnamed: 0,doc_id,text,url,element_count,non-utf8_count,language,toxicity_score,is_toxic
0,0,bernadine brings to the agency over twenty yea...,https://www.4rfv.com/V7BTWGJ6BUYI/AA/bernadine...,0,0,en,0.094488,False
1,1,a cute denim skirt by dickies!\npair this jean...,http://www.uk-beaders.co.uk/dickies-denim-skir...,0,0,en,0.001384,False
2,2,see the latest foothills homes february 2016 r...,https://www.premiertucsonhomes.com/foothills-h...,0,0,en,0.093094,False
3,4,shape of the world is stu maxwell's passion pr...,https://killscreen.com/articles/shape-world-wa...,0,0,en,0.082762,False
4,5,"function dunfile(startcoord, rawpillardata, du...",https://github.com/doggan/diablo-file-formats/...,0,0,en,0.012584,False
...,...,...,...,...,...,...,...,...
2495,2761,the sad part is the invisibility of those suff...,,0,0,en,0.053282,False
2496,2762,goat poo is a fabulous fertilizer and is easi...,,0,0,en,0.020194,False
2497,2763,coming from a c# background the naming convent...,,0,0,en,0.091265,False
2498,2764,we have a team of highly qualified in their ro...,http://www.arab-adventures.com/about/our-team/,0,0,en,0.077829,False
