In [4]:
# ============================================================
# 0. INSTALL DEPENDENCIES  (RUN ONCE)
# ============================================================
!pip install -q spacy nltk pandas sentence-transformers tqdm

# Download spaCy English model
!python -m spacy download en_core_web_sm

import nltk
nltk.download("punkt")


# ============================================================
# 1. IMPORTS & BASIC SETUP
# ============================================================
import re
import json
import itertools
import numpy as np
import pandas as pd
import spacy
import nltk

from google.colab import files
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# ============================================================
# 2. LOAD NLP MODELS
# ============================================================

# Load the small English spaCy pipeline. In the visible cells it is only
# called as nlp(text) to obtain sentence boundaries (doc.sents).
nlp = spacy.load("en_core_web_sm")

# Raise spaCy's per-document character limit to 2,000,000 so that a whole
# uploaded document can be passed to nlp() in a single call.
nlp.max_length = 2_000_000  # required for large GDPR / Microsoft policy files

# Load the sentence-embedding model once; the embedding section further down
# reuses this object under the name `model`.
bert_model = SentenceTransformer("all-MiniLM-L6-v2")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# ============================================================
# 2. UPLOAD TEXT FILES
# ============================================================
def upload_text_file(prompt="Upload a .txt file"):
    """Ask for a file upload in Colab and return its decoded text.

    Args:
        prompt: Message printed before the upload dialog is opened.

    Returns:
        A ``(text, filename)`` tuple for the first uploaded file.

    Raises:
        ValueError: If the upload finished without any file (for example the
            dialog was cancelled), instead of an opaque IndexError.
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        raise ValueError(
            "No file was uploaded; re-run the cell and select a .txt file."
        )
    filename = next(iter(uploaded))
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
    # mark, which otherwise ends up glued to the first sentence of the text.
    text = uploaded[filename].decode("utf-8-sig", errors="ignore")
    print(f"\nLoaded file: {filename} ({len(text)} characters)")
    return text, filename

print("=== STEP 1: Upload REGULATORY (e.g., GDPR) TEXT FILE ===")

# Regulatory text is uploaded first, then the company policy.
reg_text, reg_filename = upload_text_file(prompt="Upload REGULATORY document (.txt) e.g., GDPR")

print("\n=== STEP 2: Upload COMPANY SECURITY / PRIVACY POLICY TEXT FILE ===")
policy_text, policy_filename = upload_text_file(prompt="Upload COMPANY SECURITY / PRIVACY POLICY (.txt)")

# ============================================================
# 3. PREPROCESSING & SENTENCE SPLITTING
# ============================================================
def basic_clean(text: str) -> str:
    """Collapse every run of whitespace (carriage returns included) into a
    single space and trim leading/trailing whitespace."""
    without_cr = text.replace("\r", " ")
    return re.sub(r"\s+", " ", without_cr).strip()

def split_into_sentences(text: str):
    """Run the spaCy pipeline over ``text`` and return its sentences as a
    list of stripped strings, leaving out any that are empty after stripping."""
    sentences = []
    for span in nlp(text).sents:
        stripped = span.text.strip()
        if stripped:
            sentences.append(stripped)
    return sentences

# Normalise whitespace in both documents before sentence segmentation.
clean_reg_text, clean_policy_text = basic_clean(reg_text), basic_clean(policy_text)

# Segment each cleaned document into a list of sentence strings.
reg_sentences, policy_sentences = (
    split_into_sentences(clean_reg_text),
    split_into_sentences(clean_policy_text),
)

print(f"\nRegulatory document: {len(reg_sentences)} sentences")
print(f"Company policy document: {len(policy_sentences)} sentences")

# ============================================================
# 4. RULE-BASED PATTERNS & ARTICLE TRACKING
# ============================================================
# Category name -> keywords. Keywords are written in lowercase because they
# are compared against the lowercased sentence as plain substrings.
POLICY_PATTERNS = {
    # General data-protection principles
    "data_protection": [
        "personal data",
        "data protection",
        "privacy",
        "processing of personal data",
        "data minimization",
        "data minimisation",
        "purpose limitation",
    ],
    # Rights of the data subject
    "data_subject_rights": [
        "right to access",
        "right of access",
        "right to rectification",
        "right to erasure",
        "right to be forgotten",
        "data subject rights",
    ],
    # Technical and organisational safeguards (both UK and US spellings)
    "security_measures": [
        "encryption",
        "pseudonymisation",
        "pseudonymization",
        "access control",
        "security measures",
        "technical and organisational",
        "technical and organizational",
        "multi-factor",
        "two-factor",
        "mfa",
        "risk assessment",
    ],
    # Storage duration
    "data_retention": [
        "retention",
        "storage limitation",
        "data retention",
        "retain data for",
    ],
    # Breach handling and notification
    "breach_notification": [
        "data breach",
        "breach notification",
        "notify the supervisory authority",
        "personal data breach",
        "72 hours",
    ],
    # Processors and other external parties
    "third_parties": [
        "processor",
        "sub-processor",
        "third party",
        "joint controller",
        "data processing agreement",
    ],
}

def classify_sentence(sentence: str, patterns=None):
    """Return the categories whose keywords occur in ``sentence``.

    Matching is a plain substring test against the lowercased sentence, so
    keywords are expected to be lowercase.

    Args:
        sentence: Text to classify.
        patterns: Optional mapping of category name to an iterable of
            lowercase keywords. When omitted, the module-level
            ``POLICY_PATTERNS`` table is used.

    Returns:
        A list of matching category names in the mapping's iteration order;
        empty when nothing matches.
    """
    if patterns is None:
        patterns = POLICY_PATTERNS
    lowered = sentence.lower()
    return [
        category
        for category, keywords in patterns.items()
        if any(keyword in lowered for keyword in keywords)
    ]

# Obligation phrases matched as whole words, so that e.g. "Marshall",
# "shallow" or "mustard" do not count as "shall" / "must".
_OBLIGATION_RE = re.compile(
    r"\b(?:shall|must|required to|obliged to|ensure that)\b"
)

def is_potential_requirement(sentence: str):
    """Tell whether ``sentence`` contains obligation wording.

    Args:
        sentence: Text to inspect; compared case-insensitively.

    Returns:
        True if any of "shall", "must", "required to", "obliged to" or
        "ensure that" occurs as a whole word/phrase, otherwise False.
    """
    return _OBLIGATION_RE.search(sentence.lower()) is not None

# Track article headings and propagate to following sentences
def assign_articles_to_sentences(sentences, headings_only=False):
    """
    Label every sentence with the most recently seen article reference.

    When a sentence contains "Article 32 ..." (or "Art. 32"), the current
    article becomes "Article 32"; all following sentences inherit it until
    another article reference is found. Sentences before the first reference
    get an empty string.

    Labels are normalised to "Article <number>": paragraph suffixes such as
    "(1)" are dropped and a missing space ("Art.32") is restored, so labels
    compare equal to plain entries like "Article 32". A single lowercase
    letter directly after the number ("Article 12a") is kept.

    Args:
        sentences: Iterable of sentence strings, in document order.
        headings_only: If False (default), a reference anywhere in a sentence
            updates the current article, which includes in-text
            cross-references. If True, only sentences that start with the
            reference are treated as headings.

    Returns:
        A list of labels, one per input sentence, in the same order.
    """
    anchor = r"^\s*" if headings_only else ""
    article_pattern = re.compile(
        anchor + r"(?:Article|Art\.)\s*(\d+(?:[a-z](?![A-Za-z]))?)"
    )
    current_article = ""
    article_by_index = []

    for s in sentences:
        m = article_pattern.search(s)
        if m:
            # Rebuild the label from the captured number for a canonical form.
            current_article = f"Article {m.group(1)}"
        article_by_index.append(current_article)

    return article_by_index

# One article label per regulatory sentence, aligned by index with reg_sentences.
reg_article_labels = assign_articles_to_sentences(reg_sentences)

# Quick sanity check of the rule-based helpers: the [:1] slice limits the
# loop to the first sentence, and the text is cut to 120 characters for display.
print("\nSample classification from regulatory document:")
for s in reg_sentences[:1]:
    print("  SENT:", s[:120], "...")
    print("    categories:", classify_sentence(s))
    print("    is_requirement:", is_potential_requirement(s))

# ============================================================
# 5. BUILD COMPLIANCE REQUIREMENTS
# ============================================================
def build_compliance_requirements(sentences, article_labels, source_doc_name):
    """Turn regulatory sentences into checklist records.

    A sentence is kept only if ``classify_sentence`` returns at least one
    category for it and ``is_potential_requirement`` accepts it. Kept
    sentences are numbered consecutively as REQ-0001, REQ-0002, ...

    Args:
        sentences: Sentence strings from the regulatory document.
        article_labels: Article label for each sentence, paired by position
            (pairing stops at the shorter of the two inputs).
        source_doc_name: Value stored in each record's "source_document".

    Returns:
        A list of dicts with the keys requirement_id, text, categories
        (joined with ";"), article, source_document, status and evidence.
    """
    extracted = []

    for sentence_text, article in zip(sentences, article_labels):
        matched_categories = classify_sentence(sentence_text)
        # Drop sentences with no category match, and matched sentences that
        # lack obligation wording.
        if not matched_categories or not is_potential_requirement(sentence_text):
            continue

        sequence_no = len(extracted) + 1
        record = {
            "requirement_id": f"REQ-{sequence_no:04d}",
            "text": sentence_text,
            "categories": ";".join(matched_categories),
            "article": article,  # inherited from the preceding article reference
            "source_document": source_doc_name,
            "status": "not_checked",
            "evidence": "",
        }
        extracted.append(record)

    return extracted

# Extract requirement records from the regulatory sentences; the uploaded
# file name is stored in each record as its source document.
requirements = build_compliance_requirements(reg_sentences, reg_article_labels, reg_filename)
print(f"\nExtracted {len(requirements)} compliance requirements from regulatory document.")

# One row per requirement. With an empty `requirements` list this frame has
# no columns at all.
checklist_df = pd.DataFrame(requirements)
# The checklist preview is intentionally omitted here; inspect checklist_df.head() if needed.

# ============================================================
# 6. FILTERING MODES
# ============================================================
# Choose ONE filtering strategy by changing FILTER_MODE:
#   "articles"   -> only specific GDPR articles
#   "categories" -> only security-related categories
#   "none"       -> use all extracted requirements
FILTER_MODE = "none"    # <-- change to "articles", "categories", or "none"

TARGET_ARTICLES = ["Article 25", "Article 32", "Article 33", "Article 34"]
# Non-capturing group: Series.str.contains emits a UserWarning when the
# pattern contains capture groups.
SECURITY_CATEGORY_REGEX = r"(?:security_measures|breach_notification)"

print(f"\nFiltering mode: {FILTER_MODE}")
print(f"Requirements BEFORE filtering: {len(checklist_df)}")

if checklist_df.empty:
    # Nothing was extracted, so there may be no "article"/"categories"
    # columns to filter on; skip filtering instead of raising KeyError.
    filtered_df = checklist_df
elif FILTER_MODE == "articles":
    # Reduce labels such as "Article 32(1)" or "Article32" to "Article 32"
    # before comparing, so they match the plain entries in TARGET_ARTICLES.
    article_base = checklist_df["article"].str.replace(
        r"^Article\s*(\d+(?:[a-z](?![A-Za-z]))?).*$", r"Article \1", regex=True
    )
    mask = article_base.isin(TARGET_ARTICLES)
    filtered_df = checklist_df[mask]
elif FILTER_MODE == "categories":
    mask = checklist_df["categories"].str.contains(SECURITY_CATEGORY_REGEX, na=False)
    filtered_df = checklist_df[mask]
else:
    if FILTER_MODE != "none":
        # A misspelt mode used to be treated as "none" without any notice.
        print(f"[WARN] Unknown FILTER_MODE {FILTER_MODE!r}; using all requirements.")
    filtered_df = checklist_df

print(f"Requirements AFTER filtering : {len(filtered_df)}")

# If filtering accidentally removed everything, fall back to ALL
if len(filtered_df) == 0:
    print("\n[WARN] Filter removed all requirements. Falling back to ALL requirements.")
    filtered_df = checklist_df

# Plain list of dicts, used for encoding and evaluation below.
filtered_requirements = filtered_df.to_dict(orient="records")

# ============================================================
# 7. BERT MODEL & EMBEDDINGS
# ============================================================
# Despite the message, nothing is loaded at this point: `model` is simply
# another name for the `bert_model` object created in the model-loading section.
print("\nLoading BERT sentence embedding model (SentenceTransformers)...")
# Informational only; not passed to any call in the visible code.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small & fast
model = bert_model  # reuse loaded model

def encode_sentences(sentences, batch_size=64):
    """Embed ``sentences`` with the module-level ``model``, batch by batch.

    Args:
        sentences: List of strings to embed.
        batch_size: Number of sentences passed to ``model.encode`` per call.

    Returns:
        A 2-D NumPy array with one row per sentence. For empty input the
        array has zero rows and as many columns as the model reports for its
        embedding size.
    """
    if not sentences:
        # Ask the model for its embedding width rather than assuming it; 384
        # (the value previously hard-coded here) is kept as a fallback in
        # case the model does not report a dimension.
        dim = model.get_sentence_embedding_dimension() or 384
        return np.zeros((0, dim))

    batches = [
        model.encode(
            sentences[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=False,  # tqdm below already reports progress
        )
        for start in tqdm(range(0, len(sentences), batch_size), desc="Encoding batches")
    ]
    return np.vstack(batches)

# Encode requirement texts and policy sentences
# Texts to embed: the filtered requirement sentences and every policy sentence.
req_texts = [requirement["text"] for requirement in filtered_requirements]
pol_texts = policy_sentences

print("\nEncoding regulatory requirements with BERT...")
req_embeddings = encode_sentences(req_texts)

print("Encoding company policy sentences with BERT...")
pol_embeddings = encode_sentences(pol_texts)

# Pairwise cosine similarity; if either side is empty, use an all-zero
# matrix of the matching shape instead of calling cosine_similarity.
sim_matrix = (
    cosine_similarity(req_embeddings, pol_embeddings)
    if len(req_embeddings) > 0 and len(pol_embeddings) > 0
    else np.zeros((len(req_embeddings), len(pol_embeddings)))
)

print(f"\nSimilarity matrix shape (requirements x policy_sentences): {sim_matrix.shape}")

# ============================================================
# 8. EVALUATE COVERAGE & OVERALL SCORE
# ============================================================
def evaluate_policy_coverage(requirements, policy_sentences, sim_matrix,
                             high_thr=0.6, mid_thr=0.4):
    """Score each requirement against its closest policy sentence.

    Args:
        requirements: List of requirement dicts; becomes the base DataFrame.
        policy_sentences: Policy sentences, indexed like the columns of
            ``sim_matrix``.
        sim_matrix: Array of similarities, requirements x policy sentences.
        high_thr: Minimum best score for "compliant".
        mid_thr: Minimum best score for "partially_compliant".

    Returns:
        A DataFrame of the requirements with the columns best_policy_match,
        similarity_score, status and evidence (a copy of best_policy_match)
        added or overwritten. When the matrix has no columns, every
        requirement gets an empty match, a score of 0.0 and "non_compliant".
    """
    def label_for(similarity):
        # Thresholds are inclusive lower bounds.
        if similarity >= high_thr:
            return "compliant"
        if similarity >= mid_thr:
            return "partially_compliant"
        return "non_compliant"

    matches, scores, labels = [], [], []

    for row_idx, _ in enumerate(requirements):
        if sim_matrix.shape[1] != 0:
            row = sim_matrix[row_idx]
            top = row.argmax()
            top_score = float(row[top])
            matches.append(policy_sentences[top])
            scores.append(top_score)
            labels.append(label_for(top_score))
        else:
            # No policy sentences to compare against.
            matches.append("")
            scores.append(0.0)
            labels.append("non_compliant")

    report = pd.DataFrame(requirements)
    report["best_policy_match"] = matches
    report["similarity_score"] = scores
    report["status"] = labels
    report["evidence"] = report["best_policy_match"]
    return report

evaluated_df = evaluate_policy_coverage(filtered_requirements, policy_sentences, sim_matrix)

# Count rows per status. The overall score weights a partial match at 0.7
# of a full one: score = (C + 0.7 * P) / T * 100
status_col = evaluated_df["status"]
total = len(evaluated_df)
num_c = (status_col == "compliant").sum()
num_p = (status_col == "partially_compliant").sum()
num_n = (status_col == "non_compliant").sum()

score = (num_c + 0.7 * num_p) / total * 100.0 if total > 0 else 0.0

print("\n==================== OVERALL COMPLIANCE SUMMARY ====================")
print(f"Regulatory source: {reg_filename}")
print(f"Company policy:   {policy_filename}")
print(f"Filtering mode:   {FILTER_MODE}")
if FILTER_MODE == "articles":
    print(f"Target articles:  {TARGET_ARTICLES}")
elif FILTER_MODE == "categories":
    print(f"Target categories regex: {SECURITY_CATEGORY_REGEX}")
print(f"Total requirements considered: {total}")
if total == 0:
    print("  [No requirements after filtering; score is 0.]")
else:
    print(f"  Compliant            : {num_c} ({num_c/total*100:.1f}%)")
    print(f"  Partially compliant  : {num_p} ({num_p/total*100:.1f}%)")
    print(f"  Non-compliant        : {num_n} ({num_n/total*100:.1f}%)")
print("---------------------------------------------------------------------")
print("Scoring formula: score = (C + 0.7 * P) / T * 100")
print(f"Overall compliance score (0–100): {score:.2f}")
print("=====================================================================\n")

print("Evaluated compliance preview (first 5 rows):")
print(evaluated_df.head())

# ============================================================
# 9. SAVE RESULTS (FILTERED CHECKLIST + BERT EVALUATION)
# ============================================================
# Output file names, written relative to the current working directory.
checklist_filtered_csv = "compliance_checklist_filtered.csv"
checklist_filtered_json = "compliance_checklist_filtered.json"
evaluated_csv = "compliance_evaluation_bert_filtered.csv"
evaluated_json = "compliance_evaluation_bert_filtered.json"

# Requirements that survived filtering (before similarity scoring).
filtered_df.to_csv(checklist_filtered_csv, index=False)
filtered_df.to_json(checklist_filtered_json, orient="records", indent=2)

# The same requirements with best match, similarity score and status added.
evaluated_df.to_csv(evaluated_csv, index=False)
evaluated_df.to_json(evaluated_json, orient="records", indent=2)

print(f"\nSaved filtered checklist to: {checklist_filtered_csv} / {checklist_filtered_json}")
print(f"Saved evaluated compliance report to: {evaluated_csv} / {evaluated_json}")

# If you want auto-download, uncomment:
# files.download(checklist_filtered_csv)
# files.download(checklist_filtered_json)
# files.download(evaluated_csv)
# files.download(evaluated_json)

=== STEP 1: Upload REGULATORY (e.g., GDPR) TEXT FILE ===
Upload REGULATORY document (.txt) e.g., GDPR


Saving CELEX_32016R0679_EN_TXT.txt to CELEX_32016R0679_EN_TXT (1).txt

Loaded file: CELEX_32016R0679_EN_TXT (1).txt (362155 characters)

=== STEP 2: Upload COMPANY SECURITY / PRIVACY POLICY TEXT FILE ===
Upload COMPANY SECURITY / PRIVACY POLICY (.txt)


Saving google_privacy_policy_en.txt to google_privacy_policy_en.txt

Loaded file: google_privacy_policy_en.txt (53082 characters)

Regulatory document: 1574 sentences
Company policy document: 336 sentences

Sample classification from regulatory document:
  SENT: ﻿4.5.2016 EN Official Journal of the European Union L 119/1 I (Legislative acts) REGULATIONS REGULATION (EU) 2016/679 OF ...
    categories: ['data_protection']
    is_requirement: False

Extracted 185 compliance requirements from regulatory document.

Filtering mode: none
Requirements BEFORE filtering: 185
Requirements AFTER filtering : 185

Loading BERT sentence embedding model (SentenceTransformers)...

Encoding regulatory requirements with BERT...


Encoding batches:   0%|          | 0/3 [00:00<?, ?it/s]

Encoding company policy sentences with BERT...


Encoding batches:   0%|          | 0/6 [00:00<?, ?it/s]


Similarity matrix shape (requirements x policy_sentences): (185, 336)

Regulatory source: CELEX_32016R0679_EN_TXT (1).txt
Company policy:   google_privacy_policy_en.txt
Filtering mode:   none
Total requirements considered: 185
  Compliant            : 19 (10.3%)
  Partially compliant  : 135 (73.0%)
  Non-compliant        : 31 (16.8%)
---------------------------------------------------------------------
Scoring formula: score = (C + 0.7 * P) / T * 100
Overall compliance score (0–100): 61.35

Evaluated compliance preview (first 5 rows):
  requirement_id                                               text  \
0       REQ-0001  The right to the protection of personal data i...   
1       REQ-0002  In order to ensure that natural persons are no...   
2       REQ-0003  In order to ensure that the personal data are ...   
3       REQ-0004  Every reasonable step should be taken to ensur...   
4       REQ-0005  In order to ensure that consent is freely give...   

                      categorie