<a href="https://colab.research.google.com/github/olorunfemibabalola/Bias-Detection-NLP/blob/main/Inclusive_HR_Policy_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# PROJECT: PolicyGuard AI - Dual-Mode Compliance Auditor
# UNIT: Language models and NLP (576757)
# AUTHOR:
# ==============================================================================

In [None]:
# 1. ENVIRONMENT SETUP & INSTALLATION
# ------------------------------------------------------------------------------
# Installs (or upgrades, because of -U) every third-party package imported below:
#   - transformers / accelerate: model loading; accelerate backs device_map="auto"
#   - bitsandbytes: 4-bit quantization (running large models on free GPUs)
#   - gradio: the chat UI
#   - pymupdf4llm: PDF-to-Markdown extraction
# NOTE: versions are not pinned, so each run pulls whatever is latest.
# NOTE: the leading "!" is IPython/Colab shell syntax; this cell only works
# inside a notebook, not when the file is executed as a plain Python script.
print("‚è≥ Installing SOTA libraries... (This takes ~1 minute)")
!pip install -q -U transformers accelerate bitsandbytes gradio pymupdf4llm

In [None]:
import torch
import gradio as gr
import pymupdf4llm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# 2. MODEL LOADING (Qwen 2.5 - SOTA Ungated Model)
# ------------------------------------------------------------------------------
# We use Qwen 2.5 7B Instruct. The author's rationale is that it performs
# strongly on coding/logic benchmarks and follows strict system instructions
# well. NOTE(review): the comparisons to GPT-4 / Llama 3.1 made here originally
# are not verified anywhere in this notebook -- confirm before citing them.
# It is Apache 2.0 licensed, so you don't need to wait for access approval.

# Hugging Face Hub identifier of the checkpoint loaded below.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

print(f"üöÄ Loading {MODEL_ID} with 4-bit quantization...")

# Quantization settings handed to from_pretrained(): weights are loaded in
# 4-bit NF4 format, while computation is carried out in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Defines the module-level `tokenizer` and `model` used by run_inference().
# Any failure is only printed, not re-raised: in that case `tokenizer` and/or
# `model` stay undefined and later cells will fail with a NameError.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto"  # let the library decide where to place the weights
    )
    print("‚úÖ Model loaded successfully on GPU!")
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("Tip: Ensure your Runtime is set to T4 GPU.")

In [None]:
# 3. STRICT SYSTEM PROMPTS (The "Brain" of the Agent)
# ------------------------------------------------------------------------------
# These prompts act as the "Guardrails" to ensure professional behavior.
# Each one is sent as the "system" message of a conversation:
#   - AUDITOR_PROMPT: used when a document is uploaded (audit mode).
#   - CHATBOT_PROMPT: used for plain text questions (chat mode).
# The prompt text is runtime data; editing it changes the model's behavior.

# System prompt for document audits: asks for per-sentence findings with a
# severity score and a neutral rewrite, or a fixed pass message if clean.
AUDITOR_PROMPT = """
You are a Senior HR Compliance Officer. Your job is to audit corporate policies for social bias.
STRICT RULES:
1.  Analyze the text for THREE types of bias: Gender, Race/Ethnicity, and Ageism.
2.  The text must contain very obvious and noticeable bias content before flagging it as bias.
3.  Do NOT summarize the document. List specific problematic sentences.
4.  For each finding, assign a SEVERITY SCORE (1-10) and provide a NEUTRAL REWRITE.
5.  If the text is safe, output: "‚úÖ COMPLIANCE PASS: No bias detected."
"""

# System prompt for chat: answer HR questions, but refuse biased requests and
# explain the refusal with reference to the UK Equality Act 2010.
CHATBOT_PROMPT = """
You are a helpful HR Policy Assistant.
1. Answer user questions about HR policies concisely.
2. SILENT SENTINEL: Continuously monitor the user's input.
   - If the user asks something biased (e.g., "How to hire only young people?"), REFUSE to answer and explain why it violates the UK Equality Act 2010.
   - If the input is neutral, answer normally.
"""


In [None]:


# 4. LOGIC ENGINE (Processing & Inference)
# ------------------------------------------------------------------------------
def run_inference(messages, max_tokens=1024):
    """Generate a completion for a chat transcript with the module-level model.

    Relies on the globals ``tokenizer`` and ``model`` created by the
    model-loading cell.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts. They are
            rendered to a single prompt string via the tokenizer's chat
            template, with the assistant generation prompt appended.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        A list of decoded completions (special tokens stripped), one per
        submitted prompt. Exactly one prompt is submitted, so the list holds
        a single string.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_tokens,
        # temperature/top_p are only honoured when sampling is enabled.
        # Request it explicitly so the low-temperature setting does not
        # silently depend on the checkpoint's default generation config.
        do_sample=True,
        temperature=0.2,  # Low temperature = strict, professional logic
        top_p=0.9,
    )

    # generate() returns prompt + completion for each row; drop the prompt
    # tokens so only the newly generated part is decoded.
    completions = [
        full_row[len(prompt_row):]
        for prompt_row, full_row in zip(model_inputs.input_ids, output_ids)
    ]
    return tokenizer.batch_decode(completions, skip_special_tokens=True)

# Maximum number of characters of extracted document text sent to the model.
_MAX_AUDIT_CHARS = 6000

# Inputs that end the conversation without calling the model.
_EXIT_COMMANDS = frozenset({"quit", "exit", "end conversation", "stop"})

_CONVERSATION_ENDED = (
    "Conversation ended. Feel free to type a new message to start a fresh "
    "interaction or use the 'Clear' button to reset the chat."
)


def _extract_text(content):
    """Return the plain-text portion of a history entry's content, or None."""
    if isinstance(content, str):
        return content or None
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) and text else None
    if isinstance(content, (list, tuple)):
        # Only structured (dict) parts are read. Bare strings inside a
        # sequence are presumably file paths rather than chat text, so they
        # are skipped -- TODO confirm against the installed Gradio version.
        parts = [_extract_text(part) for part in content if isinstance(part, dict)]
        return "\n".join(p for p in parts if p) or None
    return None


def _history_to_messages(history):
    """Convert chat history into a list of role/content dicts with text only."""
    converted = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Role/content style entry: {"role": "user" | "assistant", "content": ...}
            role = turn.get("role")
            text = _extract_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                converted.append({"role": role, "content": text})
        elif isinstance(turn, (list, tuple)):
            # Pair style entry: (user_message, assistant_message)
            for role, content in zip(("user", "assistant"), turn):
                text = _extract_text(content)
                if text:
                    converted.append({"role": role, "content": text})
        # Anything else is an unknown shape and is skipped.
    return converted


def _as_reply_text(result):
    """Reduce an inference result (string or list of strings) to one string."""
    if isinstance(result, str):
        return result
    if isinstance(result, (list, tuple)):
        return result[0] if result else ""
    return str(result)


def policy_guard_logic(message, history):
    """Chat callback: audit an uploaded PDF, or answer an HR question.

    Args:
        message: Normally a dict with ``"text"`` (the typed input) and
            ``"files"`` (a list of uploaded file paths). A plain string is
            also accepted and treated as text with no files.
        history: Previous turns of the conversation. Both pair-style entries
            ``(user, assistant)`` and role/content dict entries are
            understood; entries of any other shape are ignored.

    Returns:
        str: The audit report when a file is attached, an error message if
        the PDF cannot be read, a fixed notice when the user types an exit
        command, or otherwise the chatbot's answer.
    """
    if isinstance(message, dict):
        user_text = message.get("text") or ""
        files = message.get("files") or []
    else:
        user_text = "" if message is None else str(message)
        files = []

    # --- PATH A: DOCUMENT AUDIT MODE ---
    if files:
        # Only the first uploaded file is audited.
        pdf_path = files[0]
        try:
            # pymupdf4llm converts the PDF to Markdown text for the model.
            doc_content = pymupdf4llm.to_markdown(pdf_path)
        except Exception as e:
            return f"‚ùå Error reading PDF: {str(e)}"

        # The document is truncated so the prompt stays bounded.
        audit_messages = [
            {"role": "system", "content": AUDITOR_PROMPT},
            {
                "role": "user",
                "content": f"DOCUMENT TO AUDIT:\n{doc_content[:_MAX_AUDIT_CHARS]}\n\nAUDIT REPORT:",
            },
        ]
        return _as_reply_text(run_inference(audit_messages))

    # --- PATH B: CHAT MODE ---
    if user_text.lower().strip() in _EXIT_COMMANDS:
        return _CONVERSATION_ENDED

    chat_messages = [{"role": "system", "content": CHATBOT_PROMPT}]
    chat_messages.extend(_history_to_messages(history))
    chat_messages.append({"role": "user", "content": user_text})

    # run_inference yields a list of decoded strings; the UI needs one string.
    return _as_reply_text(run_inference(chat_messages))


In [None]:
# 5. UI LAUNCHER (Gradio)
# ------------------------------------------------------------------------------
# We use multimodal=True to allow text AND file uploads in the same box.
# With it, policy_guard_logic receives each message as a dict with "text" and
# "files" keys.
demo = gr.ChatInterface(
    fn=policy_guard_logic,
    multimodal=True,
    title="üõ°Ô∏è PolicyGuard AI: Enterprise Bias Auditor",
    description="""
    **Instructions:**
    1. **Chat Mode:** Ask HR questions. The bot will flag any bias in your prompt.
    2. **Audit Mode:** Click the '+' button to upload a PDF policy. The bot will generate a Compliance Report.
    """,
    # Clickable sample inputs. Neither attaches a file, so both go through
    # chat mode -- including the "Audit this policy document." example.
    examples=[
        {"text": "Is it okay to require 'high energy' in a job ad?", "files":[]},
        {"text": "Audit this policy document.", "files":[]}
    ],
    # Optional theme, currently disabled:
    #theme="soft"
)

# In a notebook __name__ is "__main__", so the app launches when the cell runs.
# share=True requests a public link (see the message printed below);
# debug=True is passed through to Gradio's launch().
if __name__ == "__main__":
    print("‚úÖ System Ready! Click the public link below to test.")
    demo.launch(debug=True, share=True)
