<a href="https://colab.research.google.com/github/menonkrishna57/DomesticSafetyNet/blob/main/DomesticSafetyNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow transformers pandas scikit-learn spacy
!python -m spacy download en_core_web_sm

In [None]:
import json
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import spacy

# Load the small English spaCy pipeline into the global `nlp`.
# `anonymize_text` checks for that global and skips its NER step when it is missing.
print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm")
except IOError:
    # `nlp` stays undefined in this case.
    print("Error loading 'en_core_web_sm'. Please ensure it's installed.")
else:
    print("spaCy model 'en_core_web_sm' loaded successfully.")


# Marker tokens written by `anonymize_text`, e.g. "[PHONE]" or "[LICENSE_PLATE]".
_PLACEHOLDER_RE = re.compile(r'\[[A-Z_]+\]')

# Ordered (pattern, placeholder) pairs, applied top to bottom.
_PII_PATTERNS = [
    # Mobile numbers: optional "+91" prefix, optional leading 0, then 10 digits starting 7/8/9.
    (re.compile(r'(\+91[\-\s]?)?[0]?[789]\d{9}'), '[PHONE]'),
    # Three space-separated groups of four digits. Must run before the generic phone
    # pattern below, whose 7-digit alternative would otherwise consume part of it.
    (re.compile(r'\b\d{4}\s\d{4}\s\d{4}\b'), '[AADHAAR]'),
    # Generic 10-digit / "(xxx) xxx-xxxx" / 7-digit phone shapes.
    (re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'), '[PHONE]'),
    # Any non-space run containing "@", but stop before trailing sentence punctuation
    # so "a@b.com." keeps its final period instead of losing it to the placeholder.
    (re.compile(r'\S+@\S+?(?=[.,;:!?)\]]*(?:\s|$))'), '[EMAIL]'),
    (re.compile(r'[A-Z]{2}[0-9]{1,2}[A-Z]{1,2}[0-9]{1,4}'), '[LICENSE_PLATE]'),
    # House number, up to four name tokens, then a street-type word that is a token of
    # its own. The bound and the whole-token suffix stop the pattern from redacting
    # ordinary sentences such as "3 times and I have to leave" ("leave" ends in "ave").
    # Common nouns like "road" after a number can still match; this is a heuristic.
    (re.compile(
        r'\b\d{1,5}\s(?:[\w.-]+[\s,]+){0,4}'
        r'(?:Street|St|Road|Rd|Avenue|Ave|Drive|Dr|Lane|Ln|Court|Ct|Colony|Nagar|Vihar)\b',
        re.IGNORECASE), '[ADDRESS]'),
    # Unit keyword followed by ONE identifier that contains a digit ("4B", "10", "A-12").
    # Requiring the digit keeps phrases like "fell flat on the floor" intact.
    (re.compile(
        r'\b(?:Apartment|Apt\.?|Flat|Flt\.|House No\.?|H\.?No\.?|C-|B-)\s?(?:No\.?\s?)?[\w-]*\d[\w-]*',
        re.IGNORECASE), '[ADDRESS]'),
    # Five capitals, four digits, one capital.
    (re.compile(r'[A-Z]{5}[0-9]{4}[A-Z]{1}'), '[PAN]'),
]


def anonymize_text(text):
    """Replace personally identifying substrings of ``text`` with bracketed placeholders.

    Two passes are applied:

    1. The regular expressions in ``_PII_PATTERNS`` (phone numbers, Aadhaar-style
       numbers, e-mail addresses, licence plates, street/unit addresses, PAN-style IDs).
    2. If a global ``nlp`` callable exists, its entities labelled PERSON, GPE, LOC or
       ORG are replaced by ``[LABEL]``. When ``nlp`` is not defined this pass is
       skipped and only the regex pass is applied.

    Args:
        text: The input string.

    Returns:
        The string with matches replaced by placeholders; case and whitespace of the
        remaining text are left untouched.
    """
    for pattern, placeholder in _PII_PATTERNS:
        text = pattern.sub(placeholder, text)

    if 'nlp' in globals():
        doc = nlp(text)
        # Spans of placeholders already present. An NER model can tag the uppercase
        # token inside one (e.g. "PHONE"); replacing it would yield "[[ORG]]".
        placeholder_spans = [m.span() for m in _PLACEHOLDER_RE.finditer(text)]
        # Walk entities back to front so earlier character offsets stay valid.
        for ent in reversed(doc.ents):
            if ent.label_ not in ("PERSON", "GPE", "LOC", "ORG"):
                continue
            if any(start <= ent.start_char and ent.end_char <= end
                   for start, end in placeholder_spans):
                continue
            text = text[:ent.start_char] + f"[{ent.label_}]" + text[ent.end_char:]

    return text


def anonymize_and_clean_for_bert(text):
    """Anonymize ``text`` and normalize it before tokenization.

    The text is passed through ``anonymize_text``, lower-cased (placeholders
    included), and every run of whitespace is collapsed to a single space with
    leading/trailing whitespace removed.
    """
    lowered = anonymize_text(text).lower()
    return re.sub(r'\s+', ' ', lowered).strip()

In [17]:
file_path = './dataset/synthetic_data_1000.json'

# Reset first so a `df` left over from an earlier run of this cell cannot be
# picked up silently when the file is missing.
df = None

print(f"Loading data from {file_path}...")
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    print(f"Successfully loaded {len(df)} entries.")
except FileNotFoundError:
    print(f"ERROR: File not found at {file_path}. Please upload the file.")


if df is not None:
    # Each record's `labels` entry is a dict; pull out its risk level.
    df['risk_level'] = df['labels'].apply(lambda x: x.get('risk_level'))
    # Build a fresh frame rather than dropping rows in place on a column slice.
    df = df[['text', 'risk_level']].dropna()

    print("\nAnonymizing and cleaning text data (using advanced NER method)...")
    df['processed_text'] = df['text'].apply(anonymize_and_clean_for_bert)

    # Label ids follow the order in which risk levels first appear in the data.
    possible_labels = df.risk_level.unique()
    label_dict = {label: i for i, label in enumerate(possible_labels)}
    id_to_label = {i: label for label, i in label_dict.items()}

    print(f"\nLabel mapping created: {label_dict}")

    # `.map` is a plain lookup; `.replace` with a str->int dict emits a pandas
    # warning about dtype downcasting.
    df['label_id'] = df['risk_level'].map(label_dict)

    X = df['processed_text'].tolist()
    y = df['label_id'].tolist()

    # Stratified 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"\nData split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Loading data from ./dataset/synthetic_data_1000.json...
Successfully loaded 999 entries.

Anonymizing and cleaning text data (using advanced NER method)...

Label mapping created: {'High-Urgency': 0, 'Medium': 1, 'Immediate-Threat': 2, 'Low': 3}

Data split into 799 training samples and 200 testing samples.


  df['label_id'] = df['risk_level'].replace(label_dict)


In [None]:
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizing training and testing data...")
# Both splits use identical settings: truncation and padding on, 128-token cap.
# The generator is consumed in order, so the training split is tokenized first.
train_encodings, test_encodings = (
    tokenizer(split, truncation=True, padding=True, max_length=128)
    for split in (X_train, X_test)
)

print("Tokenization complete.")

In [None]:
# Pair each split's encodings with its labels and batch by 16.
# Only the training split is shuffled (buffer = full training set).
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(len(X_train))
    .batch(16)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(test_encodings), y_test))
    .batch(16)
)

print("TensorFlow Datasets created.")

In [None]:
print("Loading pre-trained DistilBERT model...")
# Pass the label mapping to the model config as well as the class count. Otherwise
# the id -> risk-level mapping exists only in this kernel's `id_to_label`, and a
# saved model cannot be interpreted in a fresh session.
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(possible_labels),
    id2label=id_to_label,
    label2id=label_dict,
    use_safetensors=False
)

# The model outputs raw logits, hence from_logits=True on the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

In [None]:
# Attach the optimizer, loss and metric created in the previous cell.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

print("Starting model fine-tuning...")
# test_dataset doubles as the validation set, so the per-epoch validation metrics
# are computed on the same samples that the evaluation cell scores later.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=15)

model.save_pretrained("./FineTunedBurt_model")
print("Model fine-tuning complete and saved to ./FineTunedBurt_model.")

In [None]:
# Ensure the model directory exists, then move weight and config files from the
# working directory into it.
# NOTE(review): the training cell already calls
# model.save_pretrained("./FineTunedBurt_model"); presumably these moves are only
# needed when tf_model.h5 / config.json were uploaded to the working directory by
# hand. Confirm, since `mv` reports an error when the files are not there.
!mkdir -p FineTunedBurt_model
!mv tf_model.h5 FineTunedBurt_model/
!mv config.json FineTunedBurt_model/

In [None]:
# Reload the fine-tuned weights from disk only (no hub lookup).
print("Loading the saved model...")
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
    "./FineTunedBurt_model",
    local_files_only=True,
    num_labels=len(possible_labels)
)
print("Model loaded successfully.")

# Smoke test: push one sentence through the same preprocessing used for training.
new_sentence_to_predict = "This is a test sentence for prediction."
cleaned_sentence_to_predict = anonymize_and_clean_for_bert(new_sentence_to_predict)

inputs_to_predict = tokenizer(
    cleaned_sentence_to_predict,
    return_tensors="tf",
    truncation=True,
    padding=True,
    max_length=128,
)

outputs_predict = loaded_model(inputs_to_predict)
logits_predict = outputs_predict.logits

# Highest-scoring class id of the single input, mapped back to its risk-level name.
predicted_id_predict = tf.argmax(logits_predict, axis=-1).numpy()[0]
predicted_label_predict = id_to_label[predicted_id_predict]

print(
    f"\nOriginal Sentence: {new_sentence_to_predict}",
    f"Cleaned Sentence: {cleaned_sentence_to_predict}",
    f"Predicted Risk Level: {predicted_label_predict}",
    sep="\n",
)

In [None]:
print("Evaluating model on the test set...")
# Scores the in-memory `model` (not `loaded_model`) on the batched test split.
test_predictions = model.predict(test_dataset)
logits = test_predictions.logits

# Class id with the highest logit for every test sample.
predicted_ids = tf.argmax(logits, axis=-1).numpy()

print("\n--- Model Evaluation Results ---")
accuracy = accuracy_score(y_test, predicted_ids)
print(f"Overall Model Accuracy: {accuracy * 100:.2f}%\n")

# label_dict was built with enumerate, so its key order matches ids 0..n-1.
report = classification_report(y_test, predicted_ids, target_names=label_dict.keys())
print("Detailed Classification Report:")
print(report)

In [None]:
# Example message containing a name, PAN-style id, phone number and e-mail, to show
# both the anonymizer and the classifier on one input.
new_sentence = "He is threatening me again and following my car. His name is Yash . His Pan is GTUDS3456T His number is 9876543210 and email is yash@email.com. I'm scared he has a weapon."
print(f"Original Sentence:\n{new_sentence}\n")

cleaned_sentence = anonymize_and_clean_for_bert(new_sentence)
print(f"Cleaned Sentence:\n{cleaned_sentence}\n")

inputs = tokenizer(
    cleaned_sentence,
    return_tensors="tf",
    truncation=True,
    padding=True,
    max_length=128,
)

# Note: `logits` is rebound here; it previously held the test-set logits.
outputs = loaded_model(inputs)
logits = outputs.logits

predicted_id = tf.argmax(logits, axis=-1).numpy()[0]
predicted_label = id_to_label[predicted_id]

print(f"Predicted Risk Level: {predicted_label}")

In [None]:
print("Testing the anonymize_and_clean_for_bert function with various inputs:")

# Hand-written inputs covering each PII pattern, inputs that already contain
# placeholders, and a sentence with no PII at all.
test_cases = [
    "My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890.",
    "Call me at +91 9876543210 or email me at test@test.org. My license plate is AB12CD3456.",
    "The meeting is at 123 Main Street, Apartment 4B.",
    "My Aadhaar number is 1234 5678 9012.",
    "This sentence has no PII.",
    "Mixed case and extra spaces:  My NAME is Jane Smith.  Email: Jane.Smith@EXAMPLE.ORG   Phone: (999) 888-7777 .",
    "Multiple phone numbers: 111-222-3333 and 444.555.6666.",
    "Address with different formats: House No. 10, Gandhi Nagar and C-5, Vihar.",
    "Sentence with a person name and location: [PERSON] is in [GPE].",
    "Sentence with a person name, location and organization: [PERSON] from [ORG] visited [LOC].",
    "he is threatening me again, his name is Aditya and he lives in Chandigarh and his car number is MP25CE3456 and his phone number is [PHONE]",
    "Bring your aadhar card with you to Andheri tomorrow."
]

# For every case: show the anonymized text, then the risk level the reloaded model
# assigns to it. (The original loop also re-assigned an unused `new_sentence`
# string on each iteration; that dead statement is removed.)
for i, test_text in enumerate(test_cases):
    anonymized_cleaned_text = anonymize_and_clean_for_bert(test_text)
    print(f"\n--- Test Case {i+1} ---")
    print(f"Original: {test_text}")
    print(f"Processed: {anonymized_cleaned_text}")

    inputs = tokenizer(anonymized_cleaned_text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    outputs = loaded_model(inputs)
    logits = outputs.logits
    predicted_id = tf.argmax(logits, axis=-1).numpy()[0]
    predicted_label = id_to_label[predicted_id]
    print(f"Predicted Risk Level: {predicted_label}")

In [None]:
!pip install gradio --quiet

import gradio as gr
import tensorflow as tf
import numpy as np
from datetime import datetime

# In-memory log of predictions made through the UI; `predict_risk_level` appends to it.
# NOTE: this rebinds `history`, which the training cell used for the model.fit result.
history = list()

# Display colour (hex) for each risk label shown in the UI.
risk_colors = dict(zip(
    ("Low", "Medium", "High-Urgency", "Immediate-Threat"),
    ("#4CAF50", "#FFC107", "#FF7043", "#F44336"),
))

def _format_history(entries, limit=5):
    """Render the last ``limit`` history entries, one per line, for the history textbox."""
    return "\n".join(
        f"🕒 {h['Time']} | {h['Risk']} ({h['Confidence']}) → {h['Text']}"
        for h in entries[-limit:]
    )


def predict_risk_level(user_text):
    """Classify one user message and build the two values the UI displays.

    The message is anonymized and cleaned, tokenized, scored by ``loaded_model``,
    and the result is appended to the module-level ``history`` list.

    Args:
        user_text: Raw text from the input box. ``None`` or blank input is rejected.

    Returns:
        A ``(html, history_text)`` tuple, one value per output component wired to
        the button. For blank input the HTML holds an error message, nothing is
        appended to ``history``, and the existing history is shown.
    """
    # Function-scope import: the notebook's import cells do not bring in `html`.
    from html import escape

    if not user_text or not user_text.strip():
        # Must still return two values: the click handler is bound to two outputs.
        error_html = "<p style='color:#F44336;'><b>Error:</b> Please enter some text.</p>"
        return error_html, _format_history(history)

    processed_text = anonymize_and_clean_for_bert(user_text)
    inputs = tokenizer(processed_text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    outputs = loaded_model(inputs)
    # Softmax over the class logits of the single input row.
    probs = tf.nn.softmax(outputs.logits, axis=-1).numpy()[0]
    pred_id = np.argmax(probs)
    pred_label = id_to_label[pred_id]
    confidence = float(np.max(probs))
    timestamp = datetime.now().strftime("%H:%M:%S")

    # Only the anonymized text is kept in the log, never the raw input.
    history.append({
        "Time": timestamp,
        "Risk": pred_label,
        "Confidence": f"{confidence:.2f}",
        "Text": processed_text
    })

    color = risk_colors.get(pred_label, "#9E9E9E")
    bar_html = f"""
    <div style='width:100%; background:#e0e0e0; border-radius:8px;'>
        <div style='width:{confidence*100:.1f}%; background:{color}; padding:4px; border-radius:8px; text-align:center; color:white;'>
            {confidence*100:.1f}%
        </div>
    </div>
    """

    # The text originates from the user, so escape it before embedding it in HTML.
    safe_text = escape(processed_text)
    html_result = f"""
    <div style='font-family:monospace;'>
        <p><b>Predicted Risk:</b> <span style='color:{color}; font-weight:600;'>{pred_label}</span></p>
        <p><b>Confidence:</b></p>{bar_html}
        <hr>
        <p><b>Anonymized Input:</b> {safe_text}</p>
    </div>
    """
    hist_table = _format_history(history)
    return html_result, hist_table

# Single-page UI: text box in, HTML result plus a short prediction log out.
# The emoji in the title and button label were stored mis-encoded and are restored.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("<h2 style='text-align:center'>🛡️ Domestic Safety Risk Level Classifier</h2>")
    gr.Markdown("Predicts message **risk level**  All inputs are automatically anonymized.")
    txt = gr.Textbox(lines=4, placeholder="Type your message here...")
    res = gr.HTML()
    hist = gr.Textbox(label="Recent Predictions (last 5)", interactive=False)
    btn = gr.Button("🔍 Analyze", variant="primary")
    # predict_risk_level must return one value per component listed in `outputs`.
    btn.click(predict_risk_level, inputs=txt, outputs=[res, hist])

# NOTE(review): share=True requests a public Gradio link; confirm that exposing this
# app outside the notebook host is intended, given the kind of text users enter.
demo.launch(share=True)
