In [1]:
!pip install tensorflow transformers pandas scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import json
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import spacy

print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model 'en_core_web_sm' loaded successfully.")
except IOError:
    print("Error loading 'en_core_web_sm'. Please ensure it's installed.")


def anonymize_text(text):
    text = re.sub(r'(\+91[\-\s]?)?[0]?[789]\d{9}', '[PHONE]', text)
    text = re.sub(r'\b\d{4}\s\d{4}\s\d{4}\b', '[AADHAAR]', text)
    text = re.sub(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', '[PHONE]', text)
    text = re.sub(r'\S+@\S+', '[EMAIL]', text)
    text = re.sub(r'[A-Z]{2}[0-9]{1,2}[A-Z]{1,2}[0-9]{1,4}', '[LICENSE_PLATE]', text)
    text = re.sub(r'\b\d{1,5}\s[\w\s,.-]+(?:Street|St|Road|Rd|Avenue|Ave|Drive|Dr|Lane|Ln|Court|Ct|Colony|Nagar|Vihar)\b', '[ADDRESS]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:Apartment|Apt\.?|Flat|Flt\.|House No\.?|H\.?No\.?|C-|B-)\s?[\w\s\d-]+', '[ADDRESS]', text, flags=re.IGNORECASE)
    text = re.sub(r'[A-Z]{5}[0-9]{4}[A-Z]{1}', '[PAN]', text)

    if 'nlp' in globals():
        doc = nlp(text)
        new_text = text
        for ent in reversed(doc.ents):
            if ent.label_ in ["PERSON", "GPE", "LOC", "ORG"]:
                new_text = new_text[:ent.start_char] + f"[{ent.label_}]" + new_text[ent.end_char:]
        text = new_text

    return text


def anonymize_and_clean_for_bert(text):
    anonymized_text = anonymize_text(text)

    cleaned_text = anonymized_text.lower()
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

Loading spaCy model...
spaCy model 'en_core_web_sm' loaded successfully.


In [3]:
file_path = '/content/synthetic_data_1000.json'

print(f"Loading data from {file_path}...")
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    print(f"Successfully loaded {len(df)} entries.")
except FileNotFoundError:
    print(f"ERROR: File not found at {file_path}. Please upload the file.")


if 'df' in locals():
    df['risk_level'] = df['labels'].apply(lambda x: x.get('risk_level'))
    df = df[['text', 'risk_level']]
    df.dropna(inplace=True)

    print("\nAnonymizing and cleaning text data (using advanced NER method)...")
    df['processed_text'] = df['text'].apply(anonymize_and_clean_for_bert)

    possible_labels = df.risk_level.unique()
    label_dict = {label: i for i, label in enumerate(possible_labels)}
    id_to_label = {i: label for label, i in label_dict.items()}

    print(f"\nLabel mapping created: {label_dict}")

    df['label_id'] = df['risk_level'].replace(label_dict)

    X = df['processed_text'].tolist()
    y = df['label_id'].tolist()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"\nData split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Loading data from /content/synthetic_data_1000.json...
Successfully loaded 999 entries.

Anonymizing and cleaning text data (using advanced NER method)...

Label mapping created: {'High-Urgency': 0, 'Medium': 1, 'Immediate-Threat': 2, 'Low': 3}

Data split into 799 training samples and 200 testing samples.


  df['label_id'] = df['risk_level'].replace(label_dict)


In [4]:
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizing training and testing data...")
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128)

print("Tokenization complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenizing training and testing data...
Tokenization complete.


In [5]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(len(X_train)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

print("TensorFlow Datasets created.")

TensorFlow Datasets created.


In [6]:
print("Loading pre-trained DistilBERT model...")
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(possible_labels),
    use_safetensors=False
)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

Loading pre-trained DistilBERT model...


tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-

In [8]:
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
print("Starting model fine-tuning...")
history = model.fit(
    train_dataset,
    epochs=15,
    validation_data=test_dataset
)

model.save_pretrained("./FineTunedBurt_model")
print("Model fine-tuning complete and saved to ./FineTunedBurt_model.")

Starting model fine-tuning...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model fine-tuning complete and saved to ./FineTunedBurt_model.


In [9]:
!mkdir -p FineTunedBurt_model
!mv tf_model.h5 FineTunedBurt_model/
!mv config.json FineTunedBurt_model/

mv: cannot stat 'tf_model.h5': No such file or directory
mv: cannot stat 'config.json': No such file or directory


In [10]:
print("Loading the saved model...")
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(
    "./FineTunedBurt_model",
    local_files_only=True,
    num_labels=len(possible_labels)
)
print("Model loaded successfully.")

new_sentence_to_predict = "This is a test sentence for prediction."

cleaned_sentence_to_predict = anonymize_and_clean_for_bert(new_sentence_to_predict)

inputs_to_predict = tokenizer(cleaned_sentence_to_predict, return_tensors="tf", truncation=True, padding=True, max_length=128)

outputs_predict = loaded_model(inputs_to_predict)
logits_predict = outputs_predict.logits

predicted_id_predict = tf.argmax(logits_predict, axis=-1).numpy()[0]

predicted_label_predict = id_to_label[predicted_id_predict]

print(f"\nOriginal Sentence: {new_sentence_to_predict}")
print(f"Cleaned Sentence: {cleaned_sentence_to_predict}")
print(f"Predicted Risk Level: {predicted_label_predict}")

Loading the saved model...


Some layers from the model checkpoint at ./FineTunedBurt_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./FineTunedBurt_model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
TensorFlow and JAX classes are deprecated and will be removed in

Model loaded successfully.

Original Sentence: This is a test sentence for prediction.
Cleaned Sentence: this is a test sentence for prediction.
Predicted Risk Level: Low


In [11]:
print("Evaluating model on the test set...")
test_predictions = model.predict(test_dataset)
logits = test_predictions.logits

predicted_ids = tf.argmax(logits, axis=-1).numpy()

print("\n--- Model Evaluation Results ---")
accuracy = accuracy_score(y_test, predicted_ids)
print(f"Overall Model Accuracy: {accuracy * 100:.2f}%\n")

print("Detailed Classification Report:")
print(classification_report(y_test, predicted_ids, target_names=label_dict.keys()))

Evaluating model on the test set...

--- Model Evaluation Results ---
Overall Model Accuracy: 73.50%

Detailed Classification Report:
                  precision    recall  f1-score   support

    High-Urgency       0.72      0.71      0.72        73
          Medium       0.65      0.87      0.74        63
Immediate-Threat       0.89      0.68      0.77        37
             Low       1.00      0.56      0.71        27

        accuracy                           0.73       200
       macro avg       0.82      0.70      0.74       200
    weighted avg       0.77      0.73      0.73       200



In [15]:
new_sentence = "He is threatening me again and following my car. His name is Yash . His Pan is GTUDS3456T His number is 9876543210 and email is yash@email.com. I'm scared he has a weapon."
print(f"Original Sentence:\n{new_sentence}\n")

cleaned_sentence = anonymize_and_clean_for_bert(new_sentence)
print(f"Cleaned Sentence:\n{cleaned_sentence}\n")

inputs = tokenizer(cleaned_sentence, return_tensors="tf", truncation=True, padding=True, max_length=128)

outputs = loaded_model(inputs)
logits = outputs.logits

predicted_id = tf.argmax(logits, axis=-1).numpy()[0]

predicted_label = id_to_label[predicted_id]

print(f"Predicted Risk Level: {predicted_label}")

Original Sentence:
He is threatening me again and following my car. His name is Yash . His Pan is GTUDS3456T His number is 9876543210 and email is yash@email.com. I'm scared he has a weapon.

Cleaned Sentence:
he is threatening me again and following my car. his name is [person] . his [org] is [[org]] his number is [phone] and email is [email] i'm scared he has a weapon.

Predicted Risk Level: Medium


In [16]:
print("Testing the anonymize_and_clean_for_bert function with various inputs:")

test_cases = [
    "My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890.",
    "Call me at +91 9876543210 or email me at test@test.org. My license plate is AB12CD3456.",
    "The meeting is at 123 Main Street, Apartment 4B.",
    "My Aadhaar number is 1234 5678 9012.",
    "This sentence has no PII.",
    "Mixed case and extra spaces:  My NAME is Jane Smith.  Email: Jane.Smith@EXAMPLE.ORG   Phone: (999) 888-7777 .",
    "Multiple phone numbers: 111-222-3333 and 444.555.6666.",
    "Address with different formats: House No. 10, Gandhi Nagar and C-5, Vihar.",
    "Sentence with a person name and location: [PERSON] is in [GPE].",
    "Sentence with a person name, location and organization: [PERSON] from [ORG] visited [LOC].",
    "he is threatening me again, his name is Aditya and he lives in Chandigarh and his car number is MP25CE3456 and his phone number is [PHONE]",
    "Bring your aadhar card with you to Andheri tomorrow."
]

for i, test_text in enumerate(test_cases):
    anonymized_cleaned_text = anonymize_and_clean_for_bert(test_text)
    print(f"\n--- Test Case {i+1} ---")
    print(f"Original: {test_text}")
    print(f"Processed: {anonymized_cleaned_text}")
    new_sentence = "He is threatening me again and following my car. His name is Aditya. His number is 9876543210 and email is aditya@email.com. I'm scared he has a weapon."

    inputs = tokenizer(anonymized_cleaned_text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    outputs = loaded_model(inputs)
    logits = outputs.logits
    predicted_id = tf.argmax(logits, axis=-1).numpy()[0]
    predicted_label = id_to_label[predicted_id]
    print(f"Predicted Risk Level: {predicted_label}")

Testing the anonymize_and_clean_for_bert function with various inputs:

--- Test Case 1 ---
Original: My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890.
Processed: my name is [person] and my email is [email] my phone number is [phone].
Predicted Risk Level: Medium

--- Test Case 2 ---
Original: Call me at +91 9876543210 or email me at test@test.org. My license plate is AB12CD3456.
Processed: call me at [phone] or email me at [email] my license plate is [license_plate].
Predicted Risk Level: Medium

--- Test Case 3 ---
Original: The meeting is at 123 Main Street, Apartment 4B.
Processed: the meeting is at [address], [address].
Predicted Risk Level: Low

--- Test Case 4 ---
Original: My Aadhaar number is 1234 5678 9012.
Processed: my [org] number is [[org]].
Predicted Risk Level: Medium

--- Test Case 5 ---
Original: This sentence has no PII.
Processed: this sentence has no [org].
Predicted Risk Level: Low

--- Test Case 6 ---
Original: Mixed case 

In [14]:
!pip install gradio --quiet

import gradio as gr
import tensorflow as tf
import numpy as np
from datetime import datetime

history = []

risk_colors = {
    "Low": "#4CAF50",
    "Medium": "#FFC107",
    "High-Urgency": "#FF7043",
    "Immediate-Threat": "#F44336"
}

def predict_risk_level(user_text):
    if not user_text.strip():
        return {"Error": "Please enter some text."}

    processed_text = anonymize_and_clean_for_bert(user_text)
    inputs = tokenizer(processed_text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    outputs = loaded_model(inputs)
    probs = tf.nn.softmax(outputs.logits, axis=-1).numpy()[0]
    pred_id = np.argmax(probs)
    pred_label = id_to_label[pred_id]
    confidence = float(np.max(probs))
    timestamp = datetime.now().strftime("%H:%M:%S")

    history.append({
        "Time": timestamp,
        "Risk": pred_label,
        "Confidence": f"{confidence:.2f}",
        "Text": processed_text
    })

    color = risk_colors.get(pred_label, "#9E9E9E")
    bar_html = f"""
    <div style='width:100%; background:#e0e0e0; border-radius:8px;'>
        <div style='width:{confidence*100:.1f}%; background:{color}; padding:4px; border-radius:8px; text-align:center; color:white;'>
            {confidence*100:.1f}%
        </div>
    </div>
    """

    html_result = f"""
    <div style='font-family:monospace;'>
        <p><b>Predicted Risk:</b> <span style='color:{color}; font-weight:600;'>{pred_label}</span></p>
        <p><b>Confidence:</b></p>{bar_html}
        <hr>
        <p><b>Anonymized Input:</b> {processed_text}</p>
    </div>
    """
    hist_table = "\n".join([f"üïí {h['Time']} | {h['Risk']} ({h['Confidence']}) ‚Üí {h['Text']}" for h in history[-5:]])
    return html_result, hist_table

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("<h2 style='text-align:center'>üõ°Ô∏è Domestic Safety Risk Level Classifier</h2>")
    gr.Markdown("Predicts message **risk level**  All inputs are automatically anonymized.")
    txt = gr.Textbox(lines=4, placeholder="Type your message here...")
    res = gr.HTML()
    hist = gr.Textbox(label="Recent Predictions (last 5)", interactive=False)
    btn = gr.Button("üîç Analyze", variant="primary")
    btn.click(predict_risk_level, inputs=txt, outputs=[res, hist])

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://83638abde440e36fcb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


