This is a Notebook

## Set-Up
In this section, we add the project root to the import path and import the dependencies required to run the code in this notebook.

In [None]:
import sys
import os

# Add project root to path.
# Guarded so that re-running this cell does not append the same entry again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
from src.utils.dataset import get_project_dataset
import statistics
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from typing import Iterable
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
import math
from scipy.stats import norm
from sklearn.mixture import GaussianMixture

## Data Preprocessing


In [None]:
# Load the dataset through the project helper and print it for a quick look.
# Later cells index the result by split ("train"/"test") and then by column
# ("text"/"label").
dataset = get_project_dataset()
print(dataset)

In [None]:
# Pull the raw texts and their labels out of each split.
X_train = dataset["train"]["text"]
y_train = dataset["train"]["label"]
X_test = dataset["test"]["text"]
y_test = dataset["test"]["label"]

In [None]:
# First training text of each class; each stays None if that label never occurs.
# Label 0 is the negative (safe) class, label 1 the positive (unsafe) class.
first_negative = next(
    (prompt for prompt, target in zip(X_train, y_train) if target == 0),
    None,
)
first_positive = next(
    (prompt for prompt, target in zip(X_train, y_train) if target == 1),
    None,
)

print("First negative example:", first_negative, sep="\n")

print("First positive example:", first_positive, sep="\n")

In [None]:
def prompt_lengths(texts: Iterable[str]) -> list[int]:
    """Return the number of whitespace-separated words in each text, in input order."""
    word_counts = []
    for prompt in texts:
        word_counts.append(len(prompt.split()))
    return word_counts

def get_stats(lengths: list[int]) -> dict:
    """Summarise a list of prompt lengths.

    Args:
        lengths: Word counts, one per prompt.

    Returns:
        A dict with the keys "Count", "Min", "Max", "Mean" and "Variance".
        Mean and (sample) variance are rounded to two decimals. Statistics that
        are undefined for the input are reported as NaN instead of raising:
        everything except "Count" for an empty list, and "Variance" when there
        is only one value.
    """
    count = len(lengths)
    if count == 0:
        # min()/max()/mean() all raise on empty input; keep the table row renderable.
        return {
            "Count": 0,
            "Min": math.nan,
            "Max": math.nan,
            "Mean": math.nan,
            "Variance": math.nan,
        }

    # statistics.variance is the sample variance and needs at least two points.
    variance = round(statistics.variance(lengths), 2) if count > 1 else math.nan
    return {
        "Count": count,
        "Min": min(lengths),
        "Max": max(lengths),
        "Mean": round(statistics.mean(lengths), 2),
        "Variance": variance,
    }

# Partition the training prompts by class label (1 = unsafe, 0 = safe).
unsafe_prompts = [prompt for prompt, target in zip(X_train, y_train) if target == 1]
safe_prompts = [prompt for prompt, target in zip(X_train, y_train) if target == 0]

# Word counts for the whole training set and for each class.
all_lengths = prompt_lengths(X_train)
unsafe_lengths = prompt_lengths(unsafe_prompts)
safe_lengths = prompt_lengths(safe_prompts)

# One entry of summary statistics per class; the transpose puts classes on the rows.
data = {}
data["Positive (unsafe)"] = get_stats(unsafe_lengths)
data["Negative (safe)"] = get_stats(safe_lengths)
df = pd.DataFrame(data).T

print(df)

In [None]:
def _word_count(prompt: str) -> int:
    """Number of whitespace-separated words in a prompt."""
    return len(prompt.split())


# Extremes by word count within the unsafe class
shortest_unsafe = min(unsafe_prompts, key=_word_count)
longest_unsafe = max(unsafe_prompts, key=_word_count)

# Extremes by word count within the safe class
shortest_safe = min(safe_prompts, key=_word_count)
longest_safe = max(safe_prompts, key=_word_count)

for heading, prompt in (
    ("Shortest unsafe prompt:\n", shortest_unsafe),
    ("\nLongest unsafe prompt:\n", longest_unsafe),
    ("\nShortest safe prompt:\n", shortest_safe),
    ("\nLongest safe prompt:\n", longest_safe),
):
    print(heading, prompt)

In [None]:
# Fixed-width bins covering 0-2100 words. The width is named once so that the
# bin edges and the tick labels below always agree.
# NOTE(review): np.histogram does not count values outside the outermost edges,
# so a prompt longer than 2100 words would be missing from both the counts and
# the proportions — confirm against the maximum prompt length.
LENGTH_BIN_WIDTH = 50
bins = np.arange(0, 2150, LENGTH_BIN_WIDTH)
bin_labels = [f"{b}-{b + LENGTH_BIN_WIDTH}" for b in bins[:-1]]

unsafe_counts, _ = np.histogram(unsafe_lengths, bins=bins)
safe_counts, _ = np.histogram(safe_lengths, bins=bins)

# Normalize counts to get per-class proportions, so classes of different size
# can be compared on the same axis.
unsafe_freqs = unsafe_counts / unsafe_counts.sum()
safe_freqs = safe_counts / safe_counts.sum()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=bin_labels,
    y=unsafe_freqs,
    name='Positive (unsafe)',
    marker_color='red'
))

fig.add_trace(go.Bar(
    x=bin_labels,
    y=safe_freqs,
    name='Negative (safe)',
    marker_color='green'
))

fig.update_layout(
    barmode='group',
    title='Normalized Prompt Length Distribution by Label',
    xaxis_title='Number of Words',
    yaxis_title='Proportion',
    xaxis_tickangle=-45,
    bargap=0.1,
    yaxis_type='log'  # log-scaled proportions
)

fig.show()

In [None]:
# Idea is that unsafe prompts might be more repetitive or formulaic.
def shannon_entropy(text: str) -> float:
    """Shannon entropy (in bits) of the whitespace-token distribution of ``text``.

    Args:
        text: The prompt to score; it is split on whitespace.

    Returns:
        A non-negative float. Text with no tokens, or with a single distinct
        token, scores exactly ``0.0``.
    """
    tokens = text.split()
    if not tokens:
        # No tokens means no distribution; report zero as a float, not int 0.
        return 0.0

    total = len(tokens)
    entropy = 0.0
    for count in Counter(tokens).values():
        p = count / total
        # Subtracting each term (rather than negating the final sum) keeps the
        # single-token case at +0.0 instead of -0.0.
        entropy -= p * math.log2(p)
    return entropy

# Score every training prompt, then pair each score with its class label.
entropies = list(map(shannon_entropy, X_train))
df_entropy = pd.DataFrame({"entropy": entropies, "label": y_train})
df_entropy["label_name"] = df_entropy["label"].map({0: "Safe", 1: "Unsafe"})

# Grouped histogram of relative frequencies, one colour per class.
fig_hist = px.histogram(
    data_frame=df_entropy,
    x="entropy",
    color="label_name",
    nbins=30,
    histnorm="probability",  # normalize within each class
    barmode="group",
    opacity=0.6,
    marginal="rug",  # optional
    title="Normalized Entropy Distribution for Safe vs Unsafe Prompts",
    labels={"entropy": "Shannon Entropy", "label_name": "Prompt Class"},
)
fig_hist.update_layout(bargap=0.1)

fig_hist.show()

# Boxplot distribution
# fig_box = px.box(
#     df_entropy, x="label_name", y="entropy",
#     color="label_name",
#     labels={"label_name": "Prompt Class", "entropy": "Shannon Entropy"},
#     title="Entropy Comparison between Safe and Unsafe Prompts"
# )
# fig_box.show()

In [None]:
def shannon_entropy(text: str) -> float:
    """Shannon entropy (in bits) of the whitespace-token distribution of ``text``.

    NOTE(review): this repeats the ``shannon_entropy`` definition from the
    previous cell and shadows it; keep only one copy once the cells are merged.

    Args:
        text: The prompt to score; it is split on whitespace.

    Returns:
        A non-negative float. Text with no tokens, or with a single distinct
        token, scores exactly ``0.0``.
    """
    tokens = text.split()
    if not tokens:
        # No tokens means no distribution; report zero as a float, not int 0.
        return 0.0

    total = len(tokens)
    entropy = 0.0
    for count in Counter(tokens).values():
        p = count / total
        # Subtracting each term (rather than negating the final sum) keeps the
        # single-token case at +0.0 instead of -0.0.
        entropy -= p * math.log2(p)
    return entropy

def add_histogram(fig, data, bins, name, color):
    """Add a count histogram of ``data`` to ``fig`` as a bar trace.

    Args:
        fig: Plotly figure to draw on (modified in place).
        data: 1-D array-like of values to bin.
        bins: Bin specification passed straight to ``np.histogram``.
        name: Label prefix; the trace is named ``"<name> Histogram"``.
        color: Bar colour.

    Returns:
        ``(counts, bin_edges, bin_width)``, where ``bin_width`` is the width of
        the first bin.
    """
    counts, bin_edges = np.histogram(data, bins=bins, density=False)
    bin_width = bin_edges[1] - bin_edges[0]
    # go.Bar centres each bar on its x value, so position bars at the bin
    # midpoints. Using the left edges would shift the whole histogram half a
    # bin to the left of any curve drawn in data coordinates.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    fig.add_trace(go.Bar(
        x=bin_centers,
        y=counts,
        width=bin_width * 0.9,
        name=f"{name} Histogram",
        marker_color=color,
        opacity=0.5,
    ))
    return counts, bin_edges, bin_width

def add_normal_fit(fig, data, bin_width, color="green", name="Safe Normal Fit"):
    """Overlay a fitted normal curve on ``fig``, scaled to histogram counts.

    The normal parameters are estimated from ``data`` with ``norm.fit``. The
    density is evaluated on 300 evenly spaced points between the smallest and
    largest value, then multiplied by ``len(data) * bin_width`` so the curve is
    on the same scale as a count histogram with that bin width.
    """
    mean, sigma = norm.fit(data)
    grid = np.linspace(min(data), max(data), 300)

    # density -> expected count per bin
    expected_counts = norm.pdf(grid, mean, sigma) * len(data) * bin_width

    curve = go.Scatter(
        x=grid,
        y=expected_counts,
        mode="lines",
        line={"color": color, "width": 3},
        name=name,
    )
    fig.add_trace(curve)

def add_gmm_fit(fig, data, bin_width, color="red", name="Unsafe GMM Fit", n_components=2):
    """Overlay a Gaussian-mixture density curve on ``fig``, scaled to histogram counts.

    Args:
        fig: Plotly figure to draw on (modified in place).
        data: 1-D array-like of values (NumPy array, list or pandas Series).
        bin_width: Width of the histogram bins the curve is scaled against.
        color: Line colour.
        name: Trace name shown in the legend.
        n_components: Number of mixture components to fit.
    """
    # Coerce up front so lists and pandas Series work as well as NumPy arrays
    # (they have no usable ``reshape``).
    values = np.asarray(data)

    # GaussianMixture expects a 2-D input, so use one column of samples.
    samples = values.reshape(-1, 1)

    # Fixed random_state keeps the fitted curve reproducible across runs.
    gmm = GaussianMixture(n_components=n_components, random_state=0)
    gmm.fit(samples)

    # Evaluation grid spanning the observed range, for a smooth curve.
    x = np.linspace(values.min(), values.max(), 300).reshape(-1, 1)

    # score_samples returns log-likelihoods; exponentiate to get the density.
    pdf = np.exp(gmm.score_samples(x))

    # Scale the density to histogram counts.
    pdf_scaled = pdf * len(values) * bin_width

    fig.add_trace(go.Scatter(
        x=x.flatten(),
        y=pdf_scaled,
        mode="lines",
        line=dict(color=color, width=3),
        name=name
    ))

# Calculate entropies for every training prompt and pair them with the labels.
entropies = [shannon_entropy(t) for t in X_train]
df_entropy = pd.DataFrame({
    "entropy": entropies,
    "label": y_train
})
df_entropy["label_name"] = df_entropy["label"].map({0: "Safe", 1: "Unsafe"})

fig = go.Figure()
# Number of equal-width bins; np.histogram derives the range separately for each
# class, so the two histograms can have different bin widths.
# Note: this rebinds ``bins``, which the prompt-length cell uses for its bin edges.
bins = 30

safe_data = df_entropy[df_entropy["label"] == 0]["entropy"].values
unsafe_data = df_entropy[df_entropy["label"] == 1]["entropy"].values

safe_counts, safe_bin_edges, safe_bin_width = add_histogram(fig, safe_data, bins, "Safe", "green")
unsafe_counts, unsafe_bin_edges, unsafe_bin_width = add_histogram(fig, unsafe_data, bins, "Unsafe", "red")

# Each curve is scaled with the bin width of its own class histogram.
add_normal_fit(fig, safe_data, safe_bin_width)
add_gmm_fit(fig, unsafe_data, unsafe_bin_width, n_components=3)

fig.update_layout(
    # The unsafe curve is a Gaussian mixture, not a Poisson fit.
    title="Entropy Distribution with Normal Fit for Safe and Gaussian Mixture Fit for Unsafe Prompts",
    xaxis_title="Shannon Entropy",
    yaxis_title="Count",
    barmode="overlay",
    bargap=0.2,
)

fig.show()

In [None]:
# Create TF-IDF vectorizer and fit on training data.
# fit_transform both fits the vectorizer on the training texts and returns their
# vectorized form. This is a standalone vectorizer, separate from the "tfidf"
# step inside the `clf` pipeline defined in the model-training section.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(dataset["train"]["text"])

In [None]:
# Inspect the vectorized training data in its default text representation.
print(X_train_tfidf)

## Model training

In [None]:
# Text classifier: TF-IDF features feeding a logistic regression.
# The pipeline vectorizes internally, so it takes raw text as input.
clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# The pipeline's first step is a TfidfVectorizer, so it must be given the raw
# training texts. Passing the already-vectorized X_train_tfidf would make that
# step try to tokenize a sparse matrix instead of strings.
clf.fit(X_train, y_train)