In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import yake
from keybert import KeyBERT
import openai
import os
from dotenv import load_dotenv


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/Luke/MSDS/deeplearning_dogwhistle_project/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/Luke/MSDS/deeplearning_dogwhistle_project/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/Luke/MSDS/deeplearning_dogwhistle_project/.venv/lib/python3.11/site-p

In [2]:
# -------------------------------
# Config
# -------------------------------
DATASET_NAME = "SALT-NLP/silent_signals"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
VECTOR_DB_PATH = "./vector_store"
CLASSIFIER_MODEL = "tomh/toxigen_hatebert"
# unitary/toxic-bert didnt seem to work that well

In [3]:
import random

random.seed(2002)

In [4]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# -------------------------------
# Load and manually split SilentSignals
# -------------------------------
print("Loading Silent Signals dataset...")
dataset_raw = load_dataset(DATASET_NAME, split="train")
df = pd.DataFrame(dataset_raw)

# Drop unnecessary columns
df = df.drop(columns=["party", "chamber", "speaker"])

# Manual 80/20 split (no stratification)
train_df, silent_test_df = train_test_split(df, test_size=0.2, random_state=42)

# Label SilentSignals test slice as coded (1)
silent_test_df = silent_test_df[["content"]].copy()
silent_test_df["label"] = 1

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(silent_test_df)}")

Loading Silent Signals dataset...
Train size: 13006
Test size: 3252


In [5]:
noncoded_dataset = load_dataset("civil_comments", split="train")
noncoded_df = pd.DataFrame(noncoded_dataset)

In [6]:
# Standardize column name to match SilentSignals
noncoded_df = noncoded_df.rename(columns={"text": "content"})
# Filter the dataframe using query conditions
filtered_df = noncoded_df[
    (noncoded_df["toxicity"] == 0) &
    (noncoded_df["severe_toxicity"] == 0) &
    (noncoded_df["obscene"] == 0) &
    (noncoded_df["threat"] == 0) &
    (noncoded_df["insult"] == 0) &
    (noncoded_df["identity_attack"] == 0) &
    (noncoded_df["sexual_explicit"] == 0)
]

# Select only the 'content' column and add a 'label' column
filtered_df = filtered_df[["content"]].copy()
filtered_df = filtered_df.head(813)  # Apply LIMIT 813
filtered_df["label"] = 0

In [7]:
# combine filtered_df and silent_test_df
combined_df = pd.concat([filtered_df, silent_test_df], ignore_index=True)

In [8]:
# -------------------------------
# Sentence Embeddings
# -------------------------------
print("Generating embeddings...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
texts = train_df['content'].tolist()  # Adjust column name if needed
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True, convert_to_tensor=True)

Generating embeddings...


Batches:   0%|          | 0/407 [00:00<?, ?it/s]

# Create vector store to store embeddings

In [9]:
print("Creating ChromaDB vector store...")
chroma_client = chromadb.Client(Settings(persist_directory=VECTOR_DB_PATH))
# reset the collection if it exists
try:
    chroma_client.delete_collection("earshot")
except Exception as e:
    print("Collection deletion failed or doesn't exist yet:", e)
collection = chroma_client.create_collection("earshot")

# Add embeddings to vector DB (skip numpy by converting to list via torch.tolist())
for i, (text, emb) in enumerate(zip(texts, embeddings)):
    collection.add(
        documents=[text],
        embeddings=[emb.cpu().tolist()],
        ids=[f"doc_{i}"]
    )

print(f"Stored {len(texts)} vectors in ChromaDB.")

Creating ChromaDB vector store...
Collection deletion failed or doesn't exist yet: Collection [earshot] does not exists
Stored 13006 vectors in ChromaDB.


## Nearest Neighbors check to make sure embedding worked

In [10]:
# -------------------------------
# Nearest Neighbor Retrieval
# -------------------------------
def get_neighbors(query_text, k=5):
    query_emb = model.encode([query_text], convert_to_tensor=True)[0].cpu().tolist()
    results = collection.query(query_embeddings=[query_emb], n_results=k)
    return results

# Example usage
example_query = texts[11]
neighbors = get_neighbors(example_query)
print("Nearest Neighbors for:", example_query)
for doc in neighbors['documents'][0]:
    print(" -", doc)

Nearest Neighbors for: Dianna Anderson has written (before drinking the gender woo woo) some very insightful things about Christianity, patriarchy, religious abuse, misogyny etc. Unfortunately she got sooo close and then veered off into...whatever this is. Just because you were abused and you hate being a woman doesn't mean you aren't one.
 - Dianna Anderson has written (before drinking the gender woo woo) some very insightful things about Christianity, patriarchy, religious abuse, misogyny etc. Unfortunately she got sooo close and then veered off into...whatever this is. Just because you were abused and you hate being a woman doesn't mean you aren't one.
 - This aligns closely with my analysis of the rise of gender identity ideology. Society is constantly concocting alternative routes around confronting trauma, individually, and collectively. A must read on this phenomenon is Judith Lewis Herman, *Trauma and Recovery*.
 - Dig around, do more research and you will see how absolutely ho

# Loading my mixed 100 row dataset, 50/50 split between dogwhistles and non-dogwhistles

In [11]:
# load silent signal detection
mixed_dataset = load_dataset("SALT-NLP/silent_signals_detection", split="train")
mixed_df = pd.DataFrame(mixed_dataset)
mixed_df['label'] = mixed_df['label'].map({"coded":1,"non-coded":0})

In [12]:
mixed_df = mixed_df.rename(columns={"example": "content"})
mixed_df

Unnamed: 0,idx,dog_whistle,dog_whistle_root,ingroup,definition,content,label
0,0,YWNBAW,YWNBAW,transphobic,You Will Never Be A Woman (YWNBAW) is an alt-r...,"""Might I suggest you reach out to YWNBAW? Very...",1
1,1,109,109,antisemitic,109 stands for the 109 locations that Jews wer...,"""""Wow 109 countries, and they consider us as ...",1
2,2,alarmist,alarmist,climate change denier,"""Alarmist"" is a common global warming denialis...","""A mountain of evidence shows climate alarmist...",1
3,3,Aiden,Aiden,transphobic,"Derogatory term for a transgender man, based o...","""TRAs always need to get the last word in and ...",1
4,4,Judeo-Christian,Judeo-Christian,Islamophobic,"It might seem neighbourly, even pluralistic, t...","""We are stopping cold the attacks on Judeo-Chr...",1
...,...,...,...,...,...,...,...
96,96,norm20,,,,"As the plane accelerates, we feel like we’re b...",0
97,97,norm21,,,,"Depending on the type of bomb, weapon safety r...",0
98,98,norm22,,,,"and to ad on, even humans don't die of ""old ag...",0
99,99,norm23,,,,I just don't like it anymore. I lived in Wisco...,0


# Running PREDICT on the dataset

In [16]:
import numpy
print(numpy.__version__)

2.2.4


In [15]:
texts = mixed_df['content'].tolist()  # Adjust column name if needed
# -------------------------------
# PREDICT Pipeline: Filter + Keyword Extraction
# -------------------------------

# Load classifier manually to bypass numpy issue
print("Loading custom toxicity classifier model...")
tokenizer = AutoTokenizer.from_pretrained(CLASSIFIER_MODEL)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
classifier_model = AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_MODEL)
classifier_model.resize_token_embeddings(len(tokenizer))
classifier_model.eval()

# Initialize YAKE and KeyBERT keyword extractors
kw_extractor_yake = yake.KeywordExtractor(top=5, stopwords=None)
kw_extractor_kb = KeyBERT(model=EMBEDDING_MODEL_NAME)

def get_toxic_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = classifier_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)[0]
    return probs[1].item()

def extract_keywords_yake(text):
    return [kw for kw, score in kw_extractor_yake.extract_keywords(text)]

def extract_keywords_keybert(text):
    return [kw for kw, _ in kw_extractor_kb.extract_keywords(text, top_n=5)]

# Run PREDICT on neighbors of example query
print("Running PREDICT on nearest neighbors...")
example_query = texts[12]
neighbors = get_neighbors(example_query, k=5)
predict_results = []

print(f"Retrieved {len(neighbors['documents'][0])} neighbor posts.")

for i, post in enumerate(neighbors['documents'][0]):
    score_raw = get_toxic_score(post)
    print(f"Neighbor {i+1} | Toxicity Score: {score_raw:.3f}")
    if score_raw >= 0.2:  # lowered threshold from 0.3 to 0.2
        print(f"Neighbor {i+1} | Toxic: True")
        keywords_yake = extract_keywords_yake(post)
        keywords_kb = extract_keywords_keybert(post)
        combined_keywords = list(set(keywords_yake + keywords_kb))
        filtered_keywords = [kw for kw in combined_keywords if len(kw) > 3 and kw.lower() not in {"going", "today", "very", "have", "back", "face", "every", "copy", "copies"}]
        predict_results.append({
            'post': post,
            'keywords': filtered_keywords
        })
    else:
        print(f"Neighbor {i+1} | Toxic: False")

print(f"PREDICT returned {len(predict_results)} toxic posts with keywords.")
for result in predict_results:
    print("\nPost:", result['post'])
    print("Extracted Keywords:", result['keywords'])


Loading custom toxicity classifier model...
Running PREDICT on nearest neighbors...
Retrieved 5 neighbor posts.
Neighbor 1 | Toxicity Score: 0.048
Neighbor 1 | Toxic: False
Neighbor 2 | Toxicity Score: 0.813
Neighbor 2 | Toxic: True


RuntimeError: Numpy is not available

# Evaluate PREDICT on the dataset

In [None]:
from sklearn.metrics import classification_report

y_true = []
y_pred = []

for row in mixed_df.itertuples():
    text = row.content  # or row.example, depending on column name
    label = row.label
    score = get_toxic_score(text)
    
    y_true.append(label)
    y_pred.append(1 if score >= 0.2 else 0)  # adjust threshold if needed

print("Initial PREDICT Classifier Evaluation on Full Mixed Dataset:")
print(classification_report(y_true, y_pred))

Initial PREDICT Classifier Evaluation on Full Mixed Dataset:
              precision    recall  f1-score   support

           0       0.44      0.48      0.46        50
           1       0.43      0.39      0.41        51

    accuracy                           0.44       101
   macro avg       0.44      0.44      0.43       101
weighted avg       0.44      0.44      0.43       101



# Running DIRECT on the dataset

In [None]:
# -------------------------------
# DIRECT Pipeline: LLM Prompting for Dog Whistle Detection
# -------------------------------
import openai
import os
from dotenv import load_dotenv
load_dotenv()
client = openai.OpenAI()

def call_llm_direct(post):
    prompt = f"""
You are a content moderation assistant. Identify whether the following post contains any covert hate speech or dog whistles.

Return your response in the following JSON format:
{{
  "is_dog_whistle": true/false,
  "dog_whistle_terms": ["term1", "term2", ...],
  "explanation": "brief explanation of detection or why it's not a dog whistle"
}}

Post: "{post}"
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content

# Run DIRECT on neighbors of example query
print("\nRunning DIRECT on nearest neighbors...")
direct_results = []

for post in neighbors['documents'][0]:
    try:
        analysis = call_llm_direct(post)
        direct_results.append({
            'post': post,
            'llm_response': analysis
        })
    except Exception as e:
        print("Error calling LLM:", e)

# Show results
for result in direct_results:
    print("Post:", result['post'])
    print("LLM Response:", result['llm_response'])


Running DIRECT on nearest neighbors...
Post: Censoring comedy, fighting for some invented right to teach little kids in schools that switching genders is brave and stunning, cheering for a man swimming against girls, forced diversity, forcing latinx on Latinos who keep insisting they consider it a slur - list goes on.
LLM Response: {
  "is_dog_whistle": true,
  "dog_whistle_terms": ["censoring comedy", "invented right", "switching genders", "forced diversity", "latinx"],
  "explanation": "The post uses terms and phrases that suggest a disdain for LGBTQ+ rights and diversity initiatives, which can be interpreted as covert hate speech. Phrases like 'invented right' and 'forced diversity' imply that these rights and initiatives are illegitimate, while 'latinx' is presented in a derogatory context, indicating a rejection of inclusive language. This language can resonate with audiences who hold anti-LGBTQ+ or anti-diversity sentiments."
}
Post: This aligns closely with my analysis of the r

# Evaluate DIRECT on the dataset

In [None]:
import json
from sklearn.metrics import classification_report

y_true = []
y_pred = []

for row in mixed_df.itertuples():
    text = row.content
    label = row.label
    try:
        response = call_llm_direct(text)
        parsed = json.loads(response)
        is_dog_whistle = parsed.get("is_dog_whistle", False)
    except Exception as e:
        print("Error:", e)
        is_dog_whistle = False

    y_true.append(label)
    y_pred.append(1 if is_dog_whistle else 0)

print("Initial DIRECT Classifier Evaluation on Full Mixed Dataset:")
print(classification_report(y_true, y_pred, zero_division=0))

Initial DIRECT Classifier Evaluation on Full Mixed Dataset:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        50
           1       0.85      0.80      0.83        51

    accuracy                           0.83       101
   macro avg       0.83      0.83      0.83       101
weighted avg       0.83      0.83      0.83       101



# Fine-tuning PREDICT

In [None]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments
hf_dataset = Dataset.from_pandas(mixed_df)

# Tokenize
def tokenize_fn(batch):
    return tokenizer(batch["content"], truncation=True, padding="max_length", max_length=128)

tokenized_ds = hf_dataset.map(tokenize_fn, batched=True)

# Trainer Setup
training_args = TrainingArguments(
    output_dir="./finetuned_predict_model",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    logging_steps=10
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

trainer = Trainer(
    model=classifier_model,
    args=training_args,
    train_dataset=tokenized_ds,
    eval_dataset=tokenized_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("./finetuned_predict_model")

Map: 100%|██████████| 101/101 [00:00<00:00, 9821.82 examples/s]


Step,Training Loss
10,1.098
20,0.6691
30,0.3417
40,0.1313
50,0.2079


In [None]:
# load silent signal detection
mixed_dataset = load_dataset("SALT-NLP/silent_signals_detection", split="train")
mixed_df = pd.DataFrame(mixed_dataset)
mixed_df['label'] = mixed_df['label'].map({"coded":1,"non-coded":0})
mixed_df

Unnamed: 0,idx,dog_whistle,dog_whistle_root,ingroup,definition,example,label
0,0,YWNBAW,YWNBAW,transphobic,You Will Never Be A Woman (YWNBAW) is an alt-r...,"""Might I suggest you reach out to YWNBAW? Very...",1
1,1,109,109,antisemitic,109 stands for the 109 locations that Jews wer...,"""""Wow 109 countries, and they consider us as ...",1
2,2,alarmist,alarmist,climate change denier,"""Alarmist"" is a common global warming denialis...","""A mountain of evidence shows climate alarmist...",1
3,3,Aiden,Aiden,transphobic,"Derogatory term for a transgender man, based o...","""TRAs always need to get the last word in and ...",1
4,4,Judeo-Christian,Judeo-Christian,Islamophobic,"It might seem neighbourly, even pluralistic, t...","""We are stopping cold the attacks on Judeo-Chr...",1
...,...,...,...,...,...,...,...
96,96,norm20,,,,"As the plane accelerates, we feel like we’re b...",0
97,97,norm21,,,,"Depending on the type of bomb, weapon safety r...",0
98,98,norm22,,,,"and to ad on, even humans don't die of ""old ag...",0
99,99,norm23,,,,I just don't like it anymore. I lived in Wisco...,0


# Logistic Regression using PREDICT

In [None]:
import numpy as np
# -------------------------------
# Logistic Regression Using PREDICT Features
# -------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

feature_rows = []

for row in mixed_df.itertuples():
    text = row.content
    label = row.label
    tox_score = get_toxic_score(text)
    keywords_yake = extract_keywords_yake(text)
    keywords_kb = extract_keywords_keybert(text)

    feature_rows.append({
        "toxicity_score": tox_score,
        "num_keywords_yake": len(keywords_yake),
        "num_keywords_keybert": len(keywords_kb),
        "post_length": len(text),
        "label": label
    })

features_df = pd.DataFrame(feature_rows)

X = features_df.drop(columns=["label"])
y = features_df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print("Logistic Regression on PREDICT Features:")
print(classification_report(y_test, y_pred))


RuntimeError: Numpy is not available

# Running the Fine-Tuned PREDICT on the dataset

In [None]:
texts = mixed_df['example'].tolist()  # Adjust column name if needed
# -------------------------------
# PREDICT Pipeline: Filter + Keyword Extraction
# -------------------------------

# Load fine-tuned classifier
print("Loading fine-tuned PREDICT classifier model...")
tokenizer = AutoTokenizer.from_pretrained("./finetuned_predict_model")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

classifier_model = AutoModelForSequenceClassification.from_pretrained("./finetuned_predict_model")
classifier_model.resize_token_embeddings(len(tokenizer))
classifier_model.eval()

# Initialize YAKE and KeyBERT keyword extractors
kw_extractor_yake = yake.KeywordExtractor(top=5, stopwords=None)
kw_extractor_kb = KeyBERT(model=EMBEDDING_MODEL_NAME)

def get_toxic_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = classifier_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)[0]
    return probs[1].item()

def extract_keywords_yake(text):
    return [kw for kw, score in kw_extractor_yake.extract_keywords(text)]

def extract_keywords_keybert(text):
    return [kw for kw, _ in kw_extractor_kb.extract_keywords(text, top_n=5)]

# Run PREDICT on neighbors of example query
print("Running PREDICT on nearest neighbors...")
example_query = texts[12]
neighbors = get_neighbors(example_query, k=5)
predict_results = []

print(f"Retrieved {len(neighbors['documents'][0])} neighbor posts.")

for i, post in enumerate(neighbors['documents'][0]):
    score_raw = get_toxic_score(post)
    print(f"Neighbor {i+1} | Toxicity Score: {score_raw:.3f}")
    if score_raw >= 0.2:  # lowered threshold from 0.3 to 0.2
        print(f"Neighbor {i+1} | Toxic: True")
        keywords_yake = extract_keywords_yake(post)
        keywords_kb = extract_keywords_keybert(post)
        combined_keywords = list(set(keywords_yake + keywords_kb))
        filtered_keywords = [kw for kw in combined_keywords if len(kw) > 3 and kw.lower() not in {"going", "today", "very", "have", "back", "face", "every", "copy", "copies"}]
        predict_results.append({
            'post': post,
            'keywords': filtered_keywords
        })
    else:
        print(f"Neighbor {i+1} | Toxic: False")

print(f"PREDICT returned {len(predict_results)} toxic posts with keywords.")
for result in predict_results:
    print("\nPost:", result['post'])
    print("Extracted Keywords:", result['keywords'])


Loading fine-tuned PREDICT classifier model...
Running PREDICT on nearest neighbors...
Retrieved 5 neighbor posts.
Neighbor 1 | Toxicity Score: 0.000
Neighbor 1 | Toxic: False
Neighbor 2 | Toxicity Score: 0.999
Neighbor 2 | Toxic: True
Neighbor 3 | Toxicity Score: 0.172
Neighbor 3 | Toxic: False
Neighbor 4 | Toxicity Score: 0.000
Neighbor 4 | Toxic: False
Neighbor 5 | Toxicity Score: 0.000
Neighbor 5 | Toxic: False
PREDICT returned 1 toxic posts with keywords.

Post: This aligns closely with my analysis of the rise of gender identity ideology. Society is constantly concocting alternative routes around confronting trauma, individually, and collectively. A must read on this phenomenon is Judith Lewis Herman, *Trauma and Recovery*.
Extracted Keywords: ['ideology', 'gender identity ideology', 'rise of gender', 'trauma', 'confronting', 'identity ideology', 'Judith Lewis Herman', 'aligns closely', 'gender', 'judith']


# Evaluating the tuned PREDICT on the dataset

In [None]:
from sklearn.metrics import classification_report

y_true = []
y_pred = []

for row in mixed_df.itertuples():
    text = row.example  # or row.example, depending on column name
    label = row.label
    score = get_toxic_score(text)
    
    y_true.append(label)
    y_pred.append(1 if score >= 0.2 else 0)  # adjust threshold if needed

print("Initial PREDICT Classifier Evaluation on Full Mixed Dataset:")
print(classification_report(y_true, y_pred))

Initial PREDICT Classifier Evaluation on Full Mixed Dataset:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        50
           1       0.96      1.00      0.98        51

    accuracy                           0.98       101
   macro avg       0.98      0.98      0.98       101
weighted avg       0.98      0.98      0.98       101



In [None]:
# Detection wrappers

def predict_detects(post):
    return get_toxic_score(post) >= 0.2

def direct_detects(post):
    try:
        response = call_llm_direct(post)
        parsed = json.loads(response)
        return parsed.get("is_dog_whistle", False)
    except:
        return False

from sklearn.metrics import classification_report

results = []

for row in mixed_df.itertuples():
    post = row.example
    label = row.label

    predict_flag = predict_detects(post)
    direct_flag = direct_detects(post)

    results.append({
        "post": post,
        "label": label,
        "predict_detected": int(predict_flag),
        "direct_detected": int(direct_flag)
    })
# -------------------------------
# ENSEMBLE Evaluation: PREDICT OR DIRECT
# -------------------------------
from sklearn.metrics import classification_report

ensemble_preds = [
    int(r["predict_detected"] or r["direct_detected"])
    for r in results
]

print("\nENSEMBLE Evaluation (PREDICT OR DIRECT):")
print(classification_report([r["label"] for r in results], ensemble_preds))


ENSEMBLE Evaluation (PREDICT OR DIRECT):
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        50
           1       0.85      1.00      0.92        51

    accuracy                           0.91       101
   macro avg       0.93      0.91      0.91       101
weighted avg       0.92      0.91      0.91       101



In [None]:
# -------------------------------
# ENSEMBLE Evaluation: PREDICT AND DIRECT
# -------------------------------
from sklearn.metrics import classification_report

ensemble_preds = [
    int(r["predict_detected"] and r["direct_detected"])
    for r in results
]

print("\nENSEMBLE Evaluation (PREDICT AND DIRECT):")
print(classification_report([r["label"] for r in results], ensemble_preds))


ENSEMBLE Evaluation (PREDICT AND DIRECT):
              precision    recall  f1-score   support

           0       0.83      0.98      0.90        50
           1       0.98      0.80      0.88        51

    accuracy                           0.89       101
   macro avg       0.90      0.89      0.89       101
weighted avg       0.90      0.89      0.89       101



In [None]:
from sklearn.metrics import f1_score, accuracy_score

print("\n========= WEIGHTED ENSEMBLE GRID SEARCH =========")
print("PREDICT_weight | DIRECT_weight | Accuracy | F1 Score")

for pw in [round(i * 0.1, 2) for i in range(11)]:  # 0.0 to 1.0
    dw = round(1.0 - pw, 2)
    preds = []

    for r in results:
        score = pw * r["predict_detected"] + dw * r["direct_detected"]
        preds.append(int(score >= 0.5))

    acc = accuracy_score([r["label"] for r in results], preds)
    f1 = f1_score([r["label"] for r in results], preds)

    print(f"{pw:.2f}           | {dw:.2f}          | {acc:.2f}     | {f1:.2f}")



PREDICT_weight | DIRECT_weight | Accuracy | F1 Score
0.00           | 1.00          | 0.82     | 0.82
0.10           | 0.90          | 0.82     | 0.82
0.20           | 0.80          | 0.82     | 0.82
0.30           | 0.70          | 0.82     | 0.82
0.40           | 0.60          | 0.82     | 0.82
0.50           | 0.50          | 0.91     | 0.92
0.60           | 0.40          | 0.98     | 0.98
0.70           | 0.30          | 0.98     | 0.98
0.80           | 0.20          | 0.98     | 0.98
0.90           | 0.10          | 0.98     | 0.98
1.00           | 0.00          | 0.98     | 0.98


In [None]:
import transformers
print(transformers.__version__)

4.51.3


In [19]:
from transformers import TrainingArguments

In [20]:
# -------------------------------
# Hybrid Ensemble: Finetuned + Logistic + DIRECT
# -------------------------------
from sklearn.metrics import classification_report

# STEP 1: Get prediction components
hybrid_rows = []

for row in mixed_df.itertuples():
    post = row.example
    label = row.label

    # Fine-tuned score
    inputs = tokenizer(post, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = classifier_model(**inputs).logits
        ft_score = F.softmax(logits, dim=-1)[0][1].item()

    # Logistic regression score (recompute features)
    toks = get_toxic_score(post)
    kws_yake = extract_keywords_yake(post)
    kws_kb = extract_keywords_keybert(post)
    logreg_input = [[
        toks,
        len(kws_yake),
        len(kws_kb),
        len(post)
    ]]
    logreg_prob = logreg.predict_proba(logreg_input)[0][1]

    # DIRECT flag
    try:
        direct_json = json.loads(call_llm_direct(post))
        direct_flag = int(direct_json.get("is_dog_whistle", False))
    except:
        direct_flag = 0

    hybrid_rows.append({
        "post": post,
        "label": label,
        "ft_score": ft_score,
        "logreg_prob": logreg_prob,
        "direct_flag": direct_flag
    })

hybrid_df = pd.DataFrame(hybrid_rows)


AttributeError: 'Pandas' object has no attribute 'example'

In [None]:
print("\n========= HYBRID ENSEMBLE GRID SEARCH =========")
print("ft | lr | dir | thresh | Accuracy | F1")

best_combo = None
best_f1 = 0

for ft_w in [0.3, 0.4]:
    for lr_w in [0.3, 0.2]:
        for dir_w in [0.4, 0.5]:
            for threshold in [0.5, 0.6]:
                if abs(ft_w + lr_w + dir_w - 1.0) > 0.01:
                    continue

                preds = []
                for r in hybrid_rows:
                    score = (
                        ft_w * r["ft_score"] +
                        lr_w * r["logreg_prob"] +
                        dir_w * r["direct_flag"]
                    )
                    preds.append(int(score >= threshold))

                acc = accuracy_score([r["label"] for r in hybrid_rows], preds)
                f1 = f1_score([r["label"] for r in hybrid_rows], preds)

                print(f"{ft_w:.1f} | {lr_w:.1f} | {dir_w:.1f} | {threshold:.1f} | {acc:.2f} | {f1:.2f}")

                if f1 > best_f1:
                    best_f1 = f1
                    best_combo = (ft_w, lr_w, dir_w, threshold)



ft | lr | dir | thresh | Accuracy | F1
0.3 | 0.3 | 0.4 | 0.5 | 0.98 | 0.98
0.3 | 0.3 | 0.4 | 0.6 | 0.90 | 0.89
0.3 | 0.2 | 0.5 | 0.5 | 0.83 | 0.83
0.3 | 0.2 | 0.5 | 0.6 | 0.90 | 0.89
0.4 | 0.2 | 0.4 | 0.5 | 0.98 | 0.98
0.4 | 0.2 | 0.4 | 0.6 | 0.90 | 0.89


In [21]:
# Re-evaluate best combo
ft_w, lr_w, dir_w, threshold = best_combo
final_preds = []

for r in hybrid_rows:
    score = (
        ft_w * r["ft_score"] +
        lr_w * r["logreg_prob"] +
        dir_w * r["direct_flag"]
    )
    final_preds.append(int(score >= threshold))

print("\n========= FINAL HYBRID ENSEMBLE EVALUATION =========")
print(f"Best weights → ft: {ft_w}, lr: {lr_w}, dir: {dir_w}, threshold: {threshold}")
print(classification_report([r["label"] for r in hybrid_rows], final_preds))

NameError: name 'best_combo' is not defined

# Evaluating on the test dataset

In [None]:
combined_df

In [39]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
import torch.nn.functional as F

# Load the fine-tuned model
ft_model = AutoModelForSequenceClassification.from_pretrained("./finetuned_predict_model")
ft_model.eval()

ft_preds = []
for row in combined_df.itertuples():
    inputs = tokenizer(row.content, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        probs = F.softmax(ft_model(**inputs).logits, dim=-1)[0]
    ft_preds.append(torch.argmax(probs).item())

print("Fine-tuned PREDICT Evaluation:")
print(classification_report(combined_df['label'], ft_preds))


Fine-tuned PREDICT Evaluation:
              precision    recall  f1-score   support

           0       0.20      0.94      0.32       813
           1       0.71      0.04      0.07      3252

    accuracy                           0.22      4065
   macro avg       0.45      0.49      0.20      4065
weighted avg       0.61      0.22      0.12      4065



In [43]:
# -------------------------------
# Logistic Regression Evaluation on combined_df
# -------------------------------

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import torch.nn.functional as F

# Updated get_toxic_score with truncation fix
def get_toxic_score(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    with torch.no_grad():
        outputs = classifier_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)[0]
    return probs[1].item()

# Extract features
logreg_rows = []
for row in combined_df.itertuples():
    toks = get_toxic_score(row.content)
    kws_yake = extract_keywords_yake(row.content)
    kws_kb = extract_keywords_keybert(row.content)
    logreg_rows.append({
        "toxicity_score": toks,
        "num_keywords_yake": len(kws_yake),
        "num_keywords_keybert": len(kws_kb),
        "post_length": len(row.content)
    })

# Predict
logreg_features = pd.DataFrame(logreg_rows)
logreg_preds = logreg.predict(logreg_features)

# Evaluate
print("Logistic Regression Evaluation:")
print(classification_report(combined_df['label'], logreg_preds))


Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.20      0.94      0.32       813
           1       0.71      0.04      0.07      3252

    accuracy                           0.22      4065
   macro avg       0.45      0.49      0.20      4065
weighted avg       0.61      0.22      0.12      4065



In [45]:
import json

direct_preds = []
for row in combined_df.itertuples():
    try:
        direct_json = json.loads(call_llm_direct(row.content))
        direct_preds.append(int(direct_json.get("is_dog_whistle", False)))
    except:
        direct_preds.append(0)

print("DIRECT Evaluation:")
print(classification_report(combined_df['label'], direct_preds))


DIRECT Evaluation:
              precision    recall  f1-score   support

           0       0.48      0.93      0.63       813
           1       0.98      0.75      0.85      3252

    accuracy                           0.78      4065
   macro avg       0.73      0.84      0.74      4065
weighted avg       0.88      0.78      0.80      4065

