In [8]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

In [2]:
def load_threat_model(model_name="log_nb_model.pkl"):
    """
    Restore the persisted threat classifier together with its log-count ratio.

    The artifact is read with joblib and must unpack into exactly two items.

    Args:
        model_name (str): File name (or path) of the saved joblib artifact.

    Returns:
        tuple: ``(model, r)`` where
            - model: the trained Logistic Regression model.
            - r: the log-count ratio used for Naive Bayes weighting.
    """
    # NOTE(review): joblib.load unpickles arbitrary Python objects, so this
    # should only ever be pointed at artifacts from a trusted source.
    artifact = joblib.load(model_name)

    # Unpack explicitly so a malformed artifact fails here, before the success message.
    classifier, log_count_ratio = artifact

    print("Threat model loaded successfully!")
    return classifier, log_count_ratio

In [14]:
# Punctuation is kept rather than stripped: every punctuation character is
# padded with spaces so it becomes its own token and the adjacent words stay clean.
import re
import string

# NOTE(review): inside the character class the `\]` coming from
# string.punctuation is read as an escaped `]`, so a backslash itself is not
# split off. Presumably the saved vectorizer was fitted with this exact
# behaviour — confirm against the training notebook before changing the pattern.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Return the whitespace-separated tokens of `s`, with punctuation isolated."""
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

# Load the saved TF-IDF vectorizer from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
# NOTE(review): presumably the pickled vectorizer refers to `tokenize` by name,
# which is why it is defined before this call — verify against the training notebook.
vec = joblib.load("tfidf_vectorizer.pkl")

# Load the classifier and its log-count ratio `r` (default file: log_nb_model.pkl).
model,r = load_threat_model()

Threat model loaded successfully!


In [16]:
def predict_threat(test_df, text_column, vec, model, r, threshold=0.5):
    """
    Predict the "threat" probability for a new dataset using the saved model.

    The input DataFrame is left untouched; all changes are made on a copy.

    Args:
        test_df (pd.DataFrame): A DataFrame containing text data.
        text_column (str): The name of the column containing the text.
        vec (TfidfVectorizer): The TF-IDF vectorizer (must be trained).
        model: Classifier exposing ``predict_proba``; column 1 of its output
            is taken as the probability of the "threat" class.
        r: Log-count ratio multiplied element-wise into the TF-IDF features
            (Naive Bayes weighting) before they are passed to ``model``.
        threshold (float): Probabilities greater than or equal to this value
            are labelled 1. Defaults to 0.5.

    Returns:
        pd.DataFrame: A copy of ``test_df`` whose text column is cast to str,
        with added "threat_probability" and "pred_label" columns.

    Raises:
        KeyError: If ``text_column`` is not a column of ``test_df``.
    """
    if text_column not in test_df.columns:
        raise KeyError(f"Column {text_column!r} not found in test_df")

    # Work on a copy so the caller's DataFrame is never modified in place.
    scored_df = test_df.copy()

    # Ensure text column is string format.
    # NOTE(review): astype(str) turns missing values into the literal text
    # "nan" — confirm this matches the preprocessing used at training time.
    scored_df[text_column] = scored_df[text_column].astype(str)

    # Transform new text data using the saved vectorizer (TF-IDF features).
    features = vec.transform(scored_df[text_column])

    # Apply Naive Bayes transformation using the saved r.
    features_nb = features.multiply(r)

    # Probability of class 1 (threat) for every row.
    threat_probs = model.predict_proba(features_nb)[:, 1]

    # Attach probabilities and thresholded predictions.
    scored_df["threat_probability"] = threat_probs
    scored_df["pred_label"] = (threat_probs >= threshold).astype(int)

    return scored_df

### Dataset 1: Toxic Comment Classification

In [19]:
# Read the Toxic Comment Classification CSV and preview its first rows.
toxiccomment = pd.read_csv("..//Datasets/toxiccomment/toxiccomment.csv")
toxiccomment.head()

Unnamed: 0,id,comment_text,true_label
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0


In [21]:
# Score the "comment_text" column; the result carries "threat_probability" and "pred_label".
toxiccomment = predict_threat(toxiccomment,"comment_text", vec, model, r)

In [23]:
# Preview the scored rows alongside the existing true_label column.
toxiccomment.head()

Unnamed: 0,id,comment_text,true_label,threat_probability,pred_label
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1,0.444857,0
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0,8.8e-05,0
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0,0.000116,0
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0,8.6e-05,0
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0,6.8e-05,0


In [25]:
# Write the scored frame to CSV without the DataFrame index.
toxiccomment.to_csv('nbsvm_toxiccomment.csv',index=False)

### Dataset 2: Jigsaw Unintended Bias Toxic Classification

In [28]:
# Read the Jigsaw Unintended Bias CSV and preview its first rows.
jigsaw = pd.read_csv("..//Datasets/jigsaw/jigsaw.csv")
jigsaw.head()

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,Threat_Jigsaw,toxicity_annotator_count,true_label
0,304799,Is your concern satisfied by the fact that the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
1,5293619,Comey's firing is a separate issue from the ma...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
2,5009206,klastri> besides the legal points. She shoul...,0.5,0.1,0.0,0.0,0.0,0.1,0.5,10,1
3,6122758,Trump has been the only one to threaten to wip...,0.536232,0.0,0.086957,0.0,0.057971,0.536232,0.043478,69,0
4,6018742,"Just listen to yourself.\n""the swamp"" (twice!)...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0


In [30]:
# Score the "comment_text" column, then preview the added prediction columns.
jigsaw = predict_threat(jigsaw,"comment_text", vec, model, r)
jigsaw.head()

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,Threat_Jigsaw,toxicity_annotator_count,true_label,threat_probability,pred_label
0,304799,Is your concern satisfied by the fact that the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0.000111,0
1,5293619,Comey's firing is a separate issue from the ma...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0.000156,0
2,5009206,klastri> besides the legal points. She shoul...,0.5,0.1,0.0,0.0,0.0,0.1,0.5,10,1,0.00119,0
3,6122758,Trump has been the only one to threaten to wip...,0.536232,0.0,0.086957,0.0,0.057971,0.536232,0.043478,69,0,0.000459,0
4,6018742,"Just listen to yourself.\n""the swamp"" (twice!)...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0.00012,0


In [32]:
# Write the scored frame to CSV without the DataFrame index.
jigsaw.to_csv('nbsvm_jigsaw.csv',index=False)

### Dataset 3: Combined Toxicity Profanity v2

In [35]:
# Read the Combined Toxicity Profanity CSV and preview its first rows.
combined = pd.read_csv('..//Datasets/Combined_Toxicity_Profanity/combined_en.csv')
combined.head()

Unnamed: 0,text,labels,encoded_labels,true_label
0,Agree. The number of morons that shoot animal...,['toxic' 'threat' 'insult'],"[1, 0, 1, 0, 1, 0, 0, 0, 0]",1
1,"""As long as we get to shoot them at some point...",['toxic' 'threat'],"[1, 0, 0, 0, 1, 0, 0, 0, 0]",1
2,mad know take never felt angry mad never get g...,['selfharm'],"[0, 0, 0, 0, 0, 0, 0, 1, 0]",0
3,The RINOS in charge of the Congress would neve...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0
4,Which of Trump's appointments do you believe w...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0


In [37]:
# Score the "text" column, then preview the added prediction columns.
combined = predict_threat(combined,"text", vec, model, r)
combined.head()

Unnamed: 0,text,labels,encoded_labels,true_label,threat_probability,pred_label
0,Agree. The number of morons that shoot animal...,['toxic' 'threat' 'insult'],"[1, 0, 1, 0, 1, 0, 0, 0, 0]",1,0.037256,0
1,"""As long as we get to shoot them at some point...",['toxic' 'threat'],"[1, 0, 0, 0, 1, 0, 0, 0, 0]",1,0.000394,0
2,mad know take never felt angry mad never get g...,['selfharm'],"[0, 0, 0, 0, 0, 0, 0, 1, 0]",0,0.011681,0
3,The RINOS in charge of the Congress would neve...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,9.7e-05,0
4,Which of Trump's appointments do you believe w...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0.000133,0


In [51]:
# Write the scored frame to CSV without the DataFrame index.
combined.to_csv('nbsvm_combined.csv', index=False)

### Dataset 4: Kaggle Suspicious

In [40]:
# Read the Kaggle suspicious tweets CSV and preview its first rows.
sus = pd.read_csv('..//Datasets/kaggle_suspicious/suspicious_tweets.csv')
sus.head()

Unnamed: 0,message,label
0,@MsLynnGallo Never mind - You missed the joke ...,0
1,turns out there had been #earthquake warnings ...,1
2,@susanhutchinson The other thing is the accent...,0
3,@LolKate18 You like Ladyhawke? You are so cool,0
4,#asot400 we want to see some pics of the locat...,0


In [44]:
# Score the "message" column, then preview the added prediction columns.
sus = predict_threat(sus,"message", vec, model, r)
sus.head()

Unnamed: 0,message,label,threat_probability,pred_label
0,@MsLynnGallo Never mind - You missed the joke ...,0,2.8e-05,0
1,turns out there had been #earthquake warnings ...,1,0.000249,0
2,@susanhutchinson The other thing is the accent...,0,0.000456,0
3,@LolKate18 You like Ladyhawke? You are so cool,0,0.000161,0
4,#asot400 we want to see some pics of the locat...,0,0.000132,0


In [53]:
# Write the scored frame to CSV without the DataFrame index.
sus.to_csv('nbsvm_sus.csv', index=False)

### Dataset 5: Lifethreat Comments

In [47]:
# Read the threat comments CSV and preview its first rows.
lifethreat = pd.read_csv("..//Datasets/threat_comments/ThreatsComments.csv")
lifethreat.head()

Unnamed: 0,ID,Comments
0,1.0,White people attacking police and no one was s...
1,2.0,yall needed 20 officers for one guy why yall a...
2,3.0,I think that cop wanted him attack metro loves...
3,4.0,This is just like the judge attack. Got offend...
4,5.0,Then he got his face slammed into the wall.


In [49]:
# Score the "Comments" column, then preview the added prediction columns.
lifethreat = predict_threat(lifethreat,"Comments", vec, model, r)
lifethreat.head()

Unnamed: 0,ID,Comments,threat_probability,pred_label
0,1.0,White people attacking police and no one was s...,0.000139,0
1,2.0,yall needed 20 officers for one guy why yall a...,0.001647,0
2,3.0,I think that cop wanted him attack metro loves...,0.000137,0
3,4.0,This is just like the judge attack. Got offend...,0.000123,0
4,5.0,Then he got his face slammed into the wall.,0.000198,0


In [55]:
# Write the scored frame to CSV without the DataFrame index.
lifethreat.to_csv('nbsvm_lifethreat.csv', index=False)