In [1]:
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the pickled tokenizer.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load a tokenizer.pkl that comes from a trusted source.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load the trained LSTM model
model = load_model("lstm_threat_model.keras")

# Only the tokenizer and the model are loaded in this cell (no embedding
# matrix), so the status message names exactly those two artifacts.
print("Tokenizer and LSTM model loaded successfully!")


Tokenizer, embedding matrix, and LSTM model loaded successfully!


In [5]:
# Label names, one per model output column (order must match the model output).
label_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Size of each word vector.
embed_size = 50
# Number of unique words to use (i.e. number of rows in the embedding matrix).
max_features = 20000
# Maximum number of words of a comment to use.
maxlen = 100

In [7]:
# Define the max sequence length (must match training value)
MAX_SEQUENCE_LENGTH = 100  # Adjust if needed


def _is_missing(text):
    """Return True when `text` is None or a float NaN (NaN is the only float unequal to itself)."""
    return text is None or (isinstance(text, float) and text != text)


def preprocess_text(texts, tokenizer, maxlen=MAX_SEQUENCE_LENGTH):
    """
    Tokenizes and pads text sequences for LSTM model inference.

    Args:
        texts (str or iterable of str): Text inputs. A single string is
            treated as one text. Entries that are None or NaN (e.g. missing
            cells of a pandas column) are replaced by an empty string, so the
            output keeps exactly one row per input entry.
        tokenizer (Tokenizer): Pretrained Keras tokenizer.
        maxlen (int): Maximum sequence length.

    Returns:
        np.array: Preprocessed text sequences, one row per input text.
    """
    # A bare string would otherwise be iterated character by character and
    # every character tokenized as its own "text".
    if isinstance(texts, str):
        texts = [texts]

    # Replace missing entries by "" instead of handing non-string values to
    # the tokenizer; row order and row count are preserved.
    cleaned_texts = ["" if _is_missing(text) else text for text in texts]

    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences(cleaned_texts)

    # Pad sequences to the required length
    padded_sequences = pad_sequences(sequences, maxlen=maxlen)

    # asarray guarantees an ndarray result without copying one that already is.
    return np.asarray(padded_sequences)

# Example new text data
# new_texts = ["I will hurt you!", "Let's meet tomorrow."]

# Preprocess the new text
# X_new = preprocess_text(new_texts, tokenizer)

# print("New text preprocessed successfully!")


In [58]:
def predict(model, inp, dataframe, threshold=0.01, labels=None):
    """
    Runs the model on preprocessed input and appends its outputs to a DataFrame.

    Args:
        model: Object whose ``predict(inp)`` returns a 2-D array of per-label
            probabilities (one row per input row, one column per label).
        inp: Preprocessed model input; passed to ``model.predict`` unchanged.
        dataframe (pd.DataFrame): Frame whose rows correspond, in order, to
            the rows of ``inp``.
        threshold (float): A label is predicted (1) when its probability is
            >= this value. Defaults to 0.01, the cut-off this function applied
            before the value became a parameter.
        labels (list of str, optional): Label names in model output order.
            Defaults to the notebook-level ``label_names``.

    Returns:
        pd.DataFrame: A new frame with the columns of ``dataframe`` followed by
        ``<label>_prob`` and ``<label>_pred`` columns. The input frame is not
        modified.

    Raises:
        ValueError: If the prediction array is not 2-D with one column per
            label, or its row count differs from ``len(dataframe)``.
    """
    if labels is None:
        labels = label_names

    # Predict probabilities for each label (assuming multi-label classification)
    predictions = np.asarray(model.predict(inp))

    if predictions.ndim != 2 or predictions.shape[1] != len(labels):
        raise ValueError(
            f"Expected predictions of shape (n_rows, {len(labels)}), got {predictions.shape}"
        )
    if predictions.shape[0] != len(dataframe):
        raise ValueError(
            f"Mismatch: {predictions.shape[0]} prediction rows for {len(dataframe)} DataFrame rows"
        )

    # Convert probabilities to binary labels using the configured threshold
    binary_predictions = (predictions >= threshold).astype(int)

    # Print results
    print("Predicted probabilities:\n", predictions)
    print("Binary predictions:\n", binary_predictions)

    prob_columns = [f"{label}_prob" for label in labels]
    pred_columns = [f"{label}_pred" for label in labels]

    # Build both frames on the input frame's own index: pd.concat(axis=1)
    # aligns on index labels, so a default RangeIndex here would misalign rows
    # (and introduce NaNs) whenever `dataframe` was filtered or re-indexed.
    probs_df = pd.DataFrame(predictions, columns=prob_columns, index=dataframe.index)
    binary_df = pd.DataFrame(binary_predictions, columns=pred_columns, index=dataframe.index)

    # Drop prediction columns left over from an earlier call so that running
    # `df = predict(model, inp, df)` twice replaces them instead of producing
    # duplicate column names.
    base = dataframe.drop(columns=prob_columns + pred_columns, errors="ignore")

    # Concatenate with the original DataFrame
    return pd.concat([base, probs_df, binary_df], axis=1)

### Dataset 1: Toxic Comment Classification

In [48]:
# Load the Toxic Comment CSV and preview its first five rows.
toxiccomment = pd.read_csv(filepath_or_buffer="..//Datasets/toxiccomment/toxiccomment.csv")
toxiccomment.head(n=5)

Unnamed: 0,id,comment_text,true_label
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0


In [50]:
# Tokenize and pad the comment texts for the model.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because the following prediction cell reads it.
input = preprocess_text(
    texts=toxiccomment['comment_text'].values,
    tokenizer=tokenizer,
)

In [52]:
# Predict probabilities for each label (assuming multi-label classification)
predictions = model.predict(input)

# Convert probabilities to binary labels: a label is flagged when its
# probability is at least `threshold`. The cut-off applied here is 0.01.
threshold = 0.01
binary_predictions = (predictions >= threshold).astype(int)

# Print results
print("Predicted probabilities:\n", predictions)
print("Binary predictions:\n", binary_predictions)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Predicted probabilities:
 [[1.3689028e-01 5.2621617e-04 7.2897850e-03 1.0213106e-03 1.7765222e-02
  2.0823302e-02]
 [1.7541224e-03 3.8301441e-05 4.5968633e-04 2.9912795e-05 3.0577162e-04
  1.0645079e-04]
 [4.6352758e-03 3.4298075e-05 6.8385195e-04 4.5898876e-05 7.8927568e-04
  9.7889992e-05]
 ...
 [3.3527236e-02 8.1517518e-04 7.7471966e-03 1.3016362e-03 7.9007335e-03
  1.6744373e-03]
 [9.9292916e-01 4.0534022e-01 9.3284702e-01 1.3737620e-01 8.7665248e-01
  5.6266999e-01]
 [9.8263061e-01 5.0640666e-01 9.2478245e-01 1.9955091e-01 8.2243675e-01
  4.5520556e-01]]
Binary predictions:
 [[1 0 0 0 1 1]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [1 0 0 0 0 0]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]


In [26]:
# Sanity check: there must be exactly one prediction row per DataFrame row.
assert predictions.shape[0] == toxiccomment.shape[0], "Mismatch: Predictions and DataFrame row count do not match!"

In [54]:
# Column names for the per-label probabilities and the thresholded predictions.
prob_columns = [f"{label}_prob" for label in label_names]
pred_columns = [f"{label}_pred" for label in label_names]

# Convert predictions to DataFrames that share toxiccomment's index, so the
# column-wise concat below pairs rows correctly whatever that index is.
probs_df = pd.DataFrame(predictions, columns=prob_columns, index=toxiccomment.index)
binary_df = pd.DataFrame(binary_predictions, columns=pred_columns, index=toxiccomment.index)

# Concatenate with the original DataFrame. Prediction columns left over from
# an earlier run of this cell are dropped first, so re-running the cell
# replaces them instead of appending duplicate columns.
toxiccomment = pd.concat(
    [
        toxiccomment.drop(columns=prob_columns + pred_columns, errors="ignore"),
        probs_df,
        binary_df,
    ],
    axis=1,
)

print("Updated toxiccomment DataFrame:")
toxiccomment.head()

Updated toxiccomment DataFrame:


Unnamed: 0,id,comment_text,true_label,toxic_prob,severe_toxic_prob,obscene_prob,threat_prob,insult_prob,identity_hate_prob,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred
0,55858b89f99e9bda,Hope he dies \n\nNow this Atheist filth's wife...,1,0.13689,0.000526,0.00729,0.001021,0.017765,0.020823,1,0,0,0,1,1
1,425a1dbdf740e9b8,"2006 (UTC)\n\n Removed Merge 17:15, 5 April",0,0.001754,3.8e-05,0.00046,3e-05,0.000306,0.000106,0,0,0,0,0,0
2,20c81b99f7adf557,John discuss it here \n\nSeems you don't like ...,0,0.004635,3.4e-05,0.000684,4.6e-05,0.000789,9.8e-05,0,0,0,0,0,0
3,af0dce6ce84974ec,"""\nTo answer your question, no. There is no si...",0,0.001644,1.2e-05,0.00028,1e-05,0.000257,3.6e-05,0,0,0,0,0,0
4,a069e6d6d1a2348d,"""\n But Arpad can cite any webpage he finds, o...",0,0.000783,4e-06,0.000128,3e-06,0.0001,1.7e-05,0,0,0,0,0,0


In [56]:
# Persist the frame with the appended prediction columns; the row index is not written.
toxiccomment.to_csv(path_or_buf='lstm_toxiccomment.csv', index=False)

### Dataset 2: Jigsaw Unintended Bias Toxic Comment Classification

In [34]:
# Load the Jigsaw CSV and preview its first five rows.
jigsaw = pd.read_csv(filepath_or_buffer="..//Datasets/jigsaw/jigsaw.csv")
jigsaw.head(n=5)

Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,Threat_Jigsaw,toxicity_annotator_count,true_label
0,304799,Is your concern satisfied by the fact that the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
1,5293619,Comey's firing is a separate issue from the ma...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
2,5009206,klastri> besides the legal points. She shoul...,0.5,0.1,0.0,0.0,0.0,0.1,0.5,10,1
3,6122758,Trump has been the only one to threaten to wip...,0.536232,0.0,0.086957,0.0,0.057971,0.536232,0.043478,69,0
4,6018742,"Just listen to yourself.\n""the swamp"" (twice!)...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0


In [36]:
# Tokenize and pad the Jigsaw comment texts for the model.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because the following prediction cell reads it.
input = preprocess_text(
    texts=jigsaw['comment_text'].values,
    tokenizer=tokenizer,
)

In [38]:
# Predict probabilities for each label (assuming multi-label classification)
predictions = model.predict(input)

# Convert probabilities to binary labels: a label is flagged when its
# probability is at least `threshold`. The cut-off applied here is 0.01.
threshold = 0.01
binary_predictions = (predictions >= threshold).astype(int)

# Print results
print("Predicted probabilities:\n", predictions)
print("Binary predictions:\n", binary_predictions)

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step
Predicted probabilities:
 [[1.79644916e-02 1.10216344e-04 2.21086666e-03 1.96775014e-04
  3.31710861e-03 4.32194094e-04]
 [3.22220564e-01 1.34007481e-03 1.82849243e-02 2.16343114e-03
  6.03475347e-02 2.32016444e-02]
 [6.50705816e-03 1.95716930e-05 7.08196603e-04 3.55506891e-05
  9.09796858e-04 1.17542222e-04]
 ...
 [1.04074940e-01 6.46706438e-04 1.15983915e-02 1.67381414e-03
  2.32836008e-02 4.65251645e-03]
 [9.38696682e-01 5.85419312e-02 5.40449858e-01 4.12942581e-02
  6.04226589e-01 6.97873086e-02]
 [1.65631935e-01 9.33216885e-04 1.42971668e-02 2.44145119e-03
  3.03731449e-02 1.22662419e-02]]
Binary predictions:
 [[1 0 0 0 0 0]
 [1 0 1 0 1 1]
 [0 0 0 0 0 0]
 ...
 [1 0 1 0 1 0]
 [1 1 1 1 1 1]
 [1 0 1 0 1 1]]


In [44]:
# Column names for the per-label probabilities and the thresholded predictions.
prob_columns = [f"{label}_prob" for label in label_names]
pred_columns = [f"{label}_pred" for label in label_names]

# Convert predictions to DataFrames that share jigsaw's index, so the
# column-wise concat below pairs rows correctly whatever that index is.
probs_df = pd.DataFrame(predictions, columns=prob_columns, index=jigsaw.index)
binary_df = pd.DataFrame(binary_predictions, columns=pred_columns, index=jigsaw.index)

# Concatenate with the original DataFrame. Prediction columns left over from
# an earlier run of this cell are dropped first, so re-running the cell
# replaces them instead of appending duplicate columns.
jigsaw = pd.concat(
    [
        jigsaw.drop(columns=prob_columns + pred_columns, errors="ignore"),
        probs_df,
        binary_df,
    ],
    axis=1,
)

print("Updated Jigsaw DataFrame:")
jigsaw.head()

Updated Jigsaw DataFrame:


Unnamed: 0,id,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,Threat_Jigsaw,toxicity_annotator_count,...,obscene_prob,threat_prob,insult_prob,identity_hate_prob,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred
0,304799,Is your concern satisfied by the fact that the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,...,0.002211,0.000197,0.003317,0.000432,1,0,0,0,0,0
1,5293619,Comey's firing is a separate issue from the ma...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,...,0.018285,0.002163,0.060348,0.023202,1,0,1,0,1,1
2,5009206,klastri> besides the legal points. She shoul...,0.5,0.1,0.0,0.0,0.0,0.1,0.5,10,...,0.000708,3.6e-05,0.00091,0.000118,0,0,0,0,0,0
3,6122758,Trump has been the only one to threaten to wip...,0.536232,0.0,0.086957,0.0,0.057971,0.536232,0.043478,69,...,0.184227,0.046887,0.320257,0.092132,1,1,1,1,1,1
4,6018742,"Just listen to yourself.\n""the swamp"" (twice!)...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,...,0.002334,0.000235,0.004179,0.002365,1,0,0,0,0,0


In [46]:
# Persist the frame with the appended prediction columns; the row index is not written.
jigsaw.to_csv(path_or_buf='lstm_jigsaw.csv', index=False)

### Dataset 3: Combined Toxicity Profanity

In [60]:
# Load the Combined Toxicity/Profanity CSV and preview its first five rows.
combined = pd.read_csv(filepath_or_buffer='..//Datasets/Combined_Toxicity_Profanity/combined_en.csv')
combined.head(n=5)

Unnamed: 0,text,labels,encoded_labels,true_label
0,Agree. The number of morons that shoot animal...,['toxic' 'threat' 'insult'],"[1, 0, 1, 0, 1, 0, 0, 0, 0]",1
1,"""As long as we get to shoot them at some point...",['toxic' 'threat'],"[1, 0, 0, 0, 1, 0, 0, 0, 0]",1
2,mad know take never felt angry mad never get g...,['selfharm'],"[0, 0, 0, 0, 0, 0, 0, 1, 0]",0
3,The RINOS in charge of the Congress would neve...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0
4,Which of Trump's appointments do you believe w...,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0


In [62]:
# Display the text column that is fed to the model in the next step.
combined.loc[:, 'text']

0      Agree.  The number of morons that shoot animal...
1      "As long as we get to shoot them at some point...
2      mad know take never felt angry mad never get g...
3      The RINOS in charge of the Congress would neve...
4      Which of Trump's appointments do you believe w...
                             ...                        
855    Is it fair for the media to attribute somethin...
856    Funny you bought a Ford F350 for what was it $...
857    This time tomorrowThis time tomorrow hopefully...
858    Sad!!If police say they don't think it's drug ...
859    I don’t want to live anymoreI hate my life I’m...
Name: text, Length: 860, dtype: object

In [64]:
# Tokenize and pad the `text` column for the model.
inp = preprocess_text(
    texts=combined['text'].values,
    tokenizer=tokenizer,
)

In [66]:
# Score every row and rebind `combined` to the frame returned by predict(),
# which carries the added *_prob / *_pred columns.
combined = predict(model=model, inp=inp, dataframe=combined)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Predicted probabilities:
 [[6.53806567e-01 7.62355886e-03 1.52094111e-01 1.22744376e-02
  2.30443344e-01 3.30835320e-02]
 [2.61949170e-02 2.56138504e-04 3.35035170e-03 4.02235310e-04
  4.15367866e-03 2.35203747e-03]
 [9.83333230e-01 3.34376067e-01 9.22124326e-01 1.22339852e-01
  8.05881679e-01 2.23243400e-01]
 ...
 [2.00042091e-02 1.30337459e-04 3.47526325e-03 3.39726481e-04
  2.90672318e-03 5.50839468e-04]
 [4.47031140e-01 4.65856167e-03 9.45467949e-02 9.83688515e-03
  1.38886243e-01 1.22343805e-02]
 [8.38948429e-01 1.78659316e-02 4.93409216e-01 1.49861230e-02
  4.08281446e-01 2.85236854e-02]]
Binary predictions:
 [[1 0 1 1 1 1]
 [1 0 0 0 0 0]
 [1 1 1 1 1 1]
 ...
 [1 0 0 0 0 0]
 [1 0 1 0 1 1]
 [1 1 1 1 1 1]]


In [68]:
# Persist the frame with the appended prediction columns; the row index is not written.
combined.to_csv(path_or_buf='lstm_combined.csv', index=False)

### Dataset 4: Kaggle Suspicious

In [70]:
# Load the Kaggle suspicious-tweets CSV and preview its first five rows.
sus = pd.read_csv(filepath_or_buffer='..//Datasets/kaggle_suspicious/suspicious_tweets.csv')
sus.head(n=5)

Unnamed: 0,message,label
0,@MsLynnGallo Never mind - You missed the joke ...,0
1,turns out there had been #earthquake warnings ...,1
2,@susanhutchinson The other thing is the accent...,0
3,@LolKate18 You like Ladyhawke? You are so cool,0
4,#asot400 we want to see some pics of the locat...,0


In [72]:
# Tokenize and pad the `message` column for the model.
inp = preprocess_text(
    texts=sus['message'].values,
    tokenizer=tokenizer,
)

In [74]:
# Score every row, rebind `sus` to the frame returned by predict() (with the
# added *_prob / *_pred columns), then preview the first five rows.
sus = predict(model=model, inp=inp, dataframe=sus)
sus.head(n=5)

[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step
Predicted probabilities:
 [[1.2585135e-01 1.0951697e-03 2.1573350e-02 2.3088520e-03 2.7428510e-02
  1.7319557e-03]
 [6.9646724e-03 4.9814476e-05 8.3322031e-04 6.7086352e-05 1.1282997e-03
  1.7804861e-04]
 [1.6491294e-01 2.3924217e-03 5.6861009e-02 3.1840540e-03 5.0148748e-02
  4.0399116e-03]
 ...
 [2.0103531e-02 5.8373861e-04 5.3390432e-03 6.4292573e-04 5.0634402e-03
  9.1169984e-04]
 [7.4054990e-03 8.9069727e-05 1.3212711e-03 7.6010365e-05 1.6187924e-03
  1.5760210e-04]
 [1.3494208e-01 2.0593519e-03 2.3362296e-02 4.2898324e-03 3.7225455e-02
  5.1069953e-03]]
Binary predictions:
 [[1 0 1 0 1 0]
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]
 ...
 [1 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]]


Unnamed: 0,message,label,toxic_prob,severe_toxic_prob,obscene_prob,threat_prob,insult_prob,identity_hate_prob,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred
0,@MsLynnGallo Never mind - You missed the joke ...,0,0.125851,0.001095,0.021573,0.002309,0.027429,0.001732,1,0,1,0,1,0
1,turns out there had been #earthquake warnings ...,1,0.006965,5e-05,0.000833,6.7e-05,0.001128,0.000178,0,0,0,0,0,0
2,@susanhutchinson The other thing is the accent...,0,0.164913,0.002392,0.056861,0.003184,0.050149,0.00404,1,0,1,0,1,0
3,@LolKate18 You like Ladyhawke? You are so cool,0,0.308234,0.00412,0.036226,0.005711,0.101464,0.004473,1,0,1,0,1,0
4,#asot400 we want to see some pics of the locat...,0,0.004248,6.4e-05,0.000888,5.4e-05,0.000788,0.000184,0,0,0,0,0,0


In [76]:
# Persist the frame with the appended prediction columns; the row index is not written.
sus.to_csv(path_or_buf='lstm_sus.csv', index=False)

### Dataset 5: Lifethreat

In [148]:
# Load the threat-comments CSV and preview its first five rows.
lifethreat = pd.read_csv(filepath_or_buffer="..//Datasets/threat_comments/ThreatsComments.csv")
lifethreat.head(n=5)

Unnamed: 0,ID,Comments
0,1.0,White people attacking police and no one was s...
1,2.0,yall needed 20 officers for one guy why yall a...
2,3.0,I think that cop wanted him attack metro loves...
3,4.0,This is just like the judge attack. Got offend...
4,5.0,Then he got his face slammed into the wall.


In [156]:
# Make every comment a string before tokenizing. Missing comments are filled
# with "" first: on an object column astype(str) alone turns a missing value
# into the literal text "nan", which would then be tokenized and scored as if
# it were a real word.
lifethreat['Comments'] = lifethreat['Comments'].fillna("").astype(str)

In [158]:
# Tokenize and pad the `Comments` column for the model.
inp = preprocess_text(
    texts=lifethreat['Comments'].values,
    tokenizer=tokenizer,
)

In [162]:
# Score every row, rebind `lifethreat` to the frame returned by predict()
# (with the added *_prob / *_pred columns), then preview the first five rows.
lifethreat = predict(model=model, inp=inp, dataframe=lifethreat)
lifethreat.head(n=5)

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
Predicted probabilities:
 [[1.08517930e-01 9.54188814e-04 1.22024352e-02 2.10013450e-03
  2.08539050e-02 8.27379990e-03]
 [8.17211121e-02 1.15148828e-03 1.80808399e-02 1.81790930e-03
  2.39894688e-02 3.45176435e-03]
 [1.99137732e-01 1.25011965e-03 1.61745157e-02 3.16697080e-03
  3.79645452e-02 8.61750264e-03]
 ...
 [3.02052021e-01 2.25060037e-03 2.92701069e-02 2.86923046e-03
  7.49001876e-02 1.09777329e-02]
 [2.50468720e-02 1.14644405e-04 2.12581153e-03 2.50729063e-04
  3.32866260e-03 1.53685734e-03]
 [1.91160202e-01 4.93806903e-04 1.27471080e-02 1.16467651e-03
  3.33460793e-02 7.72207696e-03]]
Binary predictions:
 [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


Unnamed: 0,ID,Comments,toxic_prob,severe_toxic_prob,obscene_prob,threat_prob,insult_prob,identity_hate_prob,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred
0,1.0,White people attacking police and no one was s...,0.108518,0.000954,0.012202,0.0021,0.020854,0.008274,0,0,0,0,0,0
1,2.0,yall needed 20 officers for one guy why yall a...,0.081721,0.001151,0.018081,0.001818,0.023989,0.003452,0,0,0,0,0,0
2,3.0,I think that cop wanted him attack metro loves...,0.199138,0.00125,0.016175,0.003167,0.037965,0.008618,0,0,0,0,0,0
3,4.0,This is just like the judge attack. Got offend...,0.038864,0.000342,0.00531,0.000637,0.00632,0.0018,0,0,0,0,0,0
4,5.0,Then he got his face slammed into the wall.,0.025906,0.000512,0.005108,0.000751,0.005412,0.001019,0,0,0,0,0,0
