# 0. Install Dependencies and Bring in Data

In [None]:
!pip install tensorflow pandas matplotlib scikit-learn

In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
# Load the training CSV; the path is relative to the notebook's working directory
df = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')

In [4]:
# Combine the six per-category labels into a single `is_toxic` column holding the
# row-wise maximum (i.e. 1 whenever any category is 1, assuming 0/1 labels)
df['is_toxic'] = df[[
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]].max(axis='columns')

In [6]:
# Keep only the comment text and the combined `is_toxic` label; every other
# column (including the six per-category labels) is gone from `df` after this cell
df = df[['comment_text', 'is_toxic']]

In [7]:
# Preview the first rows of the reduced frame
df.head()

Unnamed: 0,comment_text,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


# 1. Preprocess

In [8]:
from tensorflow.keras.layers import TextVectorization

In [14]:
# Inputs are the raw comment strings (a Series); targets are the labels as a NumPy array
X, y = df['comment_text'], df['is_toxic'].values

In [15]:
# Upper bound on the number of words kept in the vocabulary
MAX_FEATURES = 200_000

In [16]:
# Text -> integer token ids, with every output sequence fixed at 1800 positions
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [17]:
# Fit the vectorizer's vocabulary on the comment texts
vectorizer.adapt(X.values)

In [18]:
# Vectorize every comment in one call; the whole result is held in memory
vectorized_text = vectorizer(X.values)

In [19]:
# MCSHBAP - map, cache, shuffle, batch, prefetch  from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle once and keep that order on every pass over the data. With the default
# reshuffle_each_iteration=True, the take()/skip() split made from this dataset
# would pick different examples on each iteration, so validation/test examples
# would leak into training. Re-shuffle the training split separately if training
# for more than one epoch.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [20]:
# Split by batch count: first 70% train, next 20% validation, last 10% test
n_batches = len(dataset)
n_train = int(n_batches*.7)
train = dataset.take(n_train)
val = dataset.skip(n_train).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

# 2. Create Sequential Model

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [27]:
# Embedding -> bidirectional LSTM -> three dense layers -> one sigmoid output
model = Sequential([
    # Embedding table with MAX_FEATURES+1 rows of 32-dimensional vectors
    Embedding(MAX_FEATURES+1, 32),
    # Sequence encoder read in both directions
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Single probability output
    Dense(1, activation='sigmoid'),
])

In [28]:
# Binary cross-entropy pairs with the single sigmoid output; no extra metrics
# are requested here, so fit() reports loss only
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [29]:
# Show the layer-by-layer summary of the model
model.summary()

In [None]:
# Train for one epoch on the train split, evaluating on the validation split
history = model.fit(train, epochs=1, validation_data=val)

[1m1489/6981[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m28:11[0m 308ms/step - loss: 0.2572

# 3. Make Predictions

In [None]:
# Vectorize one sample string; the batch axis is added in the next cell
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [None]:
# Add a leading batch axis so the model receives a batch of one
res = model.predict(np.expand_dims(input_text, 0))

In [None]:
# Threshold the predicted score at 0.5 to get a 0/1 label
(res > 0.5).astype(int)

In [None]:
# Pull the first batch of (inputs, labels) from the test split as NumPy arrays
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Predicted 0/1 labels for the batch drawn from the test split
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
# Note: `res` is the earlier single-comment prediction, not the batch prediction above
res.shape

# 4. Evaluate Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics, accumulated batch by batch in the evaluation loop
pre = Precision()
re = Recall()
# The model has a single sigmoid output, so accuracy has to threshold each
# probability (BinaryAccuracy). CategoricalAccuracy compares argmax positions along
# the last axis, which is meaningless for flattened 0/1 labels.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision / recall / accuracy over every batch of the test split
for X_true, y_true in test.as_numpy_iterator():
    # verbose=0: otherwise predict() prints a progress bar for every single batch
    yhat = model.predict(X_true, verbose=0)

    # Flatten so labels and predictions are both 1-D vectors of equal length
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    # Fold this batch into each metric's running state
    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Read the accumulated metric values out as NumPy scalars and report them
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

# 5. Test and Gradio

In [None]:
# Persist the trained model to disk as an HDF5 (.h5) file
# NOTE(review): newer Keras releases recommend the native `.keras` format — confirm
# nothing else loads this path before renaming
model.save('toxicity.h5')

In [None]:
# Reload the saved file; this rebinds `model` to the loaded copy
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
# Vectorize a sample comment to try against the reloaded model
input_str = vectorizer('hey i freaken hate you!')

In [None]:
# Predict on a batch of one (leading axis added with expand_dims); overwrites `res`
res = model.predict(np.expand_dims(input_str,0))

In [None]:
# Display the raw model output for the sample comment
res

In [None]:
def score_comment(comment):
    """
    Score a single comment with the trained model and format the result.

    Args:
        comment (str): The comment to be scored.

    Returns:
        str: A report with a header, one "<label>: True/False" line per model
            output (True when the predicted score exceeds 0.5), and a footer.
    """
    # Wrap the comment in a list so the vectorizer yields a batch of one
    vectorized_comment = vectorizer([comment])

    # First (and only) row of the prediction: one score per model output
    scores = model.predict(vectorized_comment)[0]

    # `df` was reduced to ['comment_text', 'is_toxic'], so label names start at
    # column 1. The previous df.columns[2:] was empty, which meant the report
    # never contained a prediction line.
    labels = df.columns[1:]

    text = "Comment Scoring Results:\n"
    text += "-" * 30 + "\n"

    # zip() pairs each label with its score and stops at the shorter of the two
    for col, score in zip(labels, scores):
        prediction = "True" if score > 0.5 else "False"
        text += f"{col:<20}: {prediction}\n"

    text += "-" * 30
    return text

In [None]:
# Try the report formatter on a sample comment
print(score_comment("I am going to kick out all my fucking staff tonight"))