In [1]:
! pip install tensorflow pandas matplotlip

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlip
  Downloading matplotlip-0.2-py3-none-any.whl.metadata (700 bytes)
Downloading matplotlip-0.2-py3-none-any.whl (1.8 kB)
Installing collected packages: matplotlip
Successfully installed matplotlip-0.2



[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# 0. Install Dependencies and Bring in Data

In [2]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np




In [3]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


# 1. Preprocess

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X=df["comment_text"]
y=df[df.columns[2:]].values

In [6]:
#The number of words in the vocab
Max_Features=200000

In [7]:
vectorizer=TextVectorization(max_tokens=Max_Features,
                             output_sequence_length=1800,
                             output_mode='int')




In [8]:
vectorizer.adapt(X.values)




In [9]:
vectorized_text= vectorizer(X.values)

In [10]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [11]:
#MCSHBAP = map, cashe, shuffle, batch, prefetch
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [12]:
batch_X, batch_y=dataset.as_numpy_iterator().next()

In [13]:
train = dataset.take(int(len(dataset)*.7))
test = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
val = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [14]:
train_generator=train.as_numpy_iterator()

In [15]:
train_generator.next()

(array([[ 3003,     4,    21, ...,     0,     0,     0],
        [  509,  1048,   468, ...,     0,     0,     0],
        [ 2841,   929,   349, ...,     0,     0,     0],
        ...,
        [17746, 88246,     6, ...,     0,     0,     0],
        [   49,  7883,  4607, ...,     0,     0,     0],
        [  863,   306,   144, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

# 2. Create Sequential Model

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Input

# Create the model
model = Sequential()

# Explicitly define the input layer shape
model.add(Input(shape=(1800,)))

# Create the embedding layer
model.add(Embedding(Max_Features+1, 32))

# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Final layer
model.add(Dense(6, activation='sigmoid'))

# Compile the model
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

# Print the model summary
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1800, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                       

In [17]:
history = model.fit(train, epochs=1, validation_data=val)



In [18]:
model.save('toxicity.h5')

  saving_api.save_model(


# 3. Evaluate Model

In [19]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [20]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [21]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



























In [22]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.7906487584114075, Recall:0.7108363509178162, Accuracy:0.4663991928100586


# 4. Make Predictions

In [49]:
df.columns[2:-1]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult'], dtype='object')

In [50]:
input_str = vectorizer('hey i freaken hate you!')

In [51]:
res = model.predict(np.expand_dims(input_str,0))
res



array([[0.7739172 , 0.04032751, 0.38799563, 0.0415929 , 0.4190651 ,
        0.08014166]], dtype=float32)

In [44]:
input_text = vectorizer('You freaking suck!')


In [47]:
res = model.predict(np.expand_dims(input_text,0))
res



array([[0.99715525, 0.4435976 , 0.92595154, 0.06175226, 0.9014419 ,
        0.14713843]], dtype=float32)

In [48]:
input_text = vectorizer('I love you')
res = model.predict(np.expand_dims(input_text,0))
res



array([[0.14897762, 0.0012239 , 0.04708375, 0.00842597, 0.06318499,
        0.02862271]], dtype=float32)