0. Install Dependencies and Obtain Data

We are going to use a dataset from Kaggle containing comments that are labelled with several categories of toxicity.

In [47]:
# Install the notebook's third-party dependencies into the kernel's environment (versions are not pinned)
%pip install tensorflow pandas matplotlib scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [48]:
import os #for filepaths and folders
import pandas as pd #read in tabular data for csvz
import tensorflow as tf# deeplearning framework
import numpy as np 


In [49]:
os.getcwd()  # show the directory the notebook is running from

'/Users/mercy/Desktop/ToxicCommentModel/notebooks'

In [50]:
# Read the training data from train.csv.
# The path is relative to the notebook's working directory (the notebooks/ folder,
# a sibling of data/), so it no longer depends on one user's home directory.
df = pd.read_csv(os.path.join('..', 'data', 'train.csv'))

In [51]:
df.head()  # short preview of the table: the first five rows

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [52]:
df['comment_text'].iloc[4]  # text of the comment stored at row position 4

"You, sir, are my hero. Any chance you remember what page that's on?"

In [53]:
df.iloc[:, 2:].iloc[4]  # label values (columns from index 2 onward) for that same comment

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 4, dtype: int64

1. Preprocessing:
Convert the words of each comment into integer tokens with the TextVectorization layer, so the model can process them as numbers.

In [54]:
pip list

Package                      Version
---------------------------- -----------
absl-py                      2.1.0
appnope                      0.1.4
asttokens                    3.0.0
astunparse                   1.6.3
certifi                      2024.12.14
charset-normalizer           3.4.1
comm                         0.2.2
contourpy                    1.3.1
cycler                       0.12.1
debugpy                      1.8.11
decorator                    5.1.1
exceptiongroup               1.2.2
executing                    2.1.0
flatbuffers                  24.12.23
fonttools                    4.55.3
gast                         0.6.0
google-pasta                 0.2.0
grpcio                       1.68.1
h5py                         3.12.1
idna                         3.10
ipykernel                    6.29.5
ipython                      8.31.0
jedi                         0.19.2
joblib                       1.4.2
jupyter_client               8.6.3
jupyter_core                 5.7

In [55]:
from tensorflow.keras.layers import TextVectorization#to tokenize text

In [56]:
df.columns  # list all the column labels

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [57]:
# Features: the raw comment strings
X = df.loc[:, 'comment_text']
# Targets: every column from index 2 onward, as a NumPy array
y = df.iloc[:, 2:].values

In [58]:
df.iloc[:, 2:].values  # all label columns as one 2-D array, the form handed to TensorFlow

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [59]:
# Upper bound on the number of words kept in the vocabulary
MAX_FEATURES = 200_000

In [60]:
# Text-to-token layer: vocabulary capped at MAX_FEATURES, every word mapped to an
# integer id, and every output sequence fixed at 1800 positions.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [61]:
vectorizer.adapt(X.values)  # fit the vectorizer on the comments; .values passes a NumPy array instead of the pandas Series


In [62]:
# Example of text vectorization: tokenize a short string and show its first five ids
vectorizer("Hello World")[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([288, 263,   0,   0,   0])>

In [21]:
# Preview only the first entries of the learned vocabulary ('' and '[UNK]' come first);
# the full list can hold up to MAX_FEATURES tokens, far too many to display in a cell.
vectorizer.get_vocabulary()[:50]

['',
 '[UNK]',
 np.str_('the'),
 np.str_('to'),
 np.str_('of'),
 np.str_('and'),
 np.str_('a'),
 np.str_('you'),
 np.str_('i'),
 np.str_('is'),
 np.str_('that'),
 np.str_('in'),
 np.str_('it'),
 np.str_('for'),
 np.str_('this'),
 np.str_('not'),
 np.str_('on'),
 np.str_('be'),
 np.str_('as'),
 np.str_('have'),
 np.str_('are'),
 np.str_('your'),
 np.str_('with'),
 np.str_('if'),
 np.str_('article'),
 np.str_('was'),
 np.str_('or'),
 np.str_('but'),
 np.str_('page'),
 np.str_('my'),
 np.str_('an'),
 np.str_('from'),
 np.str_('by'),
 np.str_('do'),
 np.str_('at'),
 np.str_('about'),
 np.str_('me'),
 np.str_('so'),
 np.str_('wikipedia'),
 np.str_('can'),
 np.str_('what'),
 np.str_('there'),
 np.str_('all'),
 np.str_('has'),
 np.str_('will'),
 np.str_('talk'),
 np.str_('please'),
 np.str_('would'),
 np.str_('its'),
 np.str_('no'),
 np.str_('one'),
 np.str_('just'),
 np.str_('like'),
 np.str_('they'),
 np.str_('he'),
 np.str_('dont'),
 np.str_('which'),
 np.str_('any'),
 np.str_('been'),
 np

In [63]:
# Tokenize every comment in a single call. The result is a dense int64 tensor of
# shape (159571, 1800), i.e. roughly 2.3 GB held in memory.
vectorized_text = vectorizer(X.values)

In [64]:
len(X)  # number of comments in the dataset

159571

In [65]:
vectorized_text  # inspect the tokenized tensor: its shape, dtype and a truncated view of the ids

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

Create a TensorFlow Dataset pipeline

In [66]:
# Build the input pipeline over (token-id row, label row) pairs.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle once and keep that order on every pass. With the default
# reshuffle_each_iteration=True the order changes each time the dataset is
# iterated, so the take()/skip() partitions built from it would contain
# different samples every epoch and validation/test rows would leak into training.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)  # batches of 16 samples
dataset = dataset.prefetch(8)  # prepare upcoming batches ahead of time to prevent bottlenecks

In [67]:
# Pull a single batch: batch_X holds the tokenized comments, batch_Y the matching labels
batch_X, batch_Y = next(dataset.as_numpy_iterator())

In [68]:
# Create contiguous training, validation and test partitions.
# Sizes are counted in batches, not individual samples. The partitions stay
# disjoint across epochs only if `dataset` yields the same order on every pass.
n_batches = len(dataset)
train_size = int(n_batches*.7)  # 70 percent training
val_size = int(n_batches*.2)  # 20 percent validation
train = dataset.take(train_size)
val = dataset.skip(train_size).take(val_size)
# Test takes everything that is left (about 10 percent). Skipping exactly
# train_size + val_size closes the gap that int(n*.9) could leave between the
# validation and test partitions, and the final batch is no longer dropped.
test = dataset.skip(train_size + val_size)
                     


In [69]:
train_generator = train.as_numpy_iterator()  # iterator over the training partition that yields NumPy batches

In [76]:
next(train_generator)  # fetch the next (comments, labels) training batch

(array([[ 312,    4,  422, ...,    0,    0,    0],
        [1949,  410,   14, ...,    0,    0,    0],
        [  82,   20,    7, ...,    0,    0,    0],
        ...,
        [1909,   60, 1909, ...,    0,    0,    0],
        [5088, 2668,    5, ...,    0,    0,    0],
        [  34,    2,  401, ...,    0,    0,    0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

2. Create Sequential Model


In [81]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [85]:
y.shape  # (number of comments, number of label columns)

(159571, 6)

In [82]:
# Assemble the network; layers are listed in the order data flows through them.
model = Sequential([
    # Embedding table with MAX_FEATURES + 1 rows, each a 32-dimensional vector
    Embedding(MAX_FEATURES+1, 32),
    # LSTM with 32 units and tanh activation, wrapped so the sequence is read in both directions
    Bidirectional(LSTM(32, activation='tanh')),
    # Three fully connected feature-extraction layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # One sigmoid output per label, each squashed to a value between 0 and 1
    Dense(6, activation='sigmoid'),
])

In [83]:
# Binary cross-entropy loss over the six sigmoid outputs, optimized with Adam
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [86]:
model.summary()  # print the layer-by-layer overview of the model

In [None]:
# Train for five epochs on the training partition, evaluating on the validation partition
history = model.fit(train, validation_data=val, epochs=5)


Epoch 1/5


In [None]:
history.history  # values recorded by fit() for each epoch