Importing the Modules Needed

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

Loading the Dataset

In [2]:
# Read the training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./dataset/train.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Preprocessing the Data with the TextVectorization Layer from TensorFlow

In [3]:
from tensorflow.keras.layers import TextVectorization

In [4]:
# Model input: the raw comment strings.
X = df['comment_text']
# Targets: every column from position 2 onward, as a NumPy array.
y = df.iloc[:, 2:].values

In [5]:
# Upper bound on the vocabulary size (number of distinct tokens kept).
MAX_FEATURES = 200_000

In [6]:
# Layer that turns raw text into integer token ids, padded or truncated
# to a fixed length of 1800 per comment.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [7]:
# Learn the vocabulary from the full set of comments.
vectorizer.adapt(data=X.values)

In [9]:
# Convert every comment to its fixed-length sequence of token ids.
raw_comments = X.values
vectorized_text = vectorizer(raw_comments)

Adapting the Dataset before Training

In [10]:
# Build the input pipeline over (token ids, labels) pairs.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/val/test splits are carved out of this dataset with take()/skip().
# By default shuffle() draws a new permutation every time the dataset is
# iterated, so each split would see a different ordering and examples would
# leak between them. Freezing the permutation (and seeding it) keeps the
# splits disjoint and reproducible. Trade-off: the order is the same on
# every epoch.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
# Keep up to 8 batches prepared ahead of the consumer.
dataset = dataset.prefetch(8)

Splitting the Dataset into Train, Validation and Test Datasets

In [11]:
# Split by batch count: first 70% for training, next 20% for validation,
# and 10% taken after skipping the first 90% for testing.
n_batches = len(dataset)
n_train_batches = int(n_batches*.7)

train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

Loading the Sequential Model and the LSTM, Dropout, Bidirectional, Dense and Embedding Layers to build the NLP Neural Network Model

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [13]:
# Stack: embedding -> bidirectional LSTM -> three dense layers -> six
# independent sigmoid outputs.
model = Sequential([
    # Token id -> 32-dimensional vector; table has MAX_FEATURES+1 rows.
    Embedding(MAX_FEATURES+1, 32),
    # Reads the sequence in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: six sigmoid units.
    Dense(6, activation='sigmoid'),
])

In [14]:
# Binary cross-entropy loss with the Adam optimizer.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [15]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

Start Training the Model

In [17]:
# Train for a single epoch, validating on the validation split; the returned
# History object is kept for plotting.
history = model.fit(train, validation_data=val, epochs=1)

 116/6981 [..............................] - ETA: 1:22:06 - loss: 0.1278

In [None]:
from matplotlib import pyplot as plt

Plotting the History Results of the NLP Model

In [None]:
# DataFrame.plot() creates its own figure, so the size must be passed to it
# directly; a separate plt.figure() call would only leave a blank figure behind.
pd.DataFrame(history.history).plot(figsize=(8,5))
plt.show()

Making Predictions

In [None]:
# Vectorize one hand-written comment for a quick manual prediction.
sample_comment = 'You freaking suck! I am going to hit you.'
input_text = vectorizer(sample_comment)

In [None]:
# The vectorizer returns a single 1-D sequence of token ids. Add a leading
# batch axis so the model receives one sample, matching how the later
# 'i hate you' prediction cell calls predict().
res = model.predict(np.expand_dims(input_text, 0))

In [None]:
# Turn the predicted scores into 0/1 flags using a 0.5 cut-off.
np.greater(res, 0.5).astype(int)

In [None]:
# Pull the first batch of the test split as NumPy arrays.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Predict on the sampled test batch and threshold the scores at 0.5.
batch_scores = model.predict(batch_X)
(batch_scores > 0.5).astype(int)

In [None]:
# Inspect the shape of the most recent prediction array.
res.shape

Evaluating the Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Metric objects that accumulate state across batches via update_state().
pre = Precision()
re = Recall()
# The targets are independent 0/1 flags, so accuracy is measured per flag
# against a 0.5 threshold. CategoricalAccuracy compares argmax positions and
# is meant for one-hot targets, which these flattened labels are not.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Walk the test split one batch at a time and feed every metric.
for X_true, y_true in test.as_numpy_iterator():
    # Predict, then collapse both predictions and labels to 1-D so each
    # (sample, label) pair is scored as one element.
    yhat = model.predict(X_true).flatten()
    y_true = y_true.flatten()

    # Update precision, recall and accuracy in that order.
    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Read the accumulated metric values and report them on one line.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

In [None]:
# Write the trained model to an HDF5 file.
model.save(filepath='./model/nlp_toxicity_model.h5')

In [11]:
# Restore the model from the saved HDF5 file.
model = tf.keras.models.load_model(filepath='./model/nlp_toxicity_model.h5')

In [27]:
# Vectorize a short test phrase for the reloaded model.
test_phrase = 'i hate you'
input_str = vectorizer(test_phrase)

In [28]:
# Add a leading batch axis so the single sequence is passed as one sample.
single_batch = np.expand_dims(input_str, axis=0)
res = model.predict(single_batch)



In [29]:
# Display the raw scores for the phrase: one row with six values.
res

array([[1.7583117e-02, 2.9971961e-05, 1.0124151e-02, 3.3520343e-04,
        2.8367525e-03, 7.5592234e-04]], dtype=float32)

Examples of Words Vectorized

In [20]:
# A single word maps to one token id followed by zeros up to length 1800.
vectorizer("Hello")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288,   0,   0, ...,   0,   0,   0], dtype=int64)>

In [8]:
# Two words map to two token ids followed by zeros up to length 1800.
vectorizer("fuck you")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([139,   7,   0, ...,   0,   0,   0], dtype=int64)>

Saving the Vectorizer Layer

In [9]:
import pickle

# Pickle the vectorizer's config and weights so the layer can be rebuilt later.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() inside the call would leak the handle.
with open("./model/vectorizer_layer.pkl", "wb") as pkl_file:
    pickle.dump({'config': vectorizer.get_config(),
                 'weights': vectorizer.get_weights()},
                pkl_file)

**********


Loading the Vectorizer Layer for a Word Test

In [10]:
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook (or another trusted source) produced.
# The context manager closes the file handle once the payload is read.
with open("./model/vectorizer_layer.pkl", "rb") as pkl_file:
    from_disk = pickle.load(pkl_file)
new_v = TextVectorization.from_config(from_disk['config'])

# Adapt on a dummy one-item dataset before installing the saved weights.
# NOTE(review): this looks like a workaround so the layer has weights to
# overwrite before set_weights is called; confirm it is still required.
new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])

# Check the restored layer: the vector for "fuck you" should match the one
# produced by the original vectorizer.
print(new_v("fuck you"))

tf.Tensor([139   7   0 ...   0   0   0], shape=(1800,), dtype=int64)
