### Install Prerequisites

In [1]:
!pip install gradio jinja2



### Importing Libraries

In [3]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append('C:/Users/JOY/Desktop/Final Project')

from sklearn.model_selection import train_test_split

from keras import backend as K
import tensorflow as tf
import tensorflow.keras.preprocessing.text as tk
import tensorflow.keras.preprocessing.sequence as ps
from tensorflow.keras.layers import Input, Dropout, Bidirectional, Dense, Embedding, Conv1D
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate

import Caribe as cb
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

In [9]:
# Labelled training comments (the CSV lives in a folder that is itself named train.csv).
JIGSAW_DIR = 'datasets/jigsaw-toxic-comment-classification-challenge'
df = pd.read_csv(os.path.join(JIGSAW_DIR, 'train.csv', 'train.csv'))

# Pre-trained 100-dimensional GloVe vectors, read later to build the embedding matrix.
GLOVE_EMBEDDING = "datasets/glove.6B.100d.txt"

# Lookup tables used by the pre-processing filters:
#   df1 - abbreviations (file has no header row, so column names are supplied here)
#   df2 - emojis
#   df3 - Twitter slang
column_names = ["short", "long"]
df1 = pd.read_csv("datasets/abbrevations.csv", names=column_names)
df2 = pd.read_csv("datasets/emoji_df.csv")
df3 = pd.read_csv("datasets/twitterSlang.csv")

### Train Test Split

In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.3, random_state=1)

# Report the size of each partition.
for split_label, split_frame in (("Train:", train), ("Test:", test)):
    print(split_label, split_frame.shape)

Train: (111699, 8)
Test: (47872, 8)


In [11]:
# Names of the six binary target columns, in the order the model predicts them.
LABEL_COLUMNS = ["Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate"]

# Series.fillna returns a new Series, so its result must be used: the previous version
# discarded it, leaving any missing comment as NaN in the text handed to the tokenizer.
x_train = train["comment_text"].fillna("fillna").str.lower()
x_test = test["comment_text"].fillna("fillna").str.lower()

# Label matrices of shape (rows, 6).
y_train = train[LABEL_COLUMNS].values
y_test = test[LABEL_COLUMNS].values

### Tokenizer

In [12]:
# Vocabulary cap, padded sequence length and embedding dimensionality.
max_words = 100000
max_len = 150
embed_size = 100

# The vocabulary is learned from the training comments only.
tokenizer = tk.Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(x_train)

# Text -> integer id sequences -> fixed-length arrays of width max_len.
x_train = ps.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len)
x_test = ps.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len)

### GloVe Embedding Matrix

In [13]:
# Parse the GloVe text file: each line is "<word> <v1> <v2> ...", separated by single spaces.
embeddings_index = {}
with open(GLOVE_EMBEDDING, encoding='utf8') as f:
    for line in f:
        word, *coefficients = line.rstrip().rsplit(' ')
        embeddings_index[word] = np.asarray(coefficients, dtype='float32')

word_index = tokenizer.word_index

# One row per token id that can be looked up; row 0 is never filled by the loop below.
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size), dtype='float32')

# Copy the pre-trained vector for every in-vocabulary word that GloVe knows;
# words without a GloVe vector keep their all-zero row.
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### Precision, Recall, and F1 Metrics

In [14]:
def recall_m(y_true, y_pred):
    """Recall over the tensors passed in: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the tensors passed in: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m for the same inputs.

    K.epsilon() in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

### Defining the BiGRU-CNN Model

In [15]:
# Architecture: frozen GloVe embedding -> bidirectional GRU -> 1-D convolution ->
# concatenated global average/max pooling -> six sigmoid outputs (one per label).

# Named `model_input` rather than `input` so the Python builtin input() is not shadowed
# for the rest of the notebook.
model_input = Input(shape=(max_len,))

# The vocabulary size is taken from the matrix itself: embedding_matrix has
# min(max_words, vocabulary size + 1) rows, so hard-coding max_words here would cause a
# weight-shape mismatch whenever the fitted vocabulary is smaller than max_words.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(model_input)

x = Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, dropout=0.1,
                                      recurrent_dropout=0.1))(x)

x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

# Summarise the convolved sequence with both its mean and its maximum over time.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)

x = concatenate([avg_pool, max_pool])

# Independent sigmoid per label: a comment may belong to several categories at once.
preds = Dense(6, activation="sigmoid")(x)

model = tf.keras.Model(model_input, preds)



### Summarize and Compile Model

In [16]:
# Print the layer table, then configure training: binary cross-entropy because each of
# the six labels is an independent yes/no decision.
model.summary()

# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['acc', f1_m, precision_m, recall_m],
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 150)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 150, 100)     10000000    ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 150, 256)     176640      ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 148, 64)      49216       ['bidirectional[0][0]']          
                                                                                              

  super(Adam, self).__init__(name, **kwargs)


### Train Model

In [17]:
batch_size = 128

# Weights are checkpointed to this path after every epoch.
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1)

# NOTE: with epochs=3 below, a patience of 5 epochs can never trigger early stopping.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs')

callbacks = [early_stopping, tensorboard, cp_callback]

# 20% of the training rows are held back by Keras for validation.
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=3,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/3
Epoch 00001: saving model to training_1\cp.ckpt
Epoch 2/3
Epoch 00002: saving model to training_1\cp.ckpt
Epoch 3/3
Epoch 00003: saving model to training_1\cp.ckpt


<keras.callbacks.History at 0x26594edad88>

### Load the Latest Checkpoint

In [18]:
# Restore the most recent weights written by the ModelCheckpoint callback.
latest = tf.train.latest_checkpoint(checkpoint_dir)

# latest_checkpoint returns None when the directory holds no checkpoint; fail with a
# clear message instead of an obscure error from load_weights(None).
if latest is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; run the training cell first."
    )

model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x265952bf388>

### Model Predictions

In [19]:
# Sanity check on a single training example: show its text, true labels and predicted scores.
sample_index = 43
sample_sequence = x_train[sample_index]

# The model expects a batch, so add a leading axis of size one.
predictions = model.predict(sample_sequence[np.newaxis, ...])

print(tokenizer.sequences_to_texts([sample_sequence]))
print(y_train[sample_index])
print(predictions)

['article creation where would i go to start an article thanks']
[0 0 0 0 0 0]
[[1.1243155e-03 7.9698846e-05 1.1842737e-03 9.8319390e-05 6.3161750e-04
  8.7936380e-05]]


### Model Evaluation

In [20]:
# evaluate() returns the loss followed by the metrics in the order they were compiled:
# 'acc', f1_m, precision_m, recall_m.
evaluation = model.evaluate(x_test, y_test, verbose=1)
loss, accuracy, f1_score, precision, recall = evaluation



In [21]:
# Report the held-out test metrics on one line.
print('Precision: {}, Recall:{}, Accuracy:{}, F1:{}'.format(precision, recall, accuracy, f1_score))

Precision: 0.6972138285636902, Recall:0.6637949347496033, Accuracy:0.8493691682815552, F1:0.6574142575263977


### Save the Model 

In [22]:
# Persist the full model (architecture + weights) as a single HDF5 file.
model_path = 'models/rnn03.h5'
model.save(model_path)

### Load the Model

In [27]:
# The custom metric functions are not part of Keras, so they must be supplied by name
# when the saved model is deserialised.
custom_metrics = {metric.__name__: metric for metric in (f1_m, precision_m, recall_m)}
model = tf.keras.models.load_model('models/rnn03.h5', custom_objects=custom_metrics)



### Pre-Processing Filters

In [28]:
def _last_value_lookup(frame, key_column, lowercase_keys=False):
    """Map every string key in `key_column` to the value in the frame's second column.

    When a key occurs more than once the last row wins. Rows whose key or value is not
    a string (e.g. NaN) and rows with an empty key are skipped.
    """
    lookup = {}
    for key, value in zip(frame[key_column], frame.iloc[:, 1]):
        if not isinstance(key, str) or not key or not isinstance(value, str):
            continue
        lookup[key.lower() if lowercase_keys else key] = value
    return lookup


def _expand_words(comment, lookup):
    """Replace each space-delimited word of `comment` that is a key of `lookup`."""
    return " ".join(lookup.get(word, word) for word in comment.split(" "))


def preprocessingfilters(comment):
    """Normalise a user comment before it is classified.

    Steps, in order:
      1. lower-case the text;
      2. expand slang words using df3 (key column 'slang');
      3. expand abbreviations using df1 (key column 'short');
      4. replace every emoji listed in df2 (key column 'emoji') by ", having <text>";
      5. pass the result through cb.caribe_corrector.

    In all three tables the replacement text is read from the second column.
    Slang and abbreviations are matched against whole space-delimited words only, so a
    short key such as "u" no longer rewrites the inside of other words. Replacements
    happen only for keys actually present in the comment; previously the replacement
    ran for every table row with a stale (or not yet defined) expansion.

    Parameters
    ----------
    comment : str
        Raw text entered by the user.

    Returns
    -------
    str
        The cleaned text returned by cb.caribe_corrector.
    """
    print("\nOriginal sentence : " ,comment)

    comment = comment.lower()

    # Slang
    comment = _expand_words(comment, _last_value_lookup(df3, 'slang', lowercase_keys=True)).lower()
    print("Slang Expanded sentence : " ,comment)

    # Abbreviation
    comment = _expand_words(comment, _last_value_lookup(df1, 'short', lowercase_keys=True)).lower()
    print("Abberivation Expanded sentence : " ,comment)

    # Emojis: matched as substrings because they are often not separated by spaces.
    for emoji, description in _last_value_lookup(df2, 'emoji').items():
        if emoji in comment:
            comment = comment.replace(emoji, ", having " + description)
    print("Emojis Expanded sentence : " ,comment)

    comment=cb.caribe_corrector(comment)
    print("Correct sentence : " ,comment)
    return comment

### User Interface

In [29]:
#Importing detoxification function from detox file
from detox import detoxification





`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Generating outputs: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
Decoding outputs: 100%|██████████| 1/1 [00:07<00:00,  7.36s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JOY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


["I'm tired of the school shootings by black people"]


Generating outputs: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
Decoding outputs: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU score -> 1.1640469867513693e-231 text -> ['I', 'will', 'kill', 'you'] pred -> ['I', 'will', 'not', 'let', 'you', 'down']
CS score -> 0.0 text -> I will kill you pred -> I will not let you down


  % sorted(inconsistent)


### UI Action Function

In [30]:
def _join_text(value):
    """Flatten a string, or a (possibly nested) list/tuple of strings, into one string."""
    if isinstance(value, (list, tuple)):
        return " ".join(_join_text(item) for item in value)
    return str(value)


def score_comment(comment, threshold=0.1):
    """Classify a comment and, if it is toxic, produce a detoxified rewrite.

    The comment is first cleaned with preprocessingfilters; the cleaned text is what
    gets tokenised, scored by the model and passed to detoxification, so that
    the pre-processing actually influences the prediction (previously the raw comment
    was scored and the cleaned text was only echoed back).

    Parameters
    ----------
    comment : str
        Raw text entered by the user.
    threshold : float, default 0.1
        Score above which a category is reported as present. Detoxification is run
        only when the first category's score exceeds it.

    Returns
    -------
    tuple
        (cleaned comment, DataFrame with columns 'Catagory' / 'Result' / 'Percent',
        detoxified text or "Non Toxic Sentence").
    """
    fcomment = preprocessingfilters(comment)

    sequences = tokenizer.texts_to_sequences([fcomment])
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)

    # One row of scores, one value per label column.
    scores = model.predict(padded)[0]

    if scores[0] > threshold:
        # detoxification appears to return a list of strings; join it directly rather
        # than slicing its repr, which left stray quote characters in the output.
        pred = _join_text(detoxification(fcomment))
    else:
        pred = "Non Toxic Sentence"

    # Label names are the dataframe columns after the first two (id and comment text).
    rows = [
        {
            'Catagory': col,
            'Result': bool(scores[idx] > threshold),
            'Percent': round(float(scores[idx]) * 100, 2),
        }
        for idx, col in enumerate(df.columns[2:])
    ]

    return fcomment, pd.DataFrame(rows), pred

### UI Modeling

In [31]:
# Layout: one input box, a submit button and three outputs that receive, in order, the
# three values returned by score_comment.
with gr.Blocks() as interface:
    # gr.Textbox replaces the deprecated gr.inputs.Textbox.
    name = gr.Textbox(lines=2, placeholder='Enter Your Sentence', label="Input Sentence")
    greet_btn = gr.Button("Submit")
    output = [
        gr.Textbox(label="Did you mean?"),
        gr.Dataframe(label="Toxicity Detection & Classification", headers=['Catagory', 'Result', 'Percent']),
        gr.Textbox(label="Detoxification"),
    ]
    greet_btn.click(fn=score_comment, inputs=name, outputs=output)

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",


### UI Launch

In [32]:
# Start the Gradio app. share=True additionally requests a temporary public URL, so
# anyone holding that link can reach this notebook's model while the cell is running.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://07a7a3681080d53e.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


