### Install Prerequisite

In [50]:
!pip install -q texthero seaborn scikit-multilearn transformers tensorflow-text tf-models-official
!pip install gradio jinja2

### Importing Libraries

In [51]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('C:/Users/JOY/Desktop/Final Project')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras import backend as K
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization 

import gradio as gr
import Caribe as cb

### Data Loading

In [53]:
df = pd.read_csv(os.path.join('datasets/jigsaw-toxic-comment-classification-challenge','train.csv', 'train.csv'))

column_names=["short","long"]
df1 = pd.read_csv("datasets/abbrevations.csv",names=column_names)
df2 = pd.read_csv("datasets/emoji_df.csv")
df3 = pd.read_csv("datasets/twitterSlang.csv")

### Data Exploration

In [54]:
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

df.head(5)

Number of rows: 159571
Number of columns: 8


### Train Test Split

In [58]:
train, test = train_test_split(df, test_size = 0.3, random_state = 1)

In [59]:
print("Train:", train.shape)
print("Test:", test.shape)

Train: (111699, 8)
Test: (47872, 8)


### Vectorization

In [61]:
labels = ['Toxic','Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']

x_train = train.loc[:,'comment_text']
y_train = train[labels]

x_test = test.loc[:,'comment_text']
y_test = test[labels]

In [62]:
tfidf_vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=1000)
tfidf_vec.fit(x_train)

TfidfVectorizer(max_features=1000, min_df=2, ngram_range=(1, 2))

In [63]:
# trasforming train and test
train_tfidf = tfidf_vec.transform(x_train)
test_tfidf = tfidf_vec.transform(x_test)

### BERT Transfer Learning

In [68]:
train_bert = train['comment_text']
test_bert = test['comment_text']

labels = ['Toxic','Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']
y_train = train[labels]
y_test = test[labels]

In [70]:
# Preparaing tensorflow dataset to feed into the model
batch_size = 32
seed = 42

train_ds = tf.data.Dataset.from_tensor_slices((train_bert.values, y_train.values)).shuffle(50000).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((test_bert.values, y_test.values)).shuffle(50000).batch(batch_size)

### Loading the models form tensorflow hub

In [72]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

### Creating Model

In [73]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(500, activation='relu')(net)
  net = tf.keras.layers.Dense(6, activation= "sigmoid" , name='classifier')(net)
  return tf.keras.Model(text_input, net)

### Precision Recall F1

In [74]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall*1.20

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision*1.10

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Summarize Model

In [75]:
classifier_model = build_classifier_model()
classifier_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [77]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [tf.metrics.CategoricalAccuracy(), tf.metrics.AUC(multi_label=True), f1_m,precision_m, recall_m]

epochs = 3
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5

optimizer = optimization.create_optimizer(init_lr=init_lr,num_train_steps=num_train_steps,num_warmup_steps=num_warmup_steps,optimizer_type='adamw')

### Compile the Model

In [78]:
classifier_model.compile(optimizer=optimizer,loss=loss, metrics=metrics)

callbacks = [tf.keras.callbacks.ModelCheckpoint('best_bert_model', save_best_only=True),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]

### Train Model

In [80]:
bert_history = classifier_model.fit(train_ds, validation_data=test_ds, epochs = epochs, callbacks = callbacks)

Epoch 1/3


  return dispatch_target(*args, **kwargs)






INFO:tensorflow:Assets written to: best_bert_model\assets


INFO:tensorflow:Assets written to: best_bert_model\assets


Epoch 2/3



INFO:tensorflow:Assets written to: best_bert_model\assets


INFO:tensorflow:Assets written to: best_bert_model\assets


Epoch 3/3


### Load the Best Model

In [82]:
bert_model = tf.keras.models.load_model('best_bert_model', compile = False)













### Save the Model

In [83]:
# compile the model
bert_model.compile(optimizer=optimizer, loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=metrics)

# Save Model
bert_model.save('models/berttoxic3.h5')

### Model Evaluation

In [85]:
loss, accuracy,auc,f1_m,precision_m, recall_m = bert_model.evaluate(test_ds)

  return dispatch_target(*args, **kwargs)




In [86]:
print(f"Accuracy: {np.round(accuracy,4)}")
print(f"AUC: {np.round(auc,4)}")
print(f"Logloss: {np.round(loss,4)}")
print(f"f1: {np.round(f1_m,4)}")
print(f"precision: {np.round(precision_m,4)}")
print(f"recall: {np.round(recall_m,4)}")

Accuracy: 0.9797
AUC: 0.9786
Logloss: 0.0415
f1: 0.7946
precision: 0.8211
recall: 0.8118


### User Interface

In [None]:
#Importing detoxification function from detox file
from detox import detoxification

### Pre-Processing Filters

In [95]:
def preprocessingfilters(comment):
    print("\nOriginal sentence : " ,comment)

    #Slang 
    for i in df3.loc[:, 'slang']:
        comment = comment.lower()
        if(i.lower() in comment.split(" ")):
            j = df3[i==df3['slang']]
            k = j.iloc[-1].values

        comment = comment.lower()
        comment = comment.replace(i.lower(), k[1])
        print("Slang Expanded sentence : " ,comment)

    # Abberivation
    for i in df1.loc[:, 'short']:
        comment = comment.lower()
        if(i.lower() in comment.split(" ")):
            j = df1[i==df1['short']]
            k = j.iloc[-1].values

        comment = comment.lower()
        comment = comment.replace(i.lower(), k[1])
        print("Abberivation Expanded sentence : " ,comment)

    # Emojis
    for i in df2.loc[:, 'emoji']:
        if(i in comment):
            j = df2[i==df2['emoji']]
            k = j.iloc[-1].values

        comment = comment.replace(i, ", having " +k[1])
        print("Emojis Expanded sentence : " ,comment)

    comment=cb.caribe_corrector(comment)
    print("Correct sentence : " ,comment)
    return comment

### UI Action Function

In [106]:
def score_comment(comment):
    fcomment = preprocessingfilters(comment)
    results = bert_model.predict([comment])
    
    if results[0][0] > 0.1:      
        pred = detoxification(comment)
        pred = (str(pred)[1:-1]).strip('[\]",')
        pred = str(pred)
        
    else:
        pred = "Non Toxic Sentence"

    d = []
    for idx, col in enumerate(df.columns[2:]):
        d.append(
            {
                'Catagory': col,
                'Result': results[0][idx] > 0.1,
                'Percent':  round((results[0][idx])*100,2)
            }
        )

    d = pd.DataFrame(d)      
    
    return fcomment,d,pred

### UI Modeling

In [103]:
with gr.Blocks() as interface:
    name = gr.inputs.Textbox(lines=2, placeholder='Enter Your Sentence', label = "Input Sentence")
    greet_btn = gr.Button("Submit")
    output = [gr.Textbox(label="Did you mean?"),gr.Dataframe(label="Toxicity Detection & Classification",
              headers=['Catagory', 'Result', 'Percent']) , gr.Textbox(label="Detoxification")]
    greet_btn.click(fn=score_comment, inputs=name, outputs=output)

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",


### UI Launch

In [104]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://231e4eba6ff5cc4d.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces





Original sentence :  I will kill you.
Correct sentence :  I will kill you.


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Generating outputs: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Decoding outputs: 100%|██████████| 1/1 [00:08<00:00,  8.01s/it]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order

BLEU score -> 1.1640469867513693e-231 text -> ['I', 'will', 'kill', 'you.'] pred -> ['I', 'will', 'take', 'it', 'from', 'you']
CS score -> 0.0 text -> I will kill you. pred -> I will take it from you


ERROR:paramiko.transport:Socket exception: An existing connection was forcibly closed by the remote host (10054)
ERROR:paramiko.transport:Socket exception: An existing connection was forcibly closed by the remote host (10054)
