0. Install Depndencies

In [10]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [11]:
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge/','train.csv'))

In [12]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


1. Preprocessing

In [13]:
from tensorflow.keras.layers import TextVectorization

In [14]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [15]:
MAX_FEATURES = 200_000

In [16]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [17]:
vectorizer.adapt(X.values)

In [18]:
vectorized_text = vectorizer(X.values)

In [20]:
#MCSHBAP, map, cache, shuffle, batch, prefetch, from_tensor_slices, list_files
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160_000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [21]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take((int(len(dataset)*.2)))
test = dataset.skip(int(len(dataset)*.9)).take((int(len(dataset)*.1)))

2. Creat Model

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Embedding

In [24]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [25]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
history = model.fit(train, epochs=1, validation_data=val)

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.figure(figsize=(8, 5))
pd.DataFrame(history.history).plot()
plt.show()

3.Predict

In [28]:
input_text = vectorizer('you suck loser!')

In [30]:
res = model.predict(np.expand_dims(input_text,0))



In [31]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [32]:
batch_x, batch_y = test.as_numpy_iterator().next()

In [33]:
(model.predict(batch_x) > 0.5).astype(int)



array([[1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

4. Evaluation

In [34]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [35]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [40]:
for batch in test.as_numpy_iterator():
    
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)    
    acc.update_state(y_true, yhat)    



In [41]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8274014592170715, Recall:0.6084998846054077, Accuracy:0.45937812328338623


5.Save model and deploy it on Gradio

In [42]:
!pip install gradio jinja2

Collecting gradio
  Downloading gradio-3.15.0-py3-none-any.whl (13.8 MB)
     ---------------------------------------- 13.8/13.8 MB 9.5 MB/s eta 0:00:00
Collecting uvicorn
  Downloading uvicorn-0.20.0-py3-none-any.whl (56 kB)
     ---------------------------------------- 56.9/56.9 kB ? eta 0:00:00
Collecting httpx
  Downloading httpx-0.23.1-py3-none-any.whl (84 kB)
     ---------------------------------------- 85.0/85.0 kB ? eta 0:00:00
Collecting fastapi
  Downloading fastapi-0.88.0-py3-none-any.whl (55 kB)
     ---------------------------------------- 55.5/55.5 kB 2.8 MB/s eta 0:00:00
Collecting orjson
  Downloading orjson-3.8.3-cp310-none-win_amd64.whl (200 kB)
     ------------------------------------- 200.2/200.2 kB 11.9 MB/s eta 0:00:00
Collecting pycryptodome
  Downloading pycryptodome-3.16.0-cp35-abi3-win_amd64.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 21.8 MB/s eta 0:00:00
Collecting aiohttp
  Downloading aiohttp-3.8.3-cp310-cp310-win_amd64.whl (319

In [43]:
import gradio as gr

In [44]:
model.save('model.h5')

In [45]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx] > 0.5)
    return text

In [49]:
interface = gr.Interface(fn=score_comment,
                        inputs=gr.components.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

In [50]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://0a4fdeed-ca3e-48c1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




