In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from tensorflow.keras.layers import TextVectorization

In [None]:
# Load the training CSV from the unzipped Jigsaw toxic-comment challenge folder.
train_csv_path = os.path.join(
    'jigsaw-toxic-comment-classification-challenge',
    'train.csv',
    'train.csv',
)
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

In [None]:
# Display the text of the first comment whose `toxic` label is 0.
non_toxic_rows = df.loc[df['toxic'] == 0]
non_toxic_rows['comment_text'].iloc[0]

In [None]:
# Show the label columns (every column from position 2 onward) for row 6.
df.iloc[:, 2:].iloc[6]

In [None]:
# Features: the raw comment strings.
X = df['comment_text']
# Targets: every column from position 2 onward, as a NumPy array.
y = df.iloc[:, 2:].to_numpy()

In [None]:
# Vocabulary cap: passed as `max_tokens` to the TextVectorization layer and
# used as the input dimension of the Embedding layer.
total_features = 100_000

In [None]:
# Text-to-integer layer: emits integer token ids with a fixed output length of
# 1000 per comment, keeping at most `total_features` vocabulary entries.
vectorization = TextVectorization(
    output_mode='int',
    output_sequence_length=1000,
    max_tokens=total_features,
)

In [None]:
# Build the layer's vocabulary from all of the comments in `X`.
vectorization.adapt(X.values)

In [None]:
# Inspect the learned vocabulary. The layer is bound to `vectorization`
# (there is no `vector` variable). Only the first entries are shown, because
# the full list can hold up to `total_features` tokens and would flood the output.
vectorization.get_vocabulary()[:20]

In [None]:
# Convert every comment into its integer token-id sequence.
txt_vector = vectorization(X.values)

In [None]:
# Build the input pipeline of (token ids, labels) pairs.
dataset = tf.data.Dataset.from_tensor_slices((txt_vector, y))
dataset = dataset.cache()
# The train/val/test subsets are carved out of this dataset with take()/skip().
# With the default reshuffle_each_iteration=True every new pass over the
# dataset is reordered, so those subsets would overlap and validation/test
# samples would leak into training. Freezing the shuffled order keeps the
# splits disjoint.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [None]:
# Split sizes are derived from len(dataset): 70% train, 20% validation,
# and whatever remains goes to the test split.
num_batches = len(dataset)
train_size = int(num_batches * 0.7)
val_size = int(num_batches * 0.2)
test_size = num_batches - train_size - val_size
for split_size in (train_size, val_size, test_size):
    print(split_size)
# Sanity check that the three splits account for the whole dataset.
if train_size + val_size + test_size == num_batches:
    print("Correct")

In [None]:
# Carve the dataset into consecutive train / validation / test ranges.
after_train = dataset.skip(train_size)
train = dataset.take(train_size)
val = after_train.take(val_size)
test = after_train.skip(val_size).take(test_size)

In [None]:
# Iterator over the training split that yields its elements as NumPy arrays.
train_generator = train.as_numpy_iterator()

In [None]:
# Pull one element from the training iterator to inspect it.
train_next = next(train_generator)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding

In [None]:
# Classifier: embedding -> bidirectional LSTM -> three dense ReLU layers ->
# six sigmoid outputs.
model = Sequential([
    Embedding(total_features, 32),
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Train for a single epoch on `train`, validating on `val`; keep the History object.
history = model.fit(
    train,
    validation_data=val,
    epochs=1,
)

In [None]:
# Vectorize a sample comment with the adapted layer.
input_text = vectorization('I hate you.')

In [None]:
# Add a leading axis (batch of one) before predicting on the vectorized comment.
res = model.predict(np.expand_dims(input_text,0))

In [None]:
# First (and only) row of the prediction: the scores for the sample comment.
res[0]

In [None]:
# Grab one (inputs, labels) element from the test split.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Threshold the model outputs at 0.5 to get 0/1 decisions for the test batch.
batch_probabilities = model.predict(batch_X)
(batch_probabilities > 0.5).astype(int)

In [None]:
# Re-display the prediction array currently stored in `res`.
res

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics, updated batch by batch during evaluation.
precision = Precision()
recall = Recall()
# The model emits independent sigmoid scores, so accuracy has to be measured
# per label with a 0.5 threshold. CategoricalAccuracy compares argmax
# positions, which is meaningless for multi-label (and flattened) targets.
accuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate the metrics over the test split, one element at a time.
for X_true, y_true in test.as_numpy_iterator():
    yhat = model.predict(X_true)

    # Collapse labels and predictions to 1-D before updating the metrics.
    y_true = y_true.reshape(-1)
    yhat = yhat.reshape(-1)

    for metric in (precision, recall, accuracy):
        metric.update_state(y_true, yhat)

In [None]:
# Report the accumulated metric values.
precision_value = precision.result().numpy()
recall_value = recall.result().numpy()
accuracy_value = accuracy.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

In [None]:
# !pip install gradio

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
# Persist the trained model to `toxicity.h5` in the working directory.
model.save('toxicity.h5')

In [None]:
# Reload the saved model from disk; this rebinds `model` to the loaded copy.
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
# Vectorize another sample comment for a quick check of the reloaded model.
input_str = vectorization('hey i freaken hate you!')

In [None]:
# Predict on the single vectorized comment (leading axis added to form a batch of one).
res = model.predict(np.expand_dims(input_str,0))

In [None]:
# Display the prediction array for the sample comment.
res

In [None]:
def score_comment(comment):
    """Score one comment and return a per-label verdict as text.

    The comment is vectorized with the notebook's `vectorization` layer and
    passed to `model`. Each label column of `df` (columns from position 2
    onward) produces one output line of the form ``"<label>: <bool>"``,
    where the boolean says whether that label's score exceeds 0.5.

    Args:
        comment: The raw comment string to score.

    Returns:
        A string with one newline-terminated line per label.
    """
    token_ids = vectorization([comment])
    scores = model.predict(token_ids)[0]

    verdict_lines = [
        '{}: {}\n'.format(label, scores[position] > 0.5)
        for position, label in enumerate(df.columns[2:])
    ]
    return ''.join(verdict_lines)

In [None]:
# Minimal web UI: a two-line text box in, the text returned by score_comment out.
# `gr.inputs.Textbox` belongs to the legacy `gr.inputs` namespace (deprecated
# in Gradio 3, removed in Gradio 4); the top-level `gr.Textbox` component
# takes the same `lines` and `placeholder` arguments.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [None]:
# Start the Gradio app. NOTE(review): per the Gradio docs, share=True requests
# a public share link, so anyone with the URL can reach the model — confirm
# this is intended before running.
interface.launch(share=True)