In [1]:
## Reading the dataset
import pandas as pd
data = pd.read_csv('comments.csv',usecols=['Text','IsToxic'],encoding='utf-8')

In [2]:
data.head()

Unnamed: 0,Text,IsToxic
0,If only people would just take a step back and...,False
1,Law enforcement is not trained to shoot to app...,True
2,\nDont you reckon them 'black lives matter' ba...,True
3,There are a very large number of people who do...,False
4,"The Arab dude is absolutely right, he should h...",False


In [3]:
data.shape

(1000, 2)

In [4]:
## Check for nulls and duplicated
print(data.isnull().sum())
print(data.duplicated().sum())

Text       0
IsToxic    0
dtype: int64
3


In [5]:
print(data[data.duplicated()])

              Text  IsToxic
657  run them over     True
677  run them over     True
699  RUN THEM OVER     True


In [6]:
## Drop duplicated
data = data.drop_duplicates()

In [7]:
## Check value counts for IsToxic
print(data['IsToxic'].value_counts())

IsToxic
False    538
True     459
Name: count, dtype: int64


In [8]:
## Dependent and Independent Features
x = data['Text']
y = data['IsToxic']

In [9]:
## Lowercasing the text
x = list(map(str.lower, x))

In [10]:
## Text Preproccessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 4881


In [11]:
x = tokenizer.texts_to_sequences(x)
max_len = max([len(sent) for sent in x])

In [12]:
max_len

815

In [13]:
## Apply padding
x = pad_sequences(x, maxlen=max_len)

In [14]:
y  = y.astype(int) # Convert to integer type

In [15]:
## Train Test Split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [16]:
## Implementing LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(LSTM(32,return_sequences=False, kernel_regularizer=regularizers.l2(0.001))))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None,max_len))

In [17]:
model.summary()

In [18]:
## Early Stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [19]:
history = model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test), callbacks=[early_stopping])

Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 280ms/step - accuracy: 0.5354 - loss: 0.8950 - val_accuracy: 0.4700 - val_loss: 0.8499
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 230ms/step - accuracy: 0.5700 - loss: 0.8099 - val_accuracy: 0.4700 - val_loss: 0.7952
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 217ms/step - accuracy: 0.5969 - loss: 0.7299 - val_accuracy: 0.6150 - val_loss: 0.7037
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 218ms/step - accuracy: 0.8265 - loss: 0.5578 - val_accuracy: 0.5900 - val_loss: 0.7195
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 240ms/step - accuracy: 0.9135 - loss: 0.3862 - val_accuracy: 0.7000 - val_loss: 0.6548


In [20]:
sent = "This post is pathetic, just like you."
seq = tokenizer.texts_to_sequences([sent])
seq = pad_sequences(seq, maxlen=max_len)
pred = model.predict(seq)
if pred[0][0] > 0.4:
    print("Toxic Comment")
else:
    print("Non-Toxic Comment")

print(pred[0][0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
Non-Toxic Comment
0.24053265


In [21]:
import pickle

with open("tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)


In [22]:
model.save('model.keras')