COMMENT TOXICITY


INSTALLING DEPENDENCIES


In [6]:
!pip install tensorflow tensorflow pandas matplotlib sklearn



In [26]:
import os
import pandas as pd
import matplotlib as plt
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [8]:
df=pd.read_csv("jigsaw-toxic-comment-classification-challenge/train.csv")

DATA PREPROCESSING

In [9]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [11]:
df.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [12]:
df["comment_text"][5]

'"\n\nCongratulations from me as well, use the tools well. \xa0· talk "'

In [13]:
from tensorflow.keras.layers import TextVectorization

In [14]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [15]:
maxval=300000

In [16]:
vectorizer = TextVectorization(max_tokens=maxval,
                               output_sequence_length=2000,
                               output_mode='int')

In [17]:
vectorizer.adapt(X.values)

In [18]:
vectorized_text = vectorizer(X.values)

In [19]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [20]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

SEQUENTIAL MODEL

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [22]:
model = Sequential()

# Embedding layer

model.add(Embedding(maxval+1, 32))

# Bidirectional LSTM Layer

model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers

model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Final layer

model.add(Dense(6, activation='sigmoid'))

In [23]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          9600032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [25]:
history = model.fit(train, epochs=1, validation_data=val)



In [79]:
good_cmnt = vectorizer('Hello,I am a big fan of yours. Love your content')
bad_cmnt= vectorizer('i hate you , and i am coming to kill you')

In [50]:
pred1 = model.predict(np.array([good_cmnt]))



In [80]:
pred2 = model.predict(np.array([bad_cmnt]))



In [59]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [81]:
print("Prediction for positive comment : ",pred1 , "\n")
print("Prediction for negative comment : ",pred2)

Prediction for positive comment :  [[0.11268672 0.00043986 0.01487998 0.00213797 0.01320602 0.00536107]] 

Prediction for negative comment :  [[0.8210689  0.09406394 0.5221734  0.06113708 0.46124056 0.14376067]]


MODEL EVALUATIOIN

In [69]:
result = model.evaluate(test)
test_loss = result  
print(f"Test loss: {test_loss}")


Test loss: 0.04633940011262894


In [71]:
from sklearn.metrics import accuracy_score

# Concatenate true labels from the test dataset
y_true = np.concatenate([y for _, y in test], axis=0)  
y_pred_prob = model.predict(test)  

# using 0.5 as threshold to get a binary prediction
y_pred = (y_pred_prob > 0.5).astype(int)

test_accuracy = accuracy_score(y_true, y_pred)

print(f"Accuracy: {test_accuracy}")


Accuracy: 0.8304914744232698


In [72]:
model.save("commenttoxicity.h5")

  saving_api.save_model(
