# Introduction
This is a work where I have to classify toxic comment using different levels (multiclassification). This is an opportunity to use tensorflow skills.

# Data understanding

In [6]:
import pandas as pd
import os
import tensorflow as tf
import numpy as np

In [7]:
seed = 1
pathData = '../input/jigsaw-toxic-comment-classification-challenge'
batchSize = 128

In [8]:
pd.set_option('max_colwidth', 200)

## Training data

In [9]:
dsTrain = pd.read_csv(os.path.join(pathData, 'train.csv.zip'))
print('Shape:', dsTrain.shape)
dsTrain.head(20)

Shape: (159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remo...",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tid...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the tools well. · talk """,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,"Your vandalism to the Matt Shirvington article has been reverted. Please don't do it again, or you will be banned.",0,0,0,0,0,0
8,00037261f536c51d,"Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encycl...",0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contrary to those of DuLithgow,0,0,0,0,0,0


In [10]:
dsTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


Identifying empty comments

In [11]:
blanks = []
for index, id, text in dsTrain[['id', 'comment_text']].itertuples():
  newText = str(text)
  if newText.isspace():
    blanks.append(index)
print(f'Number of observations without text: {len(blanks)}')

Number of observations without text: 0


Identifying comments with more than one classification

In [12]:
dsTrain['countToxic'] = dsTrain['toxic'] + dsTrain['severe_toxic'] + dsTrain['obscene'] + dsTrain['threat'] + dsTrain['insult'] + dsTrain['identity_hate']
dsTrainCount = dsTrain[['id', 'countToxic']].groupby('id').count().reset_index()
dsTrainCount[dsTrainCount['countToxic'] > 1]

Unnamed: 0,id,countToxic


Observations:
* There are no null values.
* There are no empty values.
* There are no observatio with more than one classification.

### Cleaning
In this step I will remove numbers and special characters, because these words do not help to understand toxic comments.

In [13]:
import re
import spacy

In [14]:
nlp = spacy.load('en_core_web_sm')

In [15]:
def customCleaning(text):
  '''Function to get only valid words'''

  # Remove http texts
  text = re.sub(r'http\S+', ' ', text)

  # Remove numbers and special characters
  text = re.sub(r'[^A-Za-z\']+', ' ', text)
  
  text = text.lower()

  return text

In [16]:
dsTrain['comment_text'] = dsTrain['comment_text'].map(customCleaning)

In [17]:
dsTrain.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,countToxic
0,0000997932d777bf,explanation why the edits made under my username hardcore metallica fan were reverted they weren't vandalisms just closure on some gas after i voted at new york dolls fac and please don't remove t...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww he matches this background colour i'm seemingly stuck with thanks talk january utc,0,0,0,0,0,0,0
2,000113f07ec002fd,hey man i'm really not trying to edit war it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about th...,0,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can't make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that...,0,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember what page that's on,0,0,0,0,0,0,0
5,00025465d4725e87,congratulations from me as well use the tools well talk,0,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0,4
7,00031b1e95af7921,your vandalism to the matt shirvington article has been reverted please don't do it again or you will be banned,0,0,0,0,0,0,0
8,00037261f536c51d,sorry if the word 'nonsense' was offensive to you anyway i'm not intending to write anything in the article wow they would jump on me for vandalism i'm merely requesting that it be more encycloped...,0,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contrary to those of dulithgow,0,0,0,0,0,0,0


### Spliting data

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X = dsTrain['comment_text'].values
y = dsTrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = seed)

print('X_train:', X_train.shape)
print('X_val:', X_val.shape)
print('y_train:', y_train.shape)
print('y_val:', y_val.shape)

X_train: (127656,)
X_val: (31915,)
y_train: (127656, 6)
y_val: (31915, 6)


### Tokenizer
In this step, I will turn words into numbers, where each word will has its id.

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
vocabSize = 10000
embeddingDim = 128
maxLength = 120
truncType='post'
oovTok = "<OOV>"

In [23]:
tokenizer = Tokenizer(num_words = vocabSize, oov_token=oovTok)
tokenizer.fit_on_texts(X_train)

#### Training

In [24]:
trainSequences = tokenizer.texts_to_sequences(X_train)
trainPadded = pad_sequences(trainSequences, maxlen=maxLength, truncating=truncType)

In [25]:
trainPadded.shape

(127656, 120)

#### Validation

In [26]:
valSequences = tokenizer.texts_to_sequences(X_val)
valPadded = pad_sequences(valSequences, maxlen=maxLength, truncating=truncType)

In [27]:
valPadded.shape

(31915, 120)

## Testing data

In [28]:
dsTest = pd.read_csv(os.path.join(pathData, 'test.csv.zip'))
print('Shape:', dsTest.shape)
dsTest.head()

Shape: (153164, 2)


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja...
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [29]:
dsTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [30]:
blanks = []
for index, id, text in dsTest[['id', 'comment_text']].itertuples():
  newText = str(text)
  if newText.isspace():
    blanks.append(index)
print(f'Number of observations without text: {len(blanks)}')

Number of observations without text: 1


### Cleaning

In [31]:
dsTest['comment_text'] = dsTest['comment_text'].map(customCleaning)

In [32]:
dsTest.head(20)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja ru...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland
3,00017563c3f7919a,if you have a look back at the source the information i updated was the correct form i can only guess the source hadn't updated i shall update the information once again but thank you for your me...
4,00017695ad8997eb,i don't anonymously edit articles at all
5,0001ea8717f6de06,thank you for understanding i think very highly of you and would not revert without discussion
6,00024115d4cbde0f,please do not add nonsense to wikipedia such edits are considered vandalism and quickly undone if you would like to experiment please use the sandbox instead thank you
7,000247e83dcc1211,dear god this site is horrible
8,00025358d4737918,only a fool can believe in such numbers the correct number lies between to ponder the numbers carefully this error will persist for a long time as it continues to reproduce the latest reproductio...
9,00026d1092fe71cc,double redirects when fixing double redirects don't just blank the outer one you need edit it to point it to the final target unless you think it's inappropriate in which case it needs to be nomi...


### Transforming

In [33]:
testSequences = tokenizer.texts_to_sequences(dsTest['comment_text'].values)
testPadded = pad_sequences(testSequences, maxlen=maxLength, truncating=truncType)

In [34]:
testPadded.shape

(153164, 120)

# Modeling
In this case, I will use a basic model based on tensorflow tutorial.

In [35]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocabSize, embeddingDim),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(6, activation='sigmoid')
])

In [36]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [37]:
history = model.fit(trainPadded, y_train, epochs=10, batch_size=32, validation_data=(valPadded, y_val), callbacks=[tf.keras.callbacks.EarlyStopping(monitor = 'val_loss')])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


## Predict

Showing submission file example.

In [43]:
dsSampleSubmission = pd.read_csv(os.path.join(pathData, 'sample_submission.csv.zip'))
dsSampleSubmission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


Predicting

In [44]:
predicted = model.predict(testPadded)
print('Shape:', predicted.shape)

Shape: (153164, 6)


In [45]:
predicted = np.round(predicted, 1)

In [46]:
dsPredicted = pd.DataFrame(predicted, columns=['toxic', 'severe_toxic',	'obscene', 'threat', 'insult', 'identity_hate'])
dsSubmission = pd.concat([dsTest['id'], dsPredicted], axis=1)
dsSubmission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,0.3,1.0,0.1,0.9,0.2
1,0000247867823ef7,0.0,0.0,0.0,0.0,0.0,0.0
2,00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
3,00017563c3f7919a,0.0,0.0,0.0,0.0,0.0,0.0
4,00017695ad8997eb,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
dsSubmission.to_csv('submission.csv', index=False)

# References
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview

https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35

https://www.jeansnyman.com/posts/multi-class-text-classification-with-tensorflow/