In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from keras.callbacks import EarlyStopping ,ReduceLROnPlateau

In [2]:
# Read the English toxic-comment training table from the relative `../input` directory.
train = pd.read_csv(
    '../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
)

In [3]:
# Drop the fine-grained label columns; only the remaining columns (including
# the binary `toxic` target used below) are kept.
# The frame is re-bound rather than mutated with `inplace=True`, and
# errors='ignore' makes the cell safe to re-run on an already-trimmed frame
# (the in-place version raised KeyError on a second execution).
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [4]:
# Keep only the first part of the table.
# `.loc` slices by index label and INCLUDES the end label, so every row whose
# label is <= 40000 is kept (40001 rows with a default 0-based index).
train = train.loc[:40000]
train.shape

(40001, 3)

In [5]:
def _word_count(comment):
    """Number of whitespace-separated tokens in `comment` (coerced to str)."""
    return len(str(comment).split())


# Longest comment, in whitespace-separated words.
train['comment_text'].map(_word_count).max()

1403

In [6]:
# Hold out 20% of the comments for validation. The split is shuffled with a
# fixed seed and stratified on the `toxic` label so both parts keep the same
# class balance.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=train['toxic'].values,
)

In [7]:
# Text-preprocessing dependencies and NLTK resources used by the helper
# functions defined below in this cell.
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK data packages by name; the recorded cell output shows them
# being checked/downloaded under /usr/share/nltk_data.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import string
# English stop-word collection; passed to remove_stopwords() by normalize_text().
stop_words = stopwords.words('english')

# Needed by remove_non_ascii() for NFKD normalisation.
import unicodedata

def remove_non_ascii(text):
    """Return the string `text` with every non-ASCII character removed.

    The input is NFKD-normalised first, so a character with a compatibility
    decomposition (e.g. an accented letter) is split into its base character
    plus combining marks. Encoding to ASCII with errors ignored then drops the
    marks and any other non-ASCII code point.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def to_lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Characters are removed, not replaced by a space, so "it's" becomes "its".
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def replace_numbers(text):
    """Return `text` with every run of digits removed (replaced by nothing)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_whitespaces(text):
    """Trim leading and trailing whitespace from `text`.

    Whitespace inside the string is left untouched.
    """
    return text.lstrip().rstrip()

def remove_stopwords(words, stop_words):
    """Return the tokens of `words` that are not stop words, in original order.

    Args:
        words: iterable of string tokens.
        stop_words: collection (list, set, tuple, iterator, ...) of tokens to
            drop. Its elements must be hashable.

    Returns:
        A new list with the surviving tokens; duplicates are preserved.
    """
    # Build a hash-based lookup once instead of scanning a list for every
    # token; this also makes a one-shot iterator of stop words behave correctly.
    if isinstance(stop_words, (set, frozenset)):
        lookup = stop_words
    else:
        lookup = frozenset(stop_words)
    return [word for word in words if word not in lookup]

def stem_words(words):
    """Return a list with the Porter stem of each token in `words`, in order."""
    stem = PorterStemmer().stem
    return list(map(stem, words))

def lemmatize_words(words):
    """Return a list with each token of `words` lemmatized by WordNetLemmatizer.

    `lemmatize` is called without a `pos` argument, i.e. with the lemmatizer's
    default part of speech.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

def lemmatize_verbs(words):
    """Lemmatize each token of `words` as a verb and join the results.

    Note the return type: unlike `lemmatize_words`, this returns ONE string
    with the lemmas separated by single spaces, not a list.
    """
    wordnet = WordNetLemmatizer()
    verb_lemmas = []
    for token in words:
        verb_lemmas.append(wordnet.lemmatize(token, pos='v'))
    return ' '.join(verb_lemmas)

def text2words(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
    tokens = word_tokenize(text)
    return tokens

def normalize_text(text):
    """Clean one raw comment and return it as a single normalised string.

    Pipeline, in order: strip non-ASCII characters, delete punctuation,
    lower-case, delete digit runs, tokenize, drop the module-level
    `stop_words`, lemmatize (default POS), then lemmatize as verbs.

    Args:
        text: the raw comment as a `str`.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text = remove_non_ascii(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = replace_numbers(text)
    # NOTE(review): punctuation is removed before stop-word filtering, so any
    # stop word that contains an apostrophe can no longer match a token here.
    # Confirm this ordering is intended.
    tokens = text2words(text)
    tokens = remove_stopwords(tokens, stop_words)
    tokens = lemmatize_words(tokens)
    # lemmatize_verbs() already returns the lemmas joined with spaces. The
    # former trailing ''.join(...) over that string was a character-wise no-op
    # and only obscured the return type.
    return lemmatize_verbs(tokens)
def normalize_corpus(corpus):
    """Apply `normalize_text` to every document in `corpus`; return a list."""
    return list(map(normalize_text, corpus))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Clean both splits with the same pipeline (train first, then validation).
# NOTE(review): the raw arrays are overwritten, so re-running this cell
# normalises already-normalised text.
xtrain, xvalid = [normalize_corpus(split) for split in (xtrain, xvalid)]

In [9]:
# Both callbacks watch the validation loss:
#  - EarlyStopping uses patience=4.
#  - ReduceLROnPlateau uses patience=2 and multiplies the learning rate by 0.3,
#    never going below 1e-6.
early = EarlyStopping(monitor='val_loss', patience=4, mode='min')
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)
callbacks_list = [early, learning_rate_reduction]

In [10]:
# Every sequence is padded/truncated to this many token ids.
max_len = 1500

# No vocabulary cap (num_words=None). The word index is built from the
# training AND validation texts together.
token = text.Tokenizer(num_words=None)
token.fit_on_texts([*xtrain, *xvalid])

# Texts -> lists of integer word ids.
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# Pad (or truncate) each id list to exactly `max_len` entries.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

# word -> integer id mapping, reused to size the embedding layers.
word_index = token.word_index

In [22]:
import pickle

# Serialise the fitted tokenizer to disk with the highest pickle protocol.
with open('tokenizer_version2.pickle', 'wb') as handle:
    pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(token)

In [13]:
# Baseline: 300-d embedding with no `weights` argument -> SimpleRNN(100)
# -> one sigmoid unit for the binary toxic / non-toxic decision.
model = Sequential([
    Embedding(len(word_index) + 1, 300, input_length=max_len),
    SimpleRNN(100),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2022-06-02 20:49:52.259496: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-02 20:49:52.423358: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-02 20:49:52.424137: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-02 20:49:52.426567: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [14]:
# Print the layer-by-layer architecture and parameter counts of the SimpleRNN model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1500, 300)         23724300  
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100)               40100     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 23,764,501
Trainable params: 23,764,501
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the SimpleRNN baseline. 20% of the training rows are held out by
# Keras (validation_split) to produce the val_loss the callbacks monitor.
model.fit(
    xtrain_pad,
    ytrain,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks_list,
)

2022-06-02 20:49:55.491226: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 153600000 exceeds 10% of free system memory.
2022-06-02 20:49:55.666438: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10

2022-06-02 20:57:55.486367: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 38400000 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 5/10
Epoch 6/10

Epoch 00006: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.


<keras.callbacks.History at 0x7f2be693f0d0>

In [16]:
# The model was compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]. The value printed here is accuracy (in percent), not
# ROC-AUC, which is why the label no longer says "Auc".
loss, accuracy = model.evaluate(xvalid_pad, yvalid)
print("Accuracy: ", accuracy*100)

2022-06-02 21:38:41.049041: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 48006000 exceeds 10% of free system memory.


Auc:  94.20072436332703


In [17]:
# Running list of per-model results, seeded with the SimpleRNN entry.
# NOTE(review): the value stored under 'AUC_Score' is the `accuracy` returned
# by evaluate(), not a ROC-AUC. The key is kept as-is because later code may
# read it.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': accuracy}]

In [11]:
# Load the pre-trained GloVe vectors into a {word: vector} dict.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse
# (the previous open()/close() pair leaked the handle on any exception).
with open('../input/glove840b300dtxt/glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        # NOTE(review): keep the explicit ' ' separator; presumably some tokens
        # contain other whitespace characters that a bare split() would break
        # on. Confirm before changing.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

2196018it [04:28, 8185.05it/s]

Found 2196017 word vectors.





In [12]:
# One 300-d row per tokenizer id (plus one extra row, hence the + 1).
# Row `i` receives the GloVe vector of the word whose id is `i`; rows for
# words missing from `embeddings_index` stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

100%|██████████| 79080/79080 [00:00<00:00, 234092.76it/s]


In [18]:
# LSTM classifier on top of the frozen (trainable=False) embedding matrix:
# embedding -> LSTM(100) with input and recurrent dropout -> sigmoid unit.
model_LSTM = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])
model_LSTM.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_LSTM.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1500, 300)         23724300  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 23,884,801
Trainable params: 160,501
Non-trainable params: 23,724,300
_________________________________________________________________


2022-06-02 21:49:10.599514: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 94897200 exceeds 10% of free system memory.


In [19]:
# Train the LSTM model with the same schedule and callbacks as the baseline.
model_LSTM.fit(
    xtrain_pad,
    ytrain,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks_list,
)

2022-06-02 21:50:02.292971: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 153600000 exceeds 10% of free system memory.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 9/10
Epoch 10/10

Epoch 00010: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.


<keras.callbacks.History at 0x7f2bd8660250>

In [20]:
# model_LSTM was compiled with metrics=['accuracy'], so the second value
# returned by evaluate() is accuracy. Print it as a percentage under the
# correct label (it is not a ROC-AUC).
loss, accuracy = model_LSTM.evaluate(xvalid_pad, yvalid)
print("Accuracy: ", accuracy*100)

Auc:  96.0254967212677


In [21]:
# Record the LSTM result.
# NOTE(review): 'AUC_Score' holds the accuracy returned by evaluate().
scores_model.append(dict(Model='LSTM', AUC_Score=accuracy))

In [24]:
# GRU classifier: frozen (trainable=False) embedding matrix
# -> SpatialDropout1D(0.3) -> GRU(300) -> sigmoid unit.
model_GRU = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    GRU(300),
    Dense(1, activation='sigmoid'),
])
model_GRU.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_GRU.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1500, 300)         23724300  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1500, 300)         0         
_________________________________________________________________
gru (GRU)                    (None, 300)               540900    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 301       
Total params: 24,265,501
Trainable params: 541,201
Non-trainable params: 23,724,300
_________________________________________________________________


In [25]:
# Train the GRU model with the same schedule and callbacks as the other models.
model_GRU.fit(
    xtrain_pad,
    ytrain,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks_list,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 7/10
Epoch 8/10

Epoch 00008: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.


<keras.callbacks.History at 0x7f2bd05ff190>

In [26]:
# model_GRU was compiled with metrics=['accuracy'], so the second value
# returned by evaluate() is accuracy. Print it as a percentage under the
# correct label (it is not a ROC-AUC).
loss, accuracy = model_GRU.evaluate(xvalid_pad, yvalid)
print("Accuracy: ", accuracy*100)

Auc:  95.9879994392395


In [27]:
# Record the GRU result.
# NOTE(review): 'AUC_Score' holds the accuracy returned by evaluate().
scores_model.append(dict(Model='GRU', AUC_Score=accuracy))

In [28]:
# Persist the trained models as HDF5 files. Only the GRU model is written;
# the SimpleRNN and LSTM saves below are currently disabled.
#model.save('./englishToxicModelRNN_version2.h5')
#model_LSTM.save('./englishToxicModelLSTM_version2.h5')
model_GRU.save('./englishToxicModelGRUVersion2.h5')

In [None]:
from keras.callbacks import ModelCheckpoint ,ReduceLROnPlateau ,EarlyStopping
# Fresh callback instances for the bidirectional model, with the same
# settings as before: early stopping uses patience=4 on val_loss, and
# ReduceLROnPlateau uses patience=2 with factor 0.3 (floor 1e-6).
early = EarlyStopping(monitor='val_loss', patience=4, mode='min')
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)
callbacks_list = [early, learning_rate_reduction]

In [None]:
# Bidirectional LSTM classifier: frozen (trainable=False) embedding matrix
# -> Bidirectional(LSTM(300)) with input and recurrent dropout -> sigmoid unit.
model_BI = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
    Dense(1, activation='sigmoid'),
])
model_BI.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_BI.summary()

In [None]:
# Train the bidirectional model. Note the smaller batch size (32) compared
# with the other models (64).
model_BI.fit(
    xtrain_pad,
    ytrain,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
# Persist the trained bidirectional LSTM model as an HDF5 file.
model_BI.save('./englishToxicModelBI_version2.h5')

In [None]:
# model_BI was compiled with metrics=['accuracy'], so the second value
# returned by evaluate() is accuracy, not ROC-AUC. It is printed as a
# percentage to match the other evaluation cells, which multiply by 100.
loss, accuracy = model_BI.evaluate(xvalid_pad, yvalid)
print("Accuracy: ", accuracy*100)