In [3]:
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn import preprocessing

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, MaxPool1D, Dropout, Dense, GlobalMaxPooling1D, Embedding, Activation
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [4]:
train_data = pd.read_csv('/kaggle/input/hate-speech-detection/toxic_train.csv')
test_data = pd.read_csv('/kaggle/input/hate-speech-detection/toxic_test.csv')

In [5]:
# drop unnamed column

train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()

Unnamed: 0,comment_text,toxic
0,Explanation\r\nWhy the edits made under my use...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\r\nMore\r\nI can't make any real suggestions...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [6]:
test_data = test_data.drop(columns=['Unnamed: 0'])
test_data.head()

Unnamed: 0,comment_text,toxic
0,Thank you for understanding. I think very high...,0
1,:Dear god this site is horrible.,0
2,"""::: Somebody will invariably try to add Relig...",0
3,""" \r\n\r\n It says it right there that it IS a...",0
4,""" \r\n\r\n == Before adding a new product to t...",0


In [13]:
def preprocess_text(sen):
    # lower the character
    sentence = sen.lower()
    
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    stops = stopwords.words('english')
    
    for word in sentence.split():
        if word in stops:
            sentence = sentence.replace(word, '')
    return sentence

In [14]:
# preprocess data

train_data['comment_text'] = train_data['comment_text'].apply(lambda x : preprocess_text(x))
test_data['comment_text'] = test_data['comment_text'].apply(lambda x : preprocess_text(x))

In [15]:
train_data['comment_text']

0         Explani Why edits ade usernae Hardcore Metalli...
1         D aww He atches background colour seeingly stu...
2         Hey  really trying edit war It guy consttly re...
3          Me make al suggestis improvent wded secti sta...
4                    You sir hero Any chance remember page 
                                ...                        
159566     And second time askg view completely contradi...
159567       You ashamed That horrible thing put talk page 
159568    Spitzer Umm theres actual article prostitution...
159569    And looks like actually put speedy first versi...
159570     And really think underst came idea bad right ...
Name: comment_text, Length: 159571, dtype: object

In [16]:
test_data['comment_text']

0        Thank understing think highly would revert wit...
1                                  Dear god site horrible 
2         Somebody variably try add Religi Really You m...
3         It says right IS type The Type stution needed...
4         Before adding new product list make sure rele...
                               ...                        
63973     Jeroe see never got around surpred looked exa...
63974     Lucky bastard http wikimediafoundation org wi...
63975                 shame You want speak gays romanians 
63976    MEL GIBSON IS NAZI BITCH WHO MAKES SHITTY MOVI...
63977     Unicorn lair dcovery Supposedly unicorn lair ...
Name: comment_text, Length: 63978, dtype: object

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tweet = train_data['comment_text']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [18]:
y = train_data['toxic'].values

In [None]:
# print(tokenizer.word_index)

In [19]:
print(tweet[0])
print(encoded_docs[0])

Explani Why edits ade usernae Hardcore Metallica Fan reverted They vandaliss closure GAs voted New York Dolls FAC And please reove teple talk page since retired 
[118, 147, 709, 1242, 309, 245, 2429, 3176, 35, 1997, 1314, 37, 15, 2891, 1783, 7, 3, 370, 4776]


In [20]:
padded_sequence[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [22]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,     
                                     input_length=2000) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', 
                           metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2000, 32)          7624096   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 2000, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 7,640,747
Trainable params: 7,640,747
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
history = model.fit(padded_sequence,y,
                  validation_split=0.2, epochs=3, batch_size=1024)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
from keras.models import load_model
model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
# del model  # deletes the existing model
# returns a compiled model
# identical to the previous one

In [25]:
new_model = load_model('my_model.h5')

In [38]:
test_word ="shame on you"
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)
prediction = int(new_model.predict(tw).round().item())


In [39]:
prediction

1