In [None]:
# import some packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# Read the labelled training data from the Jigsaw toxic-comment challenge folder (path is relative to the
# notebook's working directory) and preview the first rows.
trainDf = pd.read_csv("jigsaw-toxic-comment-classification-challenge/train.csv")
trainDf.head()

In [None]:
# 'i'll
# I'm
# you'd
# The comments contain contractions like the ones above, which have to be expanded into their normal form.
# The function below was found online and already covers many of these expressions; more can be added as needed.
import re
# Ordered (pattern, replacement) rules applied by clean_text.
# Order matters: the specific rules ("what's", "'scuse", "can't") must run before the generic
# "'s" / "n't" rules, otherwise the generic rule consumes part of the match and the specific
# rule can never fire.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def clean_text(doc):
    """Normalise one raw comment for tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. expand common English contractions (see ``_CONTRACTION_RULES``);
      3. replace every non-word character with a space;
      4. collapse runs of whitespace and strip leading/trailing spaces.

    Parameters
    ----------
    doc : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text: lower-case word characters separated by single spaces.
    """
    doc = doc.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        doc = re.sub(pattern, replacement, doc)
    # Raw strings avoid the invalid-escape-sequence warning that '\W' / '\s+' trigger.
    doc = re.sub(r"\W", " ", doc)
    doc = re.sub(r"\s+", " ", doc)
    return doc.strip(" ")

In [None]:
# trainDf['comment_text'] = trainDf['comment_text'].map(lambda comments : clean_text(comments))

In [None]:
# Target frame: the six label columns of the training data, used together as a multi-label target.
y = trainDf[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [None]:
from tqdm import tqdm
from nltk.corpus import stopwords

# Normalise every raw comment (lower-case, expand contractions, strip punctuation).
X = [clean_text(i) for i in tqdm(list(trainDf["comment_text"]))]

# Remove English stop words *inside* each comment.
# The previous version compared whole comments against the stop-word set, which never removed a
# stop word from a sentence and instead dropped any comment that consisted of exactly one stop
# word -- leaving X shorter than y. Filtering token by token keeps one entry per training row.
# Requires the NLTK "stopwords" corpus to be available locally (nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))
X = [" ".join(w for w in doc.split() if w not in stop_words) for doc in X]



In [None]:
# Sanity check: number of pre-processed comments held in X.
len(X)

In [None]:
# Split the cleaned texts and their labels with scikit-learn: 80% for training, 20% held out for testing.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=16)

In [None]:
# Tokenize the texts into integer word indices and pad them to a fixed length.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The tokenizer is fitted on the training texts only, so the held-out texts do not influence the vocabulary.
# NOTE(review): per the Keras docs, num_words limits which words are kept when converting texts to sequences
# (the most frequent ones) -- confirm the exact cut-off for the installed version.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# From here on X_train / X_test hold lists of integer sequences instead of strings.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Size of the full fitted word index plus one (presumably because index 0 is reserved for padding -- confirm).
# This is the number of rows of the embedding matrix built later.
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length fed to the model.
maxlen = 200

# padding='post' appends the padding values at the end of sequences shorter than maxlen.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary, then build the embedding matrix whose
# row `index` holds the GloVe vector of the word that the tokenizer mapped to `index`.
embeddings_dictionary = dict()
# This is the smaller, 100-dimensional version of GloVe.
# NOTE(review): hard-coded absolute local path -- point this at your own copy of glove.6B.100d.txt
# (ideally via a configurable data directory) before running on another machine.
glove_path = '/media/mobpair_parth/746443bb-9dd8-48b6-8dda-e37f1cafd5501/fun_projects/text_toxic/glove.6B.100d.txt'

# `with` guarantees the file handle is closed even if parsing a line raises.
with open(glove_path, encoding="utf8") as glove_file:
    for line in tqdm(glove_file):
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Rows for words that have no GloVe vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tqdm(tokenizer.word_index.items()):
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:

import tensorflow as tf
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding

from tensorflow.keras.layers import Input

# Model building: frozen GloVe embedding -> single LSTM layer -> six independent sigmoid outputs
# (one per label column).
deep_inputs = Input(shape=(maxlen,))
# trainable=False keeps the pre-trained embedding weights fixed during training.
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(6, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

# Only show TensorFlow log messages at ERROR level (hides versioning/deprecation noise).
# `tf.logging` was removed from the top-level namespace in TensorFlow 2.x; the compat alias is
# tried first and the legacy name is kept as a fallback for old 1.x installs.
try:
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
except AttributeError:
    tf.logging.set_verbosity(tf.logging.ERROR)
# binary_crossentropy with sigmoid outputs treats each of the six labels as its own yes/no problem.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# model.summary() prints the layer table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

In [None]:
# Train the model. validation_split=0.2 holds back part of the training data for per-epoch validation;
# the returned History object is kept for the training-curve plots.
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)

In [None]:
# Evaluate on the held-out split produced by train_test_split.
score = model.evaluate(X_test, y_test, verbose=1)

# score[0] is reported as the test score and score[1] as the test accuracy.
# NOTE(review): presumably the compiled loss and the 'acc' metric, in that order -- confirm.
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
# Visualize the training curves (accuracy, then loss), one figure per metric.
import matplotlib.pyplot as plt

# The 'val_*' series come from the validation split made by model.fit, not from the held-out
# X_test set, so the second curve is labelled 'validation' rather than 'test'.
for metric_key, metric_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])

    plt.title('model ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Model store and pull: save the architecture (JSON) and the weights (HDF5) separately, then
# rebuild an identical model from those two files.
from tensorflow.keras.models import model_from_json

# serialize model architecture to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model; `with` closes the file even if reading fails
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model (it still has to be compiled before evaluate/fit can be called)
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 

In [None]:
# # evaluate loaded model on test data
# loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# score = loaded_model.evaluate(X_test, y_test, verbose=1)
# print("Test Score:", score[0])
# print("Test Accuracy:", score[1])

In [None]:
# Read the unlabelled test comments and push them through the clean -> tokenize -> pad steps,
# reusing the tokenizer fitted earlier so word indices match the ones the model was trained on.
testingDF = pd.read_csv("jigsaw-toxic-comment-classification-challenge/test.csv")
test_texts = [clean_text(comment) for comment in testingDF["comment_text"]]
test_sequences = tokenizer.texts_to_sequences(test_texts)
new_data = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

In [None]:
# Predict on the prepared test sequences using the model trained earlier.
resultTest = model.predict(new_data)
# Display the shape of the prediction array as a quick sanity check.
resultTest.shape

In [None]:
# Load the sample submission file and drop its six label columns, keeping the remaining column(s)
# so the model's predictions can be attached in their place.
submitDF = pd.read_csv("jigsaw-toxic-comment-classification-challenge/sample_submission.csv")
submitDF = submitDF.drop(columns=['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate'])
submitDF.head()

In [None]:
# Save the predictions as a CSV in the same layout as the sample submission.
# The prediction columns must carry the label names (in the same order as the training targets);
# a bare pd.DataFrame(resultTest) would name them 0..5.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# np.asarray makes this cell safe to re-run after resultTest has already become a DataFrame.
resultTest = pd.DataFrame(np.asarray(resultTest), columns=label_columns)
finalDF = pd.concat([submitDF, resultTest],  axis=1, sort=False)
# index=False: do not write the row index as an extra unnamed first column.
# NOTE(review): the file name says "15Epoch" while the model above is trained with epochs=5.
finalDF.to_csv("sample_submission_tensorflow_15Epoch.csv", index=False)
finalDF.head()