This notebook was my kernel in the 'Jigsaw unintended bias in Toxicity Classification' Challenge in Kaggle. Here is the link to the competition. 
[Click Here](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification)

In [None]:
import numpy as np 
import pandas as pd 
from numpy import array, asarray, zeros
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding, Bidirectional
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

In [2]:
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

train = train[['comment_text','target']]

In [3]:
import nltk
from nltk.corpus import stopwords

In [4]:
import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]<>%:')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
def text_prepare(text):
    text = text.lower()  
    text = re.sub(re.compile('^[a-z][0-9]'), " ", text)
    text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
    text = re.sub(BAD_SYMBOLS_RE, "", text)  
    querywords = text.split()
    resultwords  = [word for word in querywords if word not in STOPWORDS]
    text = ' '.join(resultwords)
    text = re.sub('\s+', " ", text)
    return text

In [5]:
X_train = [text_prepare(x) for x in train['comment_text']]
X_test = [text_prepare(x) for x in test['comment_text']]
y_train = np.where(train['target'] >= 0.5, 1, 0)

In [6]:
t = Tokenizer()
t.fit_on_texts(X_train+X_test)
vocab_size = len(t.word_index) + 1

encoded_docs_train = t.texts_to_sequences(X_train)
encoded_docs_test = t.texts_to_sequences(X_test)

max_length = 160
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

embeddings_index = dict()

In [7]:
from sklearn.model_selection import train_test_split
padded_docs_train, padded_docs_val, en_train, en_val = train_test_split(padded_docs_train, y_train, test_size=0.1)

In [8]:
f = open('../input/glove6b300dtxt/glove.6B.300d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 300))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [9]:
# define model
model = Sequential()
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=160, trainable=False)
model.add(e)
model.add(Bidirectional(LSTM(100)))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())
model.fit(padded_docs_train, en_train, batch_size=1024, epochs=1, verbose=1)


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 160, 300)          329914500 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 330,255,501
Trainable params: 341,001
Non-trainable params: 329,914,500
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Epoch 1/1


<keras.callbacks.History at 0x7f363c4e3940>

In [10]:
loss, accuracy = model.evaluate(padded_docs_val, en_val, verbose=1)
print('Accuracy: %f' % (accuracy*100))
predictions = model.predict(padded_docs_test)

Accuracy: 94.675546


In [11]:
prediction = [i[0] for i in predictions] 

In [12]:
submission = pd.DataFrame.from_dict({
    'id': test['id'],
    'prediction': prediction
})

submission.to_csv('submission.csv', index=False)