In [1]:
%matplotlib inline
from keras.preprocessing.text import text_to_word_sequence
from utils import *
from __future__ import division, print_function
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

Using TensorFlow backend.


## Load the data

In [2]:
path = "data/"
batch_size=64
max_features = 20000
maxlen = 100

In [3]:
def read_embedding_list():
    embedding_word_dict = {}
    embedding_list = []
    with open("wiki-news-300d-1M.vec", encoding="utf8") as f:
        for row in tqdm.tqdm(f.read().split("\n")[1:-1]):
            data = row.split(" ")
            word = data[0]
            embedding = np.array([float(num) for num in data[1:-1]])
            embedding_list.append(embedding)
            embedding_word_dict[word] = len(embedding_word_dict)

    embedding_list = np.array(embedding_list)
    return embedding_list, embedding_word_dict

In [8]:
_trainData = pd.read_csv(path+'train.csv')

In [9]:
list_sentences_train = _trainData["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


In [10]:
_testData = pd.read_csv(path+'test.csv')

In [11]:
list_sentences_test = _testData["comment_text"].fillna("_na_").values

## Process sentences

In [12]:
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

In [13]:
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [14]:
comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))

In [15]:
print(comments[0])

explanationwhi edit made usernam hardcor metallica fan revert werent vandal closur gas vote new york doll fac pleas dont remov templat talk page sinc im retir nown


In [16]:
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

## Vectorize words

In [17]:
tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
# tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(comments + test_comments))
comments_sequence = tokenizer.texts_to_sequences(comments)

In [18]:
_labels_train = _trainData[list_classes].values
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)    
_X_train = sequence.pad_sequences(comments_sequence , maxlen=maxlen)
X_train = _X_train[0:159071,:]
labels_train =_labels_train[0:159071,:]
val_train = _X_train[159071:159572,:]
val_labels_train =_labels_train[159071:159572,:]
Test_train = sequence.pad_sequences(test_comments_sequence, maxlen=maxlen)

In [79]:
print(_X_train.shape)

(159571, 100)


In [4]:
embedding_list, embedding_word_dict = read_embedding_list()


100%|██████████████████████████████████████████████████████████████████████| 1000000/1000000 [02:14<00:00, 7455.76it/s]


In [7]:
emb_mean,emb_std = embedding_list.mean(), embedding_list.std()
emb_mean,emb_std


(-0.0033470585653333162, 0.1098556062554554)

In [25]:
word_index = tokenizer.word_index
nb_words =len(word_index.items())
embed_size= embedding_list.shape[1]
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    vec_idx = embedding_word_dict.get(word)
    if vec_idx is not None:
            embedding_vector = embedding_list[vec_idx]
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
   

In [23]:
print(len(word_index.items()))

378392


In [71]:
## model

In [72]:
#X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#Test_train = Test_train.reshape((Test_train.shape[0], 1, Test_train.shape[1]))

In [26]:
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(GRU(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(100, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 300)          113517600 
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 200)          240600    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 200)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 200)               800       
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
__________

In [29]:
#model.load_weights("toxic.hdf")
#model.fit(X_train, labels_train, batch_size=128, epochs=1)

#y_test = model2.predict([X_te], batch_size=1024, verbose=1)

In [75]:
#model.fit(X_train, labels_train, batch_size=128, epochs=1)

In [76]:
#model.save_weights("model_GRU_1.h5")

In [77]:
#preds = model.predict(Test_train)
#sample_submission = pd.read_csv(f'{path}sample_submission.csv')
#sample_submission[list_classes] = preds
#sample_submission.to_csv('submission_textgru_1.csv', index=False)

In [28]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
save_best = ModelCheckpoint('toxic.hdf', save_best_only=True, 
                           monitor='val_loss', mode='min')

history = model.fit(X_train, labels_train,batch_size=64,validation_data=(val_train, val_labels_train),
                    epochs=120, verbose=2,callbacks=[early_stopping,save_best])

Train on 159071 samples, validate on 500 samples
Epoch 1/120
 - 5648s - loss: 0.0673 - acc: 0.9781 - val_loss: 0.0316 - val_acc: 0.9877
Epoch 2/120
 - 5676s - loss: 0.0509 - acc: 0.9815 - val_loss: 0.0314 - val_acc: 0.9897
Epoch 3/120
 - 5671s - loss: 0.0451 - acc: 0.9830 - val_loss: 0.0353 - val_acc: 0.9873
Epoch 4/120
 - 5633s - loss: 0.0406 - acc: 0.9845 - val_loss: 0.0329 - val_acc: 0.9890
Epoch 5/120
 - 5603s - loss: 0.0364 - acc: 0.9859 - val_loss: 0.0385 - val_acc: 0.9870
Epoch 6/120
 - 5589s - loss: 0.0328 - acc: 0.9873 - val_loss: 0.0380 - val_acc: 0.9850
Epoch 7/120
 - 5594s - loss: 0.0296 - acc: 0.9886 - val_loss: 0.0367 - val_acc: 0.9857


In [None]:
#model.layers[0].trainable=True

In [None]:
#model.fit(X_train, labels_train, batch_size=64, epochs=10)

In [None]:
#model.save_weights("model_double_GRU.h5")

In [50]:
#list_sentences_test_1 =['We are using expect for the assertions. and React Test Utils with mocha-jsdom so that we can render out some React DOM and use standard javascript commands to test our react components.']
#test_comments_1=[]
#for text in list_sentences_test_1:
#    test_comments_1.append(text_to_wordlist(text))
    
#test_comments_sequence_1 = tokenizer.texts_to_sequences(test_comments_1)   

T#est_train_1 = sequence.pad_sequences(test_comments_sequence_1, maxlen=maxlen)

In [51]:
#preds = model.predict(Test_train_1)

In [52]:
#print(np.argmax(preds, axis=1))

[0]


In [30]:
preds = model.predict(Test_train)
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
sample_submission[list_classes] = preds
sample_submission.to_csv('submission_textlstm_4.csv', index=False)