# Imports

In [None]:
import keras
import warnings
import numpy as np
import pandas as pd
import dill
import gensim

# Load Data

In [None]:
# Read the train/test splits, using the `id` column as the row index.
df_train = pd.read_csv('train.csv', index_col='id')
df_test = pd.read_csv('test.csv', index_col='id')

# Replace missing comments with empty strings. The result is assigned back to
# the column: a chained `df[col].fillna(..., inplace=True)` operates on a
# temporary selection and does not update the frame under pandas copy-on-write.
df_train['comment_text'] = df_train['comment_text'].fillna('')

df_test['comment_text'] = df_test['comment_text'].fillna('')

In [None]:
# Show the first training row to check the column layout.
df_train.head(n=1)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
df_train.describe(include='all')

# Tokenizing

In [None]:
# Lower-case/tokenize every training comment with gensim's simple preprocessor.
simple_tokens = df_train['comment_text'].apply(gensim.utils.simple_preprocess)

# Learn phrases from the token lists and wrap the model in a Phraser, which is
# then used as the notebook's tokenizer.
phrases = gensim.models.phrases.Phrases(sentences=simple_tokens)
tokenizer = gensim.models.phrases.Phraser(phrases)

# Apply the phrase tokenizer to the whole corpus and materialise the result.
tokenized_text = [tokens for tokens in tokenizer[simple_tokens]]

In [None]:
# Peek at the tokens produced for the first comment.
tokenized_text[0]

In [None]:
# Token <-> integer id mapping built from the tokenized training comments.
corpus_dict = gensim.corpora.Dictionary(documents=tokenized_text)

In [None]:
# Every column after the first one is treated as a target label.
target_c = df_train.columns[1:]
# Label matrix as a plain NumPy array, one column per target.
targets = df_train.loc[:, target_c].values

# word2vec on comments

In [None]:
# NOTE: MAX_SEQ is passed to Word2Vec as the vector `size` below.
MAX_SEQ = 100

# Train word vectors on the tokenized comments.
word2vec = gensim.models.word2vec.Word2Vec(
    sentences=tokenized_text,
    size=MAX_SEQ,
    window=5,
    min_count=3,
    workers=80,
)

In [None]:
# Sanity check: nearest neighbours of 'hell' in the trained vector space.
word2vec.wv.most_similar('hell')

# Embedding for data

In [None]:
# Dictionary ids start at 0, but 0 is reserved for padding, so every token id
# is shifted up by one.
docs = [
    [token_id + 1 for token_id in corpus_dict.doc2idx(tokens)]
    for tokens in tokenized_text
]

In [None]:
# Pad with 0 / truncate at the end so every sequence has exactly MAX_SEQ ids.
padded_docs = keras.preprocessing.sequence.pad_sequences(
    sequences=docs,
    maxlen=MAX_SEQ,
    truncating='post',
    value=0,
)

In [None]:
# Largest (shifted) token id appearing in any document; empty documents are skipped.
max_index = max(max(doc) for doc in docs if doc)

In [None]:
# Row 0 is a random vector for the padding id; row k + 1 belongs to dictionary
# token k. Tokens without a word2vec vector get a random normal vector instead.
embeddings = np.vstack(
    [np.random.normal(size=word2vec.vector_size)] +  # for the '0' padding word
    [
        np.random.normal(size=word2vec.vector_size)
        if corpus_dict[idx] not in word2vec.wv.vocab
        else word2vec.wv[corpus_dict[idx]]
        for idx in range(max_index)
    ]
)

In [None]:
def tokens_to_embedding(tokens):
    """Average the count-scaled word2vec vectors of the known tokens.

    Each token found in the word2vec vocabulary contributes its vector divided
    by that token's vocabulary count. If none of the tokens are known, a zero
    vector of length ``word2vec.vector_size`` is returned.
    """
    vocab = word2vec.wv.vocab
    scaled = [word2vec.wv[tok] / vocab[tok].count for tok in tokens if tok in vocab]
    if not scaled:
        return np.zeros(word2vec.vector_size)
    return np.mean(scaled, axis=0)

def text_to_embedding(text):
    """Tokenize raw text with the phrase tokenizer and embed it via tokens_to_embedding."""
    raw_tokens = gensim.utils.simple_preprocess(text)
    phrase_tokens = tokenizer[raw_tokens]
    return tokens_to_embedding(phrase_tokens)

In [None]:
# Used this to save sessions of the notebook
# import dill
# #dill.dump_session('notebook_env.db')

In [None]:
def loadGloveModel(gloveFile,EMB_DIM, my_dict,em2):
    """Build an embedding matrix for the tokens of ``my_dict`` from a GloVe text file.

    Row 0 is reserved for the padding id and stays all zeros. The vector for
    dictionary id ``k`` is stored in row ``k + 1``, matching the ``idx + 1``
    shift applied when documents are converted to id sequences.

    Parameters
    ----------
    gloveFile : str
        Path to a UTF-8 GloVe text file; each line is a token followed by its
        whitespace-separated float components.
    EMB_DIM : int
        Number of columns of the returned matrix (must match the file's vectors).
    my_dict : mapping
        Its ``items()`` must yield ``(token_id, token)`` pairs.
    em2 : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(my_dict) + 1, EMB_DIM)``. Tokens missing from the
        GloVe file fall back to the global ``word2vec`` vector when available,
        otherwise their row stays zero.
    """
    print('Reading {}'.format(gloveFile))
    print ("Loading Glove Model")

    id_token_pairs = list(my_dict.items())
    # Only vectors for dictionary tokens are ever looked up, so the rest of the
    # (potentially very large) GloVe vocabulary is not kept in memory.
    wanted = {token for _, token in id_token_pairs}

    model = {}
    # `with` guarantees the file is closed even if parsing fails.
    with open(gloveFile, 'r', encoding="utf8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: nothing to parse.
                continue
            word = splitLine[0]
            if word in wanted:
                model[word] = np.asarray(splitLine[1:], dtype='float32')

    embedding_matrix = np.zeros((len(my_dict) + 1, EMB_DIM))

    for token_id, token in id_token_pairs:
        row = token_id + 1  # row 0 belongs to the padding id
        embedding_vector = model.get(token)
        if embedding_vector is not None:
            embedding_matrix[row] = embedding_vector
        elif token in word2vec.wv.vocab:
            embedding_matrix[row] = word2vec.wv[token]
    print('Done')

    return embedding_matrix

In [None]:
# Load a previously saved session
# import dill
# dill.load_session(r'D:\kaggleStuff\toxic-comment\notebook_env.db')

## RNN

In [None]:
# Raw string literal: the Windows path contains backslash sequences (\k, \e, \g)
# that are invalid escapes in a normal string. The resulting path is unchanged.
embedding_matrix = loadGloveModel(
    r'D:\kaggleStuff\embeddings\glove.twitter.27B.' + str(MAX_SEQ) + 'd.txt',
    MAX_SEQ, corpus_dict, embeddings)

In [None]:
# Show the first row of the embedding matrix.
print(embedding_matrix[:1])

In [51]:
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense, GRU, Dropout
from keras.layers import Bidirectional, Embedding, SpatialDropout1D, concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint


def exp_decay(init, fin, steps):
    """Return (init / fin) ** (1 / (steps - 1)) - 1, used below as the optimizer's decay."""
    return (init/fin)**(1/(steps-1)) - 1


# --- Training hyperparameters -------------------------------------------------
batch_size = 512
epochs = 2
# Whole batches per pass over df_train, times the number of epochs.
steps = (len(df_train) // batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)

# --- Network graph --------------------------------------------------------------
# Integer id sequences of length MAX_SEQ, looked up in the pre-built embedding matrix.
inputL = Input(shape=(MAX_SEQ,))
embL = Embedding(max_index+1, MAX_SEQ, weights=[embedding_matrix])(inputL)

sdL = SpatialDropout1D(0.2)(embL)

# Bidirectional GRU over the embedded sequence, followed by a width-2 convolution.
bgruL = Bidirectional(GRU(256, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))(sdL)
c1dL = Conv1D(filters=64, kernel_size=2, padding='valid', kernel_initializer="he_uniform")(bgruL)
dL = Dropout(0.5)(c1dL)

# Average- and max-pooled features are concatenated and fed to six sigmoid outputs.
avg_pool = GlobalAveragePooling1D()(dL)
max_pool = GlobalMaxPooling1D()(dL)
conc = concatenate([avg_pool, max_pool])
outp = Dense(6, activation="sigmoid")(conc)

# --- Compile --------------------------------------------------------------------
model = Model(inputs=inputL, outputs=outp)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=1e-3, decay=lr_decay),
    metrics=['accuracy'],
)
    

In [52]:
# Multiply the learning rate by `factor` when `val_loss` stops improving.
# `min_lr` has to be below the optimizer's starting rate (1e-3); otherwise the
# callback can never lower anything. The floor used here is `lr_fin`.
# NOTE(review): patience=5 is larger than `epochs` (2), so the callback only
# becomes active if the number of epochs is raised — confirm the intended values.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=lr_fin)

# Write the model to weights.hdf5, keeping only the best one (save_best_only=True).
checkpointer = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)


# Train on the padded id sequences; 30% of the samples are held out for validation.
model.fit(padded_docs, targets, batch_size=batch_size,epochs=epochs,
          validation_split=0.3, callbacks=[reduce_lr,checkpointer])

Train on 7000 samples, validate on 3000 samples
Epoch 1/2

KeyboardInterrupt: 

# Submission

In [None]:
# This might take some time, would be better to do it at the start and save the session
# Encode the test comments exactly like the training comments: phrase-tokenize,
# map tokens to dictionary ids shifted by one (0 is the padding id; tokens
# missing from the dictionary come back from doc2idx as -1 and therefore also
# become 0), then pad/truncate to MAX_SEQ. The model's Embedding layer consumes
# integer token ids, not averaged word vectors.
test_tokens = [tokenizer[gensim.utils.simple_preprocess(doc)] for doc in df_test.comment_text]
test_docs = [[idx + 1 for idx in corpus_dict.doc2idx(tokens)] for tokens in test_tokens]
test_inputs = keras.preprocessing.sequence.pad_sequences(test_docs, maxlen=MAX_SEQ, truncating='post', value=0)

test_outputs = model.predict(test_inputs)

In [None]:
# Start the submission frame with just the `id` column (moved out of the index).
output_df = (
    df_test
    .reset_index()
    .loc[:, ['id']]
    .copy()
)

In [None]:
# One submission column per target label: column i of the predictions holds
# the scores written under the i-th label name.
for i, target_class in enumerate(target_c):
    output_df[target_class] = test_outputs[:, i]

In [None]:
# Write the submission file without the DataFrame's row index.
output_df.to_csv(path_or_buf='sol_2.csv', index=False)