In [1]:
#Download the GloVe glove.840B.300d embeddings archive (about 2 GB zipped); despite the old note, this is not the Twitter set
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

--2018-11-22 04:17:06--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2018-11-22 04:17:06--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2018-11-22 04:19:16 (16.0 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]



In [2]:
# Extract glove.840B.300d.txt from the downloaded archive.
!unzip glove.840B.300d.zip

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [0]:
# Remove the archive once extracted; later cells only read the .txt file.
!rm glove.840B.300d.zip

In [5]:
# List the working directory (long format, oldest first) to check that the extracted embeddings file is present.
!ls -lrt

total 5513912
-rw-rw-r-- 1 root root 5646236541 Oct 24  2015 glove.840B.300d.txt
drwxr-xr-x 2 root root       4096 Nov 20 18:17 sample_data


In [6]:
!pip install googledrivedownloader

Collecting googledrivedownloader
  Downloading https://files.pythonhosted.org/packages/7e/41/d59b2a5fcc7afeb40f23091694bd6e6a63ad118c93f834353ee5100285d5/googledrivedownloader-0.3-py2.py3-none-any.whl
Installing collected packages: googledrivedownloader
Successfully installed googledrivedownloader-0.3


In [7]:
from google_drive_downloader import GoogleDriveDownloader as gdd

# Google Drive file ids of the competition files, paired by position with the
# local file names they are saved under.
ids =["1EO59IOOCVcqKymPzTOVHKxTmIjEKEfW2", "1pSY29XeBuodiIZdtGDtq52MZsAJTAKdo", "1Jt7Wr3RUQRnLD0AuirxPCxjkljfumPLw"]

file_names = ["test.csv","train.csv","sample_submission.csv"]

# zip() would silently drop unmatched entries, so fail loudly if the lists drift apart.
assert len(ids) == len(file_names), "each Drive id needs exactly one destination file name"

# Download every file into the working directory; the files are fetched as-is (no unzip step).
for file_id, file_name in zip(ids, file_names):
  gdd.download_file_from_google_drive(file_id=file_id,
                                      dest_path="./"+file_name,
                                      unzip=False)

Downloading 1EO59IOOCVcqKymPzTOVHKxTmIjEKEfW2 into ./test.csv... Done.
Downloading 1pSY29XeBuodiIZdtGDtq52MZsAJTAKdo into ./train.csv... Done.
Downloading 1Jt7Wr3RUQRnLD0AuirxPCxjkljfumPLw into ./sample_submission.csv... Done.


In [8]:
import pandas as pd
import numpy as np

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [0]:
# Load the train and test splits; the first row of each CSV is used as the header.
train, test = (
    pd.read_csv(csv_name, header=0) for csv_name in ("train.csv", "test.csv")
)

In [0]:
# Series.fillna returns a new Series instead of modifying the column in place,
# so it has to be chained into the result; calling it as a bare statement leaves
# missing comments as NaN. The `train` / `test` frames themselves are not modified.
X_train = train["comment_text"].fillna("dummytext").str.lower()
X_test = test["comment_text"].fillna("dummytext").str.lower()

# Target matrix: one column per label, in this fixed order.
Y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [0]:
# Hyperparameters shared by the tokenizer, the embedding matrix and the model.
max_vocab_size = 100000  # cap on the number of word indices kept by the tokenizer
maxlen = 150             # number of token ids per comment after padding
embedding_size = 300     # length of each GloVe vector (glove.840B.300d)

In [0]:
# Build the word index. lower=False because X_train / X_test were already
# lower-cased when they were created.
# num_words only restricts which indices texts_to_sequences emits (the most
# frequent words, i.e. indices below max_vocab_size); tokenizer.word_index
# itself still holds an entry for every word in the corpus.
tokenizer = text.Tokenizer(lower=False, num_words=max_vocab_size)

# Fitting on train + test is optional; doing so lets word frequencies from the
# test set influence which words make it into the top max_vocab_size.
tokenizer.fit_on_texts(list(X_train) + list(X_test))


def _to_padded_ids(texts):
  """Convert texts to integer id sequences padded to length `maxlen`."""
  return sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


X_train = _to_padded_ids(X_train)
X_test = _to_padded_ids(X_test)

In [13]:
def _parse_glove_line(raw_line):
  """Split one line of the GloVe text file into (word, float32 vector)."""
  token, *coefficients = raw_line.rstrip().rsplit(" ")
  return token, np.asarray(coefficients, dtype='float32')


# word -> embedding vector, read line by line from the extracted GloVe file.
glove_embeddings = {}
with open("glove.840B.300d.txt", encoding="utf8") as glove_file:
  glove_embeddings.update(map(_parse_glove_line, glove_file))

print("There are: "+str(len(glove_embeddings))+" glove word embeddings")

There are: 2196016 glove word embeddings


In [0]:
# The Keras Tokenizer numbers words starting at 1 (index 0 is reserved), so the
# largest word index equals len(word_index).
max_word_index_tokenizer = len(tokenizer.word_index)

# Number of rows the embedding matrix needs: indices 0..max_word_index_tokenizer
# each need a row (hence the +1), capped at max_vocab_size. Without the +1 the
# highest-index word has no row whenever the vocabulary is smaller than the cap.
num_words = min(max_vocab_size, max_word_index_tokenizer + 1)

In [15]:
# Build the (num_words, embedding_size) weight matrix for the Embedding layer.
# Every row starts as all-ones; rows for which no GloVe vector is found keep
# that value.
oov_count = 0
embedding_matrix = np.ones((num_words, embedding_size))
for word, i in tokenizer.word_index.items():
  # Skip indices that have no row in the matrix. num_words never exceeds
  # max_vocab_size, so bounding by it also enforces the vocabulary cap while
  # avoiding an IndexError when the vocabulary is smaller than the cap.
  if i >= num_words:
    continue
  embedding_vector = glove_embeddings.get(word)

  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
  else:
    # Word is inside the kept vocabulary but has no GloVe vector.
    oov_count += 1


print("There are "+str(oov_count)+" missing glove word embeddings") 
    
  

There are 26633 missing glove word embeddings


In [16]:
sequence_input = Input(shape = (maxlen,)) # Each comment arrives as maxlen (150) token ids.

# Frozen embedding layer initialised from the GloVe matrix. input_dim is read
# from the matrix itself so the layer always matches the weights it is given,
# even when the vocabulary is smaller than max_vocab_size.
x = Embedding(input_dim = embedding_matrix.shape[0], output_dim = embedding_size, trainable = False, weights = [embedding_matrix]) (sequence_input)

# 128 is the number of hidden units of the GRU in each direction.
# return_sequences=True makes the layer return its output at every timestep
# rather than only the last one.
# Because the GRU is bidirectional, the forward and backward outputs are
# concatenated, giving 256 values per timestep; with 150 timesteps the output
# of this layer is (150, 256) per sample.
x = Bidirectional(GRU(128, return_sequences=True)) (x)

# Max over the time axis reduces (150, 256) to a single 256-vector per sample.
x = GlobalMaxPooling1D()(x)
# Six sigmoid outputs, one per label column of Y_train.
predictions = Dense(6, activation="sigmoid")(x)

model = Model(sequence_input, predictions)
model.summary()
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          30000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 256)          329472    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1542      
Total params: 30,331,014
Trainable params: 331,014
Non-trainable params: 30,000,000
_________________________________________________________________


In [17]:
# Training hyperparameters.
epochs = 4
batch_size = 128

# Keep 90% of the rows for training and hold the rest out for validation;
# the fixed random_state makes the split repeatable.
(X_train_sub,
 X_val_sub,
 Y_train_sub,
 Y_val_sub) = train_test_split(X_train, Y_train, random_state=123, train_size=0.9)



In [18]:
file_path="glove_weights_base.best.hdf5"

# Write the weights to file_path only when val_loss improves.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# NOTE(review): patience (5) is larger than the number of epochs (4), so with
# the current settings this callback does not appear able to end training early.
early = EarlyStopping(patience=5, mode="min", monitor="val_loss")

callbacks_list = [checkpoint, early]
model.fit(
    X_train_sub,
    Y_train_sub,
    validation_data=(X_val_sub, Y_val_sub),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks_list,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.03938, saving model to glove_weights_base.best.hdf5
Epoch 2/4

Epoch 00002: val_loss improved from 0.03938 to 0.03775, saving model to glove_weights_base.best.hdf5
Epoch 3/4

Epoch 00003: val_loss did not improve from 0.03775
Epoch 4/4

Epoch 00004: val_loss improved from 0.03775 to 0.03683, saving model to glove_weights_base.best.hdf5


<keras.callbacks.History at 0x7f6247304dd8>

In [19]:
# Reload the checkpointed weights saved during training before scoring the test set.
model.load_weights(file_path)
test_predictions = model.predict(
    X_test,
    verbose=1,
    batch_size=1024,
)



In [0]:
# Start from the sample submission file and overwrite its label columns with
# the model's predictions, then write the result without the index column.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission = pd.read_csv('sample_submission.csv')
submission[label_columns] = test_predictions
submission.to_csv('biGRU_Glove_840B.csv', index=False)

In [0]:
from google.colab import files

# Colab-only helper: send the submission CSV written earlier to the user's browser as a download.
files.download("biGRU_Glove_840B.csv") 

In [0]:
# Score recorded for this submission (presumably the competition leaderboard): 0.9826 private, 0.9843 public