In [13]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# --- Pretrained embedding source ---
# Text file holding the pretrained word vectors.
path = 'data/glove.6B.100d.txt'
# Width of each embedding vector (columns of the embedding matrix).
embedding_size = 100

# --- Text preprocessing limits ---
# Vocabulary cap handed to the tokenizer and used as the embedding matrix row count.
max_vocab_size = 20000
# Every tokenised comment is padded/truncated to this many tokens.
max_sequence_length = 100

In [3]:
def process_embedding_file(path):
    """Load a whitespace-delimited word-embedding text file into a dict.

    Each non-blank line is split on whitespace; the first field is used as
    the token and the remaining fields are parsed as its vector components.

    Parameters
    ----------
    path : str
        Location of the embedding file. It is opened as UTF-8 text.

    Returns
    -------
    dict
        Maps each token to a ``float32`` numpy array built from the rest of
        its line. If a token occurs on several lines, the last one wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be converted to a float.
    """
    word2vec = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # A blank or whitespace-only line (e.g. a stray newline at the end
            # of the file) has no token; skip it rather than fail on fields[0].
            if not fields:
                continue
            word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return word2vec
            
# Parse the embedding file at `path` once and keep the resulting
# token -> vector dictionary in memory for the cells that follow.
word2vec = process_embedding_file(path)
            

In [4]:
# Sanity check: report how many distinct tokens were loaded into word2vec.
print('length of word embeddings : {}'.format(len(word2vec)))

length of word embeddings : 400000


In [5]:
# Names of the label columns that make up the prediction target.
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Read the training CSV and discard every row that has a null in any column.
df_train = pd.read_csv('data/train.csv').dropna()

# Raw comment strings (model input) and the matching label matrix (model output).
sentences = df_train.loc[:, 'comment_text'].values
target = df_train.loc[:, labels].values

In [12]:
# Fit a tokenizer on the training comments, capped via num_words=max_vocab_size.
t = Tokenizer(num_words=max_vocab_size)
t.fit_on_texts(sentences)

# word_index is not truncated by num_words: the saved output reports far more
# entries than max_vocab_size.
word2idx = t.word_index
print('Number of unique words : {}'.format(len(word2idx)))

# Convert each comment to a list of integer ids, then pad/truncate every list
# to max_sequence_length so the result is a rectangular array.
sequences = t.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

Number of unique words : 210337


In [16]:
# Build the embedding weight matrix: row i holds the pretrained vector of the
# word whose tokenizer index is i. Rows for words that have no pretrained
# vector (and any index never assigned) stay all-zero.
embedding_matrix = np.zeros((max_vocab_size, embedding_size))
for word, index in word2idx.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if index >= max_vocab_size:
        continue
    embedding_vector = word2vec.get(word)
    # Word is absent from the pretrained embeddings: leave its row as zeros.
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector


In [18]:
# Embedding layer initialised from the pretrained matrix; trainable=False
# keeps those weights fixed instead of updating them during training.
embedding_layer = Embedding(
    input_dim=max_vocab_size,
    output_dim=embedding_size,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)