In [1]:
import json
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
import logging
import tensorflow as tf
from collections import Counter
from nltk import word_tokenize

In [2]:
# --- Sizes -----------------------------------------------------------------
# Width of every embedding vector built in this notebook.
embedding_size = 300
# Number of most-frequent tokens kept when the vocabulary is truncated.
vocab_size = 20000
# NOTE(review): the next two values are not referenced in the cells visible here.
LEAST_WORD_COUNT = 10
batch_size = 5

# --- File locations --------------------------------------------------------
word2vec_model_filepath = '/home/louner/school/ml/word2vec-GoogleNews-vectors/GoogleNews-vectors-negative300.bin'
vocab_file_path = './data/vocab'
# The word -> id mapping is written next to the vocab file, with a .json suffix.
word_id_file_path = '%s.json' % vocab_file_path

# The word2vec lookup cell further down calls `w2v.word_vec(...)`; `w2v` only
# exists if the line below is uncommented and run.
#w2v = KeyedVectors.load_word2vec_format(word2vec_model_filepath, binary=True)

In [3]:
def preprocess(sentences):
    """Return the lower-cased text of every entry whose type is exactly ``str``.

    Entries of any other type (for example ``None`` or a float NaN) are
    dropped, so the result may be shorter than the input. Order is preserved.
    """
    return [text.lower() for text in sentences if type(text) == str]

def tokenize(sentence):
    """Lazily yield the tokens that ``word_tokenize`` produces for ``sentence``."""
    yield from word_tokenize(sentence)

In [5]:
# Collect the 'comment_text' column of both CSV files, train rows first.
sentences = []
for csv_path in ('train.csv', 'test.csv'):
    df = pd.read_csv(csv_path)
    sentences.extend(df['comment_text'].values.tolist())

# Drop non-string rows and lower-case the rest.
sentences = preprocess(sentences)

# Token -> occurrence count over the combined corpus, persisted as JSON.
vocabs = Counter(tok for text in sentences for tok in tokenize(text))
with open(vocab_file_path, 'w') as f:
    json.dump(vocabs, f)

In [8]:
embed_fpath = 'data/glove.6B.50d.txt'
def read_glove(embed_fpath):
    """Load a GloVe-style text embedding file.

    Every non-empty line is read as ``<token> <v1> <v2> ... <vN>``, with the
    fields separated by single spaces.

    Args:
        embed_fpath: path of the UTF-8 text file to read.

    Returns:
        A ``(word_id, embed_mat)`` tuple. ``word_id`` maps each token to the
        index of its row in ``embed_mat``; ``embed_mat`` is a float32 array
        with one row per distinct token.

    Raises:
        ValueError: if a vector component cannot be parsed as a number.
    """
    word_id = {}
    vectors = []
    # GloVe files are UTF-8; do not depend on the platform's default encoding.
    with open(embed_fpath, encoding='utf-8') as f:
        for line in f:
            toks = line.rstrip().split(' ')
            word = toks[0]
            # Skip blank / vector-less lines, and keep only the first
            # occurrence of a token so ids and rows can never drift apart.
            if len(toks) < 2 or word in word_id:
                continue
            # The id is the row index the vector is about to occupy.
            word_id[word] = len(vectors)
            # Parse to numbers here; otherwise the matrix ends up with a
            # string dtype and is unusable as embedding weights.
            vectors.append([float(value) for value in toks[1:]])
    embed_mat = np.array(vectors, dtype=np.float32)
    return word_id, embed_mat

word_id, embed_mat = read_glove(embed_fpath)

# Corpus tokens that also have a GloVe row, in the iteration order of `vocabs`.
known_words = [w for w in vocabs if w in word_id]

# Re-number those tokens densely from 0 and gather their rows in the same order.
dictionary = {w: new_idd for new_idd, w in enumerate(known_words)}
embed_matrix = np.array([embed_mat[word_id[w]] for w in known_words])
embed_matrix.shape, len(dictionary)

((80716, 50), 80716)

In [5]:
# Send every record (DEBUG and above) of the root logger to a log file that is
# truncated each time this cell runs (mode='w').
handler = logging.FileHandler('./log/embeddding.log', mode='w')
handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)

# Ids 0 and 1 are reserved for the two special tokens.
dictionary = {'@ZERO@': 0, '@UNSEEN@': 1}
# NOTE(review): two ids are reserved above but only ONE seed row is created
# here, so each word added below gets an id one greater than the index of its
# row in embed_matrix. Confirm whether '@UNSEEN@' should get its own row.
embed_matrix = np.random.normal(loc=0.0, scale=1.0, size=(1, embedding_size)).tolist()
# Alternative seed: an all-zero row instead of a random one.
#embed_matrix = [[0]*embedding_size]

with open(vocab_file_path) as f:
    vocab = json.load(f)
# Keep only the `vocab_size` most frequent tokens.
vocab = dict(Counter(vocab).most_common(vocab_size))

absent_words = []
for word in vocab:
    try:
        vec = w2v.word_vec(word)
    except KeyError:
        # Only "word not in the model" counts as absent. Anything else (for
        # example `w2v` never having been loaded) must surface instead of
        # silently marking every word as unknown.
        absent_words.append(word)
        logger.error('UNKNOWN %s', word)
    else:
        embed_matrix.append(vec)
        dictionary[word] = len(dictionary)


In [6]:
# Turn the accumulated list of row vectors into one NumPy array and show its shape.
embed_matrix = np.array(embed_matrix)
embed_matrix.shape

(16717, 300)

In [7]:
# Number of vocabulary words for which the word2vec lookup raised.
len(absent_words)

3284

In [9]:
# Per-dimension mean and standard deviation over all rows currently in
# embed_matrix (axis=0 reduces over rows, giving one value per column).
mean = embed_matrix.mean(axis=0)
std = embed_matrix.std(axis=0)

In [10]:
# One random vector per absent word, drawn column-wise from a normal
# distribution with the per-dimension mean/std computed above.
# NOTE(review): no random seed is set in the cells visible here, so these
# values differ from run to run.
absent_matrix = np.random.normal(
    loc=mean,
    scale=std,
    size=(len(absent_words), embedding_size),
)
absent_matrix.shape

(3284, 300)

In [11]:
# NOTE(review): this rebinds `vocab_size`, which the config cell sets to 20000
# and the vocabulary-truncation cell reads; re-running that cell after this one
# would truncate to the new value instead.
vocab_size = embed_matrix.shape[0]
# Absent words receive consecutive ids starting at vocab_size + 1.
for absent_id, absent_word in enumerate(absent_words, start=vocab_size + 1):
    dictionary[absent_word] = absent_id
# Stack the randomly drawn rows underneath the rows already collected.
embed_matrix = np.concatenate((embed_matrix, absent_matrix))
embed_matrix.shape

(20001, 300)

In [12]:
# Replace embed_matrix with standard-normal noise, one row per dictionary entry.
# NOTE(review): this discards whatever embed_matrix held before this cell
# (including any pretrained vectors). Confirm that a purely random
# initialisation is what should be saved.
embed_matrix = np.random.normal(
    loc=0.0,
    scale=1.0,
    size=(len(dictionary), embedding_size),
)

In [9]:
# Wrap the matrix in a TF variable named 'W' whose initial value is embed_matrix.
W = tf.get_variable(
    name='W',
    shape=embed_matrix.shape,
    initializer=tf.constant_initializer(embed_matrix),
)
init = tf.global_variables_initializer()
# The saver is restricted to W, so the checkpoint holds only that variable.
saver = tf.train.Saver(var_list=[W])

with tf.Session() as sess:
    sess.run(init)
    saver.save(sess, './models/embed_matrix.ckpt')

In [10]:
# Persist the word -> id mapping as JSON.
with open(word_id_file_path, 'w') as f:
    f.write(json.dumps(dictionary))