In [3]:
import numpy as np
import pickle
from collections import defaultdict
import sys, re
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors

# Pre-trained embedding files. Both are read with the word2vec loader:
# the Google vectors as a binary file, the GloVe vectors as a text file.
w2v_file = 'google_word2vec.bin'
glove_file = 'glove_word2vec.txt'

# model1 <- Google vectors (binary word2vec file).
print("loading google word2vec vectors...")
model1 = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
print("word2vec loaded!\n")

# model2 <- GloVe vectors (plain-text word2vec file).
print("loading glove word2vec vectors...")
model2 = KeyedVectors.load_word2vec_format(glove_file, binary=False)
print("word2vec loaded!")

loading google word2vec vectors...
word2vec loaded!

loading glove word2vec vectors...
word2vec loaded!


In [6]:
def build_data(df):
    """
    Split a dataset frame into train/dev/test examples and count its vocabulary.

    Every row of df is unpacked positionally as (label, sentence, split), so
    the frame must have exactly three columns in that order. The column
    index is printed as a quick sanity check.

    Returns (train, dev, test, vocab):
      * train / dev / test -- lists of {"y": label, "text": sentence} dicts,
        chosen by the row's split value ('train', 'dev' or 'test').
      * vocab -- defaultdict(float) mapping each whitespace-separated token to
        the number of sentences that contain it (a document frequency).

    Rows whose split is none of the three known values still contribute to
    vocab but are not appended to any list.
    """
    buckets = {'train': [], 'dev': [], 'test': []}
    vocab = defaultdict(float)

    print(df.columns)
    for idx, (label, sent, split) in df.iterrows():
        # Count each word once per sentence so vocab holds document
        # frequencies. dict.fromkeys de-duplicates while keeping first-seen
        # order, so the key order of vocab stays deterministic.
        for word in dict.fromkeys(sent.split()):
            vocab[word] += 1
        datum = {"y": label, "text": sent}
        if split in buckets:
            buckets[split].append(datum)
    return buckets['train'], buckets['dev'], buckets['test'], vocab
def get_W(word_vecs, k=300):
    """
    Build an embedding matrix and the word -> row lookup that goes with it.

    Returns (W, word_idx_map). W is a float32 array with len(word_vecs) + 1
    rows and k columns; row 0 is left as zeros and row word_idx_map[word]
    holds word_vecs[word]. Indices are assigned from 1 in the iteration
    order of word_vecs.
    """
    W = np.zeros((len(word_vecs) + 1, k), dtype='float32')
    word_idx_map = {}
    # Row 0 stays all-zero; real words start at row 1.
    for row, word in enumerate(word_vecs, start=1):
        W[row] = word_vecs[word]
        word_idx_map[word] = row
    return W, word_idx_map

def get_W2(word_vecs1, word_vecs2, k=300):
    """
    Build two embedding matrices that share a single word -> row lookup.

    Returns (W1, W2, word_idx_map). Both matrices are float32 with
    len(word_vecs1) + 1 rows and k columns; row 0 is left as zeros and row
    word_idx_map[word] holds word_vecs1[word] in W1 and word_vecs2[word] in W2.

    Only the keys of word_vecs1 are indexed, so word_vecs2 must contain every
    one of them (a missing word raises KeyError); extra keys in word_vecs2 are
    ignored.
    """
    n_rows = len(word_vecs1) + 1
    W1 = np.zeros((n_rows, k), dtype='float32')
    W2 = np.zeros((n_rows, k), dtype='float32')
    word_idx_map = {}
    # Row 0 stays all-zero in both matrices; real words start at row 1.
    for row, word in enumerate(word_vecs1, start=1):
        W1[row] = word_vecs1[word]
        W2[row] = word_vecs2[word]
        word_idx_map[word] = row
    return W1, W2, word_idx_map

def add_unknown_words(word_vecs, vocab, min_df=1, k=300, rng=None):
    """
    Give every sufficiently frequent word that has no vector a random one.

    For each word in vocab that is not already a key of word_vecs and whose
    count vocab[word] is at least min_df, a length-k vector drawn uniformly
    from [-0.25, 0.25) is stored in word_vecs (the dict is modified in place;
    nothing is returned). The number of added words is printed.

    0.25 is chosen so the unknown vectors have (approximately) same variance
    as pre-trained ones.

    rng -- optional object with a ``uniform(low, high, size)`` method (e.g.
           ``np.random.RandomState(seed)`` or ``np.random.default_rng(seed)``)
           so callers can make the vectors reproducible. When omitted, the
           global ``np.random`` state is used, exactly as before.
    """
    sampler = np.random if rng is None else rng
    cnt = 0
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = sampler.uniform(-0.25, 0.25, k)
            cnt += 1
    print('missing cnt: ', cnt)

In [33]:
# Load the three datasets. NOTE: read_pickle executes pickle under the hood,
# so only point this at files you produced or otherwise trust.
mr_df = pd.read_pickle('{}.pkl'.format('MR'))
sst1_df = pd.read_pickle('{}.pkl'.format('SST1'))
sst2_df = pd.read_pickle('{}.pkl'.format('SST2'))

mr_train, mr_dev, mr_test, mr_vocab = build_data(mr_df)
sst1_train, sst1_dev, sst1_test, sst1_vocab = build_data(sst1_df)
sst2_train, sst2_dev, sst2_test, sst2_vocab = build_data(sst2_df)

# Every word seen in any of the three datasets.
combined_vocab = set.union(
    set(mr_vocab.keys()),
    set(sst1_vocab.keys()),
    set(sst2_vocab.keys()) )

# Keep only the pre-trained vectors for words we actually need.
# KeyedVectors supports `in` and `[]` directly; going through `.wv` on a
# KeyedVectors object is deprecated in gensim 3.x and removed in gensim 4.
google_w2v = dict( (w, model1[w]) for w in combined_vocab if w in model1 )
glove_w2v = dict( (w, model2[w]) for w in combined_vocab if w in model2 )

# Fill in random vectors for words missing from each embedding, one dataset
# at a time so the per-dataset "missing cnt" is reported. Words already
# added for an earlier dataset are not counted again for a later one.
for corpus_name, corpus_vocab in (('sst1', sst1_vocab),
                                  ('sst2', sst2_vocab),
                                  ('mr', mr_vocab)):
    print('google {}'.format(corpus_name))
    add_unknown_words(google_w2v, corpus_vocab)
    print('glove {}'.format(corpus_name))
    add_unknown_words(glove_w2v, corpus_vocab)

Index(['label', 'sentence', 'split'], dtype='object')
Index(['label', 'sentence', 'split'], dtype='object')
Index(['label', 'sentence', 'split'], dtype='object')


  
  from ipykernel import kernelapp as app


google sst1
missing cnt:  1626
glove sst1
missing cnt:  513
google sst2
missing cnt:  0
glove sst2
missing cnt:  0
google mr
missing cnt:  764
glove mr
missing cnt:  580


In [41]:
# Row i of google_W and glove_W is the vector of the word with
# word_idx_map[word] == i; row 0 is all zeros.
google_W, glove_W, word_idx_map = get_W2(google_w2v, glove_w2v)

# Randomly initialised baseline: starting from an empty dict makes every
# word "unknown", so each one receives a fresh uniform random vector.
rand_vecs = {}
add_unknown_words(rand_vecs, {
    w: mr_vocab.get(w, 0) + sst1_vocab.get(w, 0) + sst2_vocab.get(w, 0)
    for w in combined_vocab })

# All three matrices are saved next to a single word_idx_map, so they must
# cover exactly the same words.
assert set(rand_vecs.keys()) == combined_vocab
assert set(word_idx_map.keys()) == combined_vocab

# Place each random vector through word_idx_map so that random_W is indexed
# the same way as the pre-trained matrices (row 0 stays zero).
random_W = np.zeros_like(google_W)
for word, row in word_idx_map.items():
    random_W[row] = rand_vecs[word]

assert random_W.shape == google_W.shape == glove_W.shape

missing cnt:  18891


In [42]:
# Everything downstream code needs, bundled into one dict for pickling:
# the per-dataset example lists, the three embedding matrices, the shared
# word -> row lookup and the combined vocabulary set.
# NOTE(review): mr_dev and mr_test are produced by build_data but are not
# stored here, unlike the sst1/sst2 splits -- confirm this is intentional.
res = {
    # Lists of {"y": label, "text": sentence} dicts from build_data.
    'mr_train': mr_train,
    'sst1_train': sst1_train,
    'sst1_dev': sst1_dev,
    'sst1_test': sst1_test,
    'sst2_train': sst2_train,
    'sst2_dev': sst2_dev,
    'sst2_test': sst2_test,
    # Embedding matrices: pre-trained Google, random baseline, pre-trained GloVe.
    'google_W': google_W,
    'random_W': random_W,
    'glove_W': glove_W,
    # word -> row index as returned by get_W2.
    'word_idx_map': word_idx_map,
    # Set of every word found in the three datasets.
    'vocab': combined_vocab
}

In [44]:
# Write the bundle to disk. The context manager flushes and closes the file
# handle even if pickling fails, instead of leaving it to garbage collection.
with open('combined.p', 'wb') as out_file:
    pickle.dump(res, out_file)