In [2]:
%%time
import numpy as np

with open("glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

Wall time: 7.83 s


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [12]:
w2v

{b'the': array(<map object at 0x0000027E842CD6A0>, dtype=object),
 b',': array(<map object at 0x0000027E8427ABA8>, dtype=object),
 b'.': array(<map object at 0x0000027E8427AB00>, dtype=object),
 b'of': array(<map object at 0x0000027E8427AA58>, dtype=object),
 b'to': array(<map object at 0x0000027E8427A9B0>, dtype=object),
 b'and': array(<map object at 0x0000027E8427A908>, dtype=object),
 b'in': array(<map object at 0x0000027E8427A860>, dtype=object),
 b'a': array(<map object at 0x0000027E8427A7B8>, dtype=object),
 b'"': array(<map object at 0x0000027E8427A710>, dtype=object),
 b"'s": array(<map object at 0x0000027E8427A668>, dtype=object),
 b'for': array(<map object at 0x0000027E8427A5C0>, dtype=object),
 b'-': array(<map object at 0x0000027E8427A518>, dtype=object),
 b'that': array(<map object at 0x0000027E8427A470>, dtype=object),
 b'on': array(<map object at 0x0000027E8427A3C8>, dtype=object),
 b'is': array(<map object at 0x0000027E8427A320>, dtype=object),
 b'was': array(<map objec