In [1]:
import datetime
import json
import os
import pickle
import sys

import numpy
import spacy
import torch
from numpy.testing import assert_almost_equal
# Ask spaCy to run on the GPU if it can; the result is truthy when it did.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    print('GPU!')
    # Make CUDA float tensors the torch default so tensors created from here
    # on are CUDA tensors, matching the device spaCy was just asked to use.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
# Load the pipeline used by every later cell.
# NOTE(review): presumably the BERT base uncased package shipped for
# spacy-pytorch-transformers - confirm it is installed in this environment.
nlp = spacy.load("en_pytt_bertbaseuncased_lg")

GPU!


## Basic similarity examples
Using Hugging Face's BERT model via the spacy-pytorch-transformers plugin.

In [4]:
# Run three short sentences through the pipeline.
doc = nlp("Here is some text to encode.")
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")

# Compare the first token of each "Apple" sentence.
apple_similarity = apple1[0].similarity(apple2[0])
print(apple_similarity)

# Average doc.tensor over axis 0 and report the length of the result
# (768 in the recorded run).
pooled = doc.tensor.mean(axis=0)
print(len(pooled))

0.7342852
768


In [5]:
doc = nlp("Here is some text to encode.")
# The next three lines are bare attribute look-ups on the Doc's extension
# namespace; their values are not stored or displayed here.
doc._.pytt_word_pieces_  # String values of the wordpieces
doc._.pytt_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.pytt_alignment  # Alignment between spaCy tokens and wordpieces
span = doc[2:4]

# .vector and .similarity use the transformer outputs
apple1, apple2, apple3 = (
    nlp(sentence)
    for sentence in (
        "Apple shares rose on the news.",
        "Apple sold fewer iPhones this quarter.",
        "Apple pie is delicious.",
    )
)

# Compare the first token of apple1 with the first token of each other doc.
# The recorded run printed ~0.734 for apple2 and ~0.434 for apple3.
first_token = apple1[0]
for other_doc in (apple2, apple3):
    print(first_token.similarity(other_doc[0]))

0.7342852
0.4336574


## TMDB Utils

In [7]:
def rawTmdbMovies(filename):
    """ Load the TMDB dump stored at `filename` and return the parsed JSON.

        The file is opened inside a `with` block so the handle is closed as
        soon as parsing finishes or fails, instead of being left for the
        garbage collector.

        Raises whatever `open` or `json.load` raise (e.g. FileNotFoundError,
        json.JSONDecodeError). """
    with open(filename) as tmdbFile:
        return json.load(tmdbFile)

def indexableMovies(filename='tmdb.json'):
    """ Generator over the movies in a TMDB dump, in the same spirit as the
        generators used to feed ES bulk index/update actions.

        Yields one `(movieId, title, overview)` tuple per entry of the
        mapping returned by `rawTmdbMovies(filename)`. `title` and `overview`
        are whitespace-stripped strings; a field that is missing or is not a
        `str` is yielded as ''. """
    def _textField(record, key):
        # Only genuine strings are kept; anything else collapses to ''.
        value = record[key] if key in record.keys() else None
        return value.strip() if isinstance(value, str) else ''

    for movieId, tmdbMovie in rawTmdbMovies(filename).items():
        yield movieId, _textField(tmdbMovie, 'title'), _textField(tmdbMovie, 'overview')

## Using tensors for reranking documents

In this module, we explore how the tensor embeddings produced by BERT can be used for text similarity. We'll use them in a reranking demo to see some very interesting results!

This notebook only saves the tensors to disk; the notebook 'tmdb-tensor-rerank' then uses these tensors for the search and reranking demo.

For each movie, run the overview text through BERT and save the resulting tensor for use in comparisons. Warning! This produces lots of data. Each overview is expanded to a tensor with an average compressed size of 300K.

In [9]:
# Keep [movieid, title, overview] for every movie with a non-empty overview;
# entries without overview text are skipped.
movies = [
    [movieid, title, overview]
    for movieid, title, overview in indexableMovies('../tmdb.json')
    if overview
]

# Every pickle is written below ./vectors. Create the directory up front so a
# fresh checkout does not fail on every single movie.
os.makedirs('vectors', exist_ok=True)

print(datetime.datetime.now())
for i, movie in enumerate(movies):
    try:
        # Run the overview through the pipeline and keep its tensor.
        vectors = nlp(movie[2]).tensor
        with open('vectors/' + str(movie[0]) + '.pickle','wb') as outfile:
            pickle.dump(vectors,outfile)
    except Exception as err:
        # Best effort: report the failing movie and carry on with the rest.
        # The handler used to reference an undefined name (`text`), so any
        # failure aborted the run with a NameError. Catching Exception rather
        # than using a bare `except` lets Ctrl-C stop this long-running cell.
        print(i, movie[2][0:24], repr(err))
    # Progress heartbeat: timestamp and counter every 1000 movies.
    if i%1000==0:
        print(datetime.datetime.now())
        print(i)

2019-10-24 16:22:17.558928
2019-10-24 16:22:17.622519
0
2019-10-24 16:22:42.412443
1000
2019-10-24 16:23:07.715188
2000
2019-10-24 16:23:32.451480
3000
2019-10-24 16:23:57.663494
4000
2019-10-24 16:24:23.309235
5000
2019-10-24 16:24:48.854009
6000
2019-10-24 16:25:16.344631
7000
2019-10-24 16:25:45.511870
8000
2019-10-24 16:26:13.441363
9000
2019-10-24 16:26:41.215524
10000
2019-10-24 16:27:07.984932
11000
2019-10-24 16:27:35.607620
12000
2019-10-24 16:28:03.338990
13000
2019-10-24 16:28:31.548323
14000
2019-10-24 16:28:59.621166
15000
2019-10-24 16:29:25.329293
16000
2019-10-24 16:29:52.113955
17000
2019-10-24 16:30:19.586912
18000
2019-10-24 16:30:47.977656
19000
2019-10-24 16:31:19.052327
20000
2019-10-24 16:31:50.168801
21000
2019-10-24 16:32:20.282288
22000
2019-10-24 16:32:49.722497
23000
2019-10-24 16:33:19.104779
24000
2019-10-24 16:33:48.558582
25000
2019-10-24 16:34:18.120400
26000
2019-10-24 16:34:48.046921
27000


In [10]:
# Load one of the pickles written by the export cell and inspect it.
# NOTE: pickle.load can execute arbitrary code; only use it on files this
# notebook produced itself.
tensor = None
with open("vectors/100402.pickle", "rb") as pickle_file:
    tensor = pickle.load(pickle_file)

# m is the length of the outer dimension and n the length of the first row
# (135 and 768 in the recorded run).
print('m =', len(tensor))
print('n =', len(tensor[0]))
print(tensor)

m = 135
n = 768
[[-5.1900923e-01 -4.5147657e-01 -1.6509795e-01 ... -4.2124951e-01
   7.7184802e-01 -6.4559288e-02]
 [-8.0238420e-01 -9.1747904e-01 -8.3499902e-01 ...  8.7159149e-02
   6.2576610e-01 -5.1558369e-01]
 [-1.7619914e+00 -2.6495035e+00 -1.4534602e+00 ... -2.4488549e+00
   1.1497426e+00 -1.1555943e+00]
 ...
 [ 1.1116136e-03 -4.3683195e-01  4.0216276e-01 ...  2.3511739e-01
   8.4110722e-03 -9.5739168e-01]
 [-1.9781365e-01 -5.2316529e-01  1.3382150e-01 ...  1.8816410e-01
   3.3785015e-02 -2.9393935e-01]
 [ 4.6562946e-01 -3.3216679e-01 -2.7836835e-01 ...  9.7025990e-02
  -1.6230325e-01 -3.9267242e-01]]
