# TMDB BERT Tensors

Gives a short intro to using BERT.

Runs the TMDB overview text through BERT and saves the output tensors as pickle files for later use.

In [2]:
import datetime
import json
import os
import pickle
import sys

import cupy
import numpy
import spacy
import torch
from numpy.testing import assert_almost_equal
# Ask spaCy to prefer the GPU; the result is kept in `is_using_gpu` and checked below.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    print('GPU!')
    # Make newly created torch tensors default to CUDA float tensors.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
# Load the pipeline named "en_trf_bertbaseuncased_lg"; later cells call `nlp` on text.
nlp = spacy.load("en_trf_bertbaseuncased_lg")

## Basic similarity examples
Using Hugging Face's BERT model and the spacy-pytorch-transformers plugin.

In [3]:
# Run three example sentences through the pipeline loaded into `nlp`.
doc = nlp("Here is some text to encode.")
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
# Similarity between the first token ("Apple") of the two sentences.
print(apple1[0].similarity(apple2[0]))
# Length of doc.tensor averaged over axis 0
# (presumably one row per token, so this is the embedding width -- TODO confirm).
print(len(doc.tensor.mean(axis=0)))

0.73428494
768


In [4]:
doc = nlp("Here is some text to encode.")
# The next three lines are bare attribute accesses: they only show which
# extension attributes exist; nothing is stored or displayed.
doc._.trf_word_pieces_  # String values of the wordpieces
doc._.trf_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.trf_alignment  # Alignment between spaCy tokens and wordpieces
# Slice of tokens 2..3 of `doc`; not used again in this cell.
span = doc[2:4]
# .vector and .similarity use the transformer outputs
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")
# Compare the first token ("Apple") of the first sentence with the first token
# of each of the other two sentences.
print(apple1[0].similarity(apple2[0]))  # approx. 0.734 (exact digits vary slightly between runs)
print(apple1[0].similarity(apple3[0]))  # approx. 0.434 (exact digits vary slightly between runs)

0.73428494
0.4336571


## TMDB Utils

In [5]:
def rawTmdbMovies(filename):
    """Load a TMDB JSON dump from disk and return the parsed object.

    Args:
        filename: Path to the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Use a context manager so the file handle is closed promptly, and read as
    # UTF-8 explicitly so non-ASCII text does not depend on the platform's
    # default locale encoding.
    with open(filename, encoding='utf-8') as movie_file:
        return json.load(movie_file)

def indexableMovies(filename='tmdb.json'):
    """ Lazily walks the TMDB movies stored in `filename`, in the same spirit
        as the generators used to feed ES Bulk index/update actions.

        Yields one (movieId, title, overview) tuple per movie. A title or
        overview that is missing or is not a string is replaced by '';
        string values are stripped of surrounding whitespace. """
    def _clean_text(record, key):
        # Only genuine strings are kept; anything else collapses to ''.
        value = record.get(key)
        return value.strip() if isinstance(value, str) else ''

    for movie_id, record in rawTmdbMovies(filename).items():
        yield movie_id, _clean_text(record, 'title'), _clean_text(record, 'overview')

## Using tensors for reranking documents

In this module, we explore how the tensor embeddings produced by BERT can be used for text similarity.  We'll use this for a reranking demo to see some very interesting results!

This notebook only saves the tensors to disk; the notebook 'tmdb-tensor-rerank' then uses these tensors for the search and reranking demo.

For each movie, run the overview text through BERT and save the resulting tensor for use in comparisons.  Warning!  This produces lots of data.  Each overview is expanded to a tensor with an average compressed size of 300K.

In [6]:
# Collect [movie id, title, overview] for every movie that has a non-empty
# overview; movies without overview text are skipped.
movies = []
for movieid,title,overview in indexableMovies('../tmdb.json'):
    if overview:
        movies.append([movieid,title,overview])

# Make sure the output directory exists so the cell also works on a fresh checkout.
os.makedirs('vectors', exist_ok=True)

print(datetime.datetime.now())
for i, movie in enumerate(movies):
    try:
        # Run the overview through the pipeline and copy the resulting tensor
        # to host memory as a numpy array before pickling it.
        vectors = cupy.asnumpy(nlp(movie[2]).tensor)
        with open('vectors/' + str(movie[0]) + '.pickle','wb') as outfile:
            pickle.dump(vectors,outfile)
    except Exception as e:
        # Best effort: report the failing movie and keep going. The original
        # handler printed an undefined name (`text`), which raised NameError
        # and aborted the whole run on the first failure. Catching Exception
        # rather than everything keeps Ctrl-C / kernel interrupt working.
        print(i,movie[2][0:24],repr(e))
    # Progress report (timestamp + index) every 1000 movies.
    if i%1000==0:
        print(datetime.datetime.now())
        print(i)

2019-11-01 15:02:06.131660
2019-11-01 15:02:06.349726
0
2019-11-01 15:05:01.762650
1000
2019-11-01 15:07:58.303642
2000
2019-11-01 15:10:44.196243
3000
2019-11-01 15:13:30.769849
4000
2019-11-01 15:16:22.664147
5000
2019-11-01 15:19:12.125798
6000
2019-11-01 15:22:12.464227
7000
2019-11-01 15:25:26.327792
8000
2019-11-01 15:28:27.298205
9000
2019-11-01 15:31:30.420762
10000
2019-11-01 15:34:25.322165
11000
2019-11-01 15:37:26.900859
12000
2019-11-01 15:40:29.489763
13000
2019-11-01 15:43:33.136095
14000
2019-11-01 15:46:36.670220
15000
2019-11-01 15:49:21.972174
16000
2019-11-01 15:52:13.869232
17000
2019-11-01 15:55:08.838274
18000
2019-11-01 15:58:11.433081
19000
2019-11-01 16:01:37.278798
20000
2019-11-01 16:05:02.406734
21000
2019-11-01 16:08:15.627856
22000
2019-11-01 16:11:28.437668
23000
2019-11-01 16:14:41.070264
24000
2019-11-01 16:18:08.819549
25000
2019-11-01 16:21:45.180761
26000
2019-11-01 16:25:04.495194
27000


In [7]:
def examine_tensor(movieid):
    """Load the pickled tensor saved for one movie and print its size and contents.

    Args:
        movieid: Id of the movie; the tensor is read from
            ``vectors/<movieid>.pickle``. May be a str or an int.

    Prints ``m`` (the length of the tensor), ``n`` (the length of its first
    row, or 0 when the tensor is empty) and then the tensor itself.

    Raises:
        OSError: If the pickle file cannot be opened.
    """
    # str() lets callers pass an int id, matching how the files are named
    # when they are written.
    path = "vectors/" + str(movieid) + ".pickle"
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this notebook.
    with open(path, "rb") as input_file:
        tensor = pickle.load(input_file)
    print('m =',len(tensor))
    # Guard against an empty tensor, where tensor[0] would raise IndexError.
    print('n =',len(tensor[0]) if len(tensor) else 0)
    print(tensor)
# Inspect the tensor saved in vectors/100402.pickle.
examine_tensor('100402')

m = 135
n = 768
[[-5.1900893e-01 -4.5147708e-01 -1.6509822e-01 ... -4.2124975e-01
   7.7184820e-01 -6.4559720e-02]
 [-8.0238360e-01 -9.1748035e-01 -8.3499974e-01 ...  8.7159008e-02
   6.2576568e-01 -5.1558346e-01]
 [-1.7619932e+00 -2.6495075e+00 -1.4534638e+00 ... -2.4488537e+00
   1.1497415e+00 -1.1555958e+00]
 ...
 [ 1.1101943e-03 -4.3683273e-01  4.0216300e-01 ...  2.3511750e-01
   8.4112855e-03 -9.5739186e-01]
 [-1.9781429e-01 -5.2316445e-01  1.3382168e-01 ...  1.8816370e-01
   3.3785544e-02 -2.9394013e-01]
 [ 4.6562755e-01 -3.3216560e-01 -2.7836701e-01 ...  9.7025424e-02
  -1.6230188e-01 -3.9267188e-01]]
