In [1]:
!pip install transformers



In [2]:
import os
import sys
import torch
import json
import datetime
import numpy
from transformers import (
    BertConfig,
    BertTokenizer,
    BertModel
)
device = torch.device("cuda:0")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name()

'Tesla T4'

In [3]:
#Grant access to your local g-drive
from google.colab import drive
drive.mount('/content/drive/')
path = '/content/drive/My Drive/Colab Notebooks/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
def blogPosts(filename):
    """ Gets the id, title, and overview for all english movies """
    posts = json.load(open(filename))
    for post in posts:
        url = post['id']
        title = post['title']
        content = post['content']
        summary = post['summary']
        yield url,title.strip(),content.strip(),summary.strip()

In [5]:
posts = []
for url,title,content,summary in blogPosts(path + 'blog-posts.json'):
    posts.append([url,title,content,summary])
print('Loaded',len(posts),'blog posts')

Loaded 687 blog posts


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
print('tokenizer and model are ready')

tokenizer and model are ready


In [0]:
def encode(querystring):
    input_ids = torch.tensor(tokenizer.encode(querystring, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    flat = outputs[0][0].detach().numpy()
    encodings = numpy.reshape(flat,(flat.size//768,768))
    return encodings

In [8]:
summary = posts[0][3]
vecs = encode(summary)
print(summary)
print(len(vecs),vecs[0].size)

Weve discussed setting up a company-wide blog for a while now, and today I decided to just go ahead and do it. We looked at some of the powerful ones, but re...
43 768


In [11]:
def encodeIndex(posts):
    print(datetime.datetime.now())
    index = []
    for i in range(len(posts)):
        post = posts[i]
        url = post[0]
        title = post[1]
        sumary = post [3]
        title_encodings = encode(title)
        summary_encodings = encode(summary)
        index.append({
            "url":url,
            "title":title,
            "summary":summary,
            "title_encodings":title_encodings,
            "summary_encodings":summary_encodings
        })
        if i%100==0:
            print(datetime.datetime.now())
            print(i)
    return index
index = encodeIndex(posts)

2020-01-12 19:37:42.605111
2020-01-12 19:37:42.844411
0
2020-01-12 19:38:09.391280
100
2020-01-12 19:38:37.009291
200
2020-01-12 19:39:04.394506
300
2020-01-12 19:39:31.596519
400
2020-01-12 19:39:58.594775
500
2020-01-12 19:40:24.979496
600


## Let's build an encoding similarity search engine!

We will have a similarity function, and a search function.  Using the title_encodings and summary_encodings, make a new similarity function to replace 'weak_similarity', or change the boosts from the title and summary fields.  Who can make the best recall for k@10?

In [0]:
#Our weak_similarity function takes the dot product of all tokens and averages them together
def weak_similarity(encoding1,encoding2):
    total = 0
    dims = 0
    for a in encoding1:
        for b in encoding2:
          total+=a.dot(b)
          dims+=1
    return total/dims

In [13]:
#Test the weak_similarity function with some comparissons:
A = encode("Apples are very tasty.")
B = encode("Apple stock is high.")
C = encode("I bought a new iPhone today.")
D = encode("I ate some fruit.")

print(weak_similarity(A,B))
print(weak_similarity(B,C))
print(weak_similarity(A,C))
print(weak_similarity(A,D))
print(weak_similarity(B,D))
print(weak_similarity(C,D))

61.89774152210781
51.28725300137959
49.68180496825112
65.36221876314708
61.12438996957273
60.36614611792186


In [0]:
#Our search function takes a querystring, the index, and the similarity function
#It returns a ranked resultset ranked by descending score
def berty_searchy(querystring,index,similarity):
    q = encode(querystring)
    resultset = []
    for i in range(len(index)):
        record = index[i]
        title_similarity = similarity(q,record["title_encodings"])
        summary_similarity = similarity(q,record["summary_encodings"])
        score = title_similarity * 1.1 + summary_similarity
        resultset.append([score,index[i]])
    reranked = sorted(resultset, reverse=True, key=lambda k: k[0])
    return reranked

In [35]:
#Run a search and print the top k results!
query = "How can I measure relevance with nDCG?"
k = 20

results = berty_searchy(query,index,weak_similarity)
for res in results[0:k]:
    print(res[0],res[1]["title"])

120.80150973557951 How is search different than other machine learning problems?
118.25426682847598 What is Test Driven Search Relevancy?
115.75756584134382 Using Cassandra to Build a Naive Bayes Classifier of Users Based Upon Behavior
114.9610861571417 Real-time Doctest Checking In Vim
114.77613978801028 The Simple Power of Elasticsearch Analyzers
114.52198265040442 Understanding How CQL3 Maps to Cassandras Internal Data Structure
114.01879445630169 Search Precision and Recall By Example
113.9525506190246 What is a 'Relevant' Search Result?
113.8849110240856 Semantic Search with Latent Semantic Analysis
113.74971238000052 Zookeeper Resiliency for Solr Cloud in AWS, using Auto-Scaling Groups
113.73435646114606 Does the virtual world equal the real world?
113.51796655167664 Data Modeling For Search Relevance -- Signals and Semantics
113.38265164261287 The Simple Knowledge Organization System (SKOS), in the Context of Semantic Web Deployment
113.08331413286783 Why use a Javascript interf