In [1]:
import openai
import json
import tenacity
from typing import List

import joblib

import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.spatial.distance import cosine
from numpy import dot
from numpy.linalg import norm

import logging
import sys

from util import Credentials, TemporarySeed, total_size
import random

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
pd.set_option('display.max_colwidth', None)

logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
logger = logging.getLogger(__name__)

In [4]:
credentials = Credentials.load('openai_credentials.yaml')
client = openai.OpenAI(
    organization=credentials.organization, 
    api_key=credentials.api_key
)

In [5]:
@tenacity.retry(
    wait=tenacity.wait_exponential(min=0.1, max=10),
    stop=tenacity.stop_after_attempt(3), # because 4 is too many and 2 isn't enough.
    after=tenacity.after_log(logger, logging.ERROR),
    reraise=True)
def gpt(prompt):
    chat_response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    content = chat_response.choices[0].message.content
    return content

In [6]:
gpt('sup, bro?')

'Hello! How can I assist you today?'

In [7]:
with open(r'D:\Dropbox\data\jeopardy\jeopardy.json') as fin:
    jeopardy_data = json.load(fin)

In [8]:
def random_jeopardy_question():
    return random.sample(jeopardy_data, 1)[0]

q = random_jeopardy_question()
q

{'category': 'BUSINESS & INDUSTRY',
 'air_date': '1993-11-17',
 'question': "'In 1952 Kemmons Wilson opened the first motel in this chain, on the outskirts of Memphis'",
 'value': '$1,000',
 'answer': 'Holiday Inn',
 'round': 'Jeopardy!',
 'show_number': '2118'}

In [9]:
@tenacity.retry(
    wait=tenacity.wait_exponential(min=0.1, max=10),
    stop=tenacity.stop_after_attempt(3), # because 4 is too many and 2 isn't enough.
    after=tenacity.after_log(logger, logging.ERROR),
    reraise=True)
def openai_embeddings_create(batch):
    return client.embeddings.create(
      model="text-embedding-ada-002",
      input=batch,
      encoding_format="float"
    )

def create_embeddings_database(chunks: List[str], batch_size=2048) -> np.ndarray:
    database = None
    n = len(chunks)
    
    for batch_index in range(0, n, batch_size):
        # call the OpenAI embeddings API for each batch
        batch = chunks[batch_index:batch_index+batch_size]
        embedding_response = openai_embeddings_create(batch)
        embeddings = embedding_response.data

        # defer creation of the database until we know the embedding dimension
        if database is None:
            m = len(embeddings[0].embedding)    
            database = np.zeros(shape=(n, m), dtype='float32')

        # populate database
        for i, embedding in enumerate(embeddings):
            index = batch_index + i
            database[index, :] = embedding.embedding

    return database

In [10]:
def embed(chunk: str) -> np.ndarray:
    db = create_embeddings_database([chunk])
    return db[0]

test_chunk = embed("The King and I")
test_chunk.shape

(1536,)

In [11]:
movie_titles = [
    "The Shawshank Redemption",
    "The Godfather",
    "The Dark Knight",
    "Pulp Fiction",
    "Schindler's List",
    "The Lord of the Rings: The Return of the King",
    "Inception",
    "Fight Club",
    "Forrest Gump",
    "The Matrix"
]

movie_database = create_embeddings_database(movie_titles, batch_size=3)
print(movie_database.shape)

(10, 1536)


In [12]:
movie_database[:, :3]

array([[-0.00141219, -0.04665427, -0.00372241],
       [ 0.00115975, -0.02712612, -0.00477172],
       [-0.01121753, -0.02616804, -0.01262515],
       [-0.00022746, -0.02844045, -0.00606565],
       [-0.02149398, -0.03689073, -0.01679297],
       [-0.0089068 , -0.02844654, -0.02965169],
       [ 0.00983392, -0.01453309, -0.01118691],
       [-0.02976511, -0.03190778, -0.03022978],
       [-0.01263725, -0.03492859,  0.00914044],
       [-0.01052376, -0.02687083, -0.03023948]], dtype=float32)

In [13]:
np.allclose(norm(movie_database, axis=1), 1)

True

In [14]:
def best_k(database: np.ndarray, query: np.ndarray, k: int = 5):
    # Normalize the query vector
    query = query / np.linalg.norm(query)
    
    # Compute cosine distances
    distances = 1.0 - (database @ query)

    # Find the indices of the k smallest distances
    best_k_unsorted = np.argpartition(distances, k)[:k]

    # Sort these indices by distance
    sorted_indices = np.argsort(distances[best_k_unsorted])
    best_k_sorted = best_k_unsorted[sorted_indices]
    
    return best_k_sorted


In [15]:
best_k(movie_database, test_chunk, k=3)

array([5, 1, 9], dtype=int64)

In [16]:
top_k_indices = best_k(movie_database, embed("The Hobbit"), k=3)
print(top_k_indices)
for index in top_k_indices:
    print(movie_titles[index])

[5 2 9]
The Lord of the Rings: The Return of the King
The Dark Knight
The Matrix


In [17]:
len(jeopardy_data), type(jeopardy_data)

(216930, list)

In [18]:
with TemporarySeed(42):
    jeopardy_sample = random.sample(jeopardy_data, k=100)
print(len(jeopardy_sample))
print(jeopardy_sample[0])

100
{'category': 'A FEW GENTLEMEN OF VERONA', 'air_date': '2007-12-07', 'question': "'Verona-born tenor Nino Martini performed brilliantly at this NYC location from 1933 to 1946'", 'value': '$1200', 'answer': 'the Metropolitan Opera', 'round': 'Double Jeopardy!', 'show_number': '5350'}


In [19]:
jeopardy_chunk_template = '''JEOPARDY QUESTION:
category: {category!r}
clue: {question}
correct response: {answer!r}'''

In [20]:
jeopardy_sample_chunks = [ jeopardy_chunk_template.format(**q) for q in jeopardy_sample ]
print(len(jeopardy_sample_chunks))
print(jeopardy_sample_chunks[0])

100
JEOPARDY QUESTION:
category: 'A FEW GENTLEMEN OF VERONA'
clue: 'Verona-born tenor Nino Martini performed brilliantly at this NYC location from 1933 to 1946'
correct response: 'the Metropolitan Opera'


In [21]:
jeopardy_sample_database = create_embeddings_database(jeopardy_sample_chunks)

In [22]:
top_k_indices = best_k(jeopardy_sample_database, embed("Jazz"), k=5)
print(top_k_indices)
for index in top_k_indices:
    print(jeopardy_sample_chunks[index])
    print()

[42 38 14  0 82]
JEOPARDY QUESTION:
category: 'FILE UNDER "A"'
clue: 'It's a violent dance performed by Parisian couples, not by American Indians, as its name implies'
correct response: 'Apache Dance'

JEOPARDY QUESTION:
category: 'FROM CLASSICAL TUNES'
clue: '1963 novelty song that used the "Dance Of The Hours" from "La Gioconda", heard <a href="http://www.j-archive.com/1998-02-17_DJ_21.mp3">here</a>:'
correct response: '"Hello Muddah, Hello Fadduh (A Letter From Camp)"'

JEOPARDY QUESTION:
category: 'DISNEY MOVIES'
clue: 'With special sound equipment needed in the auditorium, this 1940 film was initially released in only 14 theaters'
correct response: 'Fantasia'

JEOPARDY QUESTION:
category: 'A FEW GENTLEMEN OF VERONA'
clue: 'Verona-born tenor Nino Martini performed brilliantly at this NYC location from 1933 to 1946'
correct response: 'the Metropolitan Opera'

JEOPARDY QUESTION:
category: 'COMPOSERS & THEIR WORKS'
clue: 'He wrote the score for the 1954 film "On the Waterfront" 3 year

In [23]:
# jeopardy_chunks = [ jeopardy_chunk_template.format(**q) for q in jeopardy_data ]
# print(len(jeopardy_chunks))

In [24]:
#jeopardy_database = create_embeddings_database(jeopardy_chunks)

In [25]:
# jeopardy_datafile = {
#     'vdb': jeopardy_database, 
#     'chunks': jeopardy_chunks,
#     'questions': jeopardy
# }
# joblib.dump(jeopardy_datafile, r'D:\Dropbox\data\jeopardy\jeopardy_vdb2.joblib')

In [26]:
# np.savez(r'D:\Dropbox\data\jeopardy\jeopardy_vdb.npz', vdb=jeopardy_database, chunks=jeopardy_chunks)

In [27]:
data = joblib.load(r'D:\Dropbox\data\jeopardy\jeopardy_vdb2.joblib')
jeopardy_chunks = data['chunks']
jeopardy_database = data['vdb']
jeopardy_data = data['questions']

In [28]:
def print_best_k(vdb, chunks, query: str, k=5):
    top_k_indices = best_k(vdb, embed(query), k=k)
    for index in top_k_indices:
        print(chunks[index])
        print()

In [29]:
print_best_k(jeopardy_database, jeopardy_chunks, "The sound of raindrops on the windowsill")

JEOPARDY QUESTION:
category: 'RHYMES WITH RAIN'
clue: 'It's a sheet of glass in a window'
correct response: 'pane'

JEOPARDY QUESTION:
category: "IT'S RAINING"
clue: 'Petrichor is this distinctive sensation, a pleasant accompaniment to the first rain after dry weather'
correct response: 'the smell of rain'

JEOPARDY QUESTION:
category: 'WEATHER WORDS & PHRASES'
clue: '<a href="http://www.j-archive.com/media/2005-04-20_J_16.mp3">They</a> sound soothing to some'
correct response: 'wind chimes'

JEOPARDY QUESTION:
category: 'THE RAIN'
clue: '(Alex: Here is Al Roker with the clue)  When I predict this, from Middle English for "to fall", expect slow-falling raindrops less than .02" across'
correct response: 'drizzle'

JEOPARDY QUESTION:
category: 'OLD SONG LYRICS'
clue: 'The Cascades:<br />"Listen to the rhythm of the ___ ___ telling me just what a fool I've been"'
correct response: 'falling rain'



In [30]:
%%timeit -n 3 -r 3
query_vector = embed("not jazz")

247 ms ± 8.75 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [31]:
query_vector = embed("not jazz")

In [32]:
%%timeit -n 3 -r 3
best_k(jeopardy_database, query_vector, k=5)

53 ms ± 2.14 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [33]:
print('total size:', total_size(data)/1e9)

print('vdb type:', type(jeopardy_database))
print('vdb dtype:', jeopardy_database.dtype)
print('vdb shape:', jeopardy_database.shape)
print('vdb estimated size:', jeopardy_database.shape[0] * jeopardy_database.shape[1] * 4/1e9, 'GB')
print('vdb size:', total_size(jeopardy_database)/1e9, 'GB')
print()

print('chunks type:', type(jeopardy_chunks))
print('chunks length:', len(jeopardy_chunks))
print('chunks[0] type:', type(jeopardy_chunks[0]))
print('chunk estimated size:', len(','.join(jeopardy_chunks))/1e9, 'GB')
print('chunks size:', total_size(jeopardy_chunks)/1e9, 'GB')
print()

print('questions type:', type(jeopardy))
print('questions len:', len(jeopardy))
print('questions[0] type:', type(jeopardy[0]))
print('questions[0] len:', len(jeopardy[0]))
print('questions size:', total_size(jeopardy)/1e9, 'GB')

total size: 1.549798098
vdb type: <class 'numpy.ndarray'>
vdb dtype: float32
vdb shape: (216930, 1536)
vdb estimated size: 1.33281792 GB
vdb size: 1.33281792 GB

chunks type: <class 'list'>
chunks length: 216930
chunks[0] type: <class 'str'>
chunk estimated size: 0.037974765 GB
chunks size: 0.050316693 GB



NameError: name 'jeopardy' is not defined

In [None]:
def rag(category, clue):
    query = f'''{category}\n{clue}'''
    query_vector = embed(query)
    top_index = best_k(jeopardy_database, query_vector, k=1)[0]
    answer = jeopardy[top_index]['answer']
    return f'What is {answer}?'

In [None]:
rag('HISTORY', 'For the last 8 years of his life, Galileo was under house arrest for espousing this man\'s theory')

In [None]:
import faiss

In [None]:
l2_index = faiss.IndexFlatL2(1536)
l2_index.add(jeopardy_database)

In [None]:
distances, top_k_indices = l2_index.search(query_vector.reshape(1, -1), k=5)
for index in top_k_indices[0]:
    print(jeopardy_chunks[index])
    print()

In [None]:
%%timeit
distances, top_k_indices = l2_index.search(query_vector.reshape(1, -1), k=5)

In [None]:
inner_product_index = faiss.IndexFlatIP(1536)
inner_product_index.add(jeopardy_database)

In [None]:
distances, top_k_indices = inner_product_index.search(query_vector.reshape(1, -1), k=5)
for index in top_k_indices[0]:
    print(jeopardy_chunks[index])
    print()

In [None]:
%%timeit
distances, top_k_indices = inner_product_index.search(query_vector.reshape(1, -1), k=5)

In [None]:
hnsw_index = faiss.IndexHNSWFlat(1536, 32)
hnsw_index.add(jeopardy_database)

In [None]:
query_vector = embed('Love')

In [None]:
distances, top_k_indices = hnsw_index.search(query_vector.reshape(1, -1), k=5)
for index in top_k_indices[0]:
    print(jeopardy_chunks[index])
    print()

In [None]:
%%timeit
distances, top_k_indices = hnsw_index.search(query_vector.reshape(1, -1), k=5)

In [None]:
99.2e-3 / 355e-6

In [None]:
hnsw_scalar_quantized_index = faiss.IndexHNSWSQ(1536, faiss.ScalarQuantizer.QT_8bit, 32)
hnsw_scalar_quantized_index.train(jeopardy_database)
hnsw_scalar_quantized_index.add(jeopardy_database)

In [None]:
distances, top_k_indices = hnsw_scalar_quantized_index.search(query_vector.reshape(1, -1), k=5)
for index in top_k_indices[0]:
    print(jeopardy_chunks[index])
    print()

In [None]:
%%timeit
distances, top_k_indices = hnsw_scalar_quantized_index.search(query_vector.reshape(1, -1), k=5)

In [None]:
faiss.write_index(hnsw_index, r'D:\dropbox\data\jeopardy\jeopardy_faiss_hnsw.index')

In [None]:
%%timeit
faiss.read_index(r'D:\dropbox\data\jeopardy\jeopardy_faiss_hnsw.index')