In [2]:
import numpy as np
import pandas as pd
from typing import Union
import logging
import sys

import faiss

from jpt import (
    ken,
    alex,
    embed,
    load_jeopardy_dataset,
    load_jeopardy_index,
    load_jeopardy_embedding_data,
    clean_currency,
    index_search,
    brute_force_search
)

In [3]:
# Never truncate long string columns (the clue text) when a DataFrame is rendered.
pd.options.display.max_colwidth = None

# Route log records to stderr and only surface ERROR and above.
logging.basicConfig(level=logging.ERROR, stream=sys.stderr)
logger = logging.getLogger(__name__)

In [4]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [5]:
# Load the question records (jeopardy.data) along with their embeddings, then
# annotate every record in place:
#   - 'value'    is replaced by the result of clean_currency() on the raw value
#   - 'has_link' flags clues whose text contains an HTML anchor tag
jeopardy = load_jeopardy_embedding_data()
for record in jeopardy.data:
    record.update(
        value=clean_currency(record['value']),
        has_link='<a href' in record['question'],
    )

In [6]:
# Load the prebuilt search index (a faiss IndexHNSWFlat, per the type() check later in this notebook).
jeopardy_index = load_jeopardy_index()

In [6]:
# Look up the five records closest to the query "Jazz" through the index and
# tabulate them; each returned record carries a 'distance' field.
results_df = pd.DataFrame.from_records(
    index_search(jeopardy_index, jeopardy.data, "Jazz", k=5)
)
results_df

Unnamed: 0,category,air_date,question,value,answer,round,show_number,has_link,distance
0,MUSIC,1995-11-13,'Styles of music in the titles of the JVC & Playboy festivals',600.0,Jazz,Double Jeopardy!,2576,False,0.280104
1,MUSICAL STYLINGS,2011-09-22,"'Sandra Booker, Nina Simone & Esperanza Spalding are all sultry chanteuses of this 4-letter musical genre'",400.0,jazz,Double Jeopardy!,6209,False,0.293485
2,ALL THAT JAZZ,1997-10-31,"'This 1960s style pioneered by Ornette Coleman ""liberated"" improvisers from set melodies'",1000.0,free jazz,Double Jeopardy!,3030,False,0.294299
3,BOOKS & AUTHORS,1992-10-29,'Type of music that's the title of Toni Morrison's 1992 novel set in 1926 Harlem',400.0,Jazz,Double Jeopardy!,1874,False,0.299384
4,"STARTS WITH ""J""",1996-07-17,'Swing was the most popular style of this music in the 1930s',300.0,Jazz,Jeopardy!,2753,False,0.305021


In [8]:
# Same query with k=2, shown as the raw return value: a list of record dicts,
# each extended with a 'distance' key.
index_search(jeopardy_index, jeopardy.data, "Jazz", k=2)

[{'category': 'MUSIC',
  'air_date': '1995-11-13',
  'question': "'Styles of music in the titles of the JVC & Playboy festivals'",
  'value': 600.0,
  'answer': 'Jazz',
  'round': 'Double Jeopardy!',
  'show_number': '2576',
  'has_link': False,
  'distance': 0.28010377},
 {'category': 'MUSICAL STYLINGS',
  'air_date': '2011-09-22',
  'question': "'Sandra Booker, Nina Simone & Esperanza Spalding are all sultry chanteuses of this 4-letter musical genre'",
  'value': 400.0,
  'answer': 'jazz',
  'round': 'Double Jeopardy!',
  'show_number': '6209',
  'has_link': False,
  'distance': 0.29348546}]

In [7]:
# Run the same "Jazz" query by scanning the raw embedding matrix instead of the
# index, for comparison with the index-backed results.
results_df = pd.DataFrame.from_records(
    brute_force_search(jeopardy.embeddings, jeopardy.data, "Jazz", k=5)
)
results_df

Unnamed: 0,category,air_date,question,value,answer,round,show_number,has_link,distance
0,MUSIC,1995-11-13,'Styles of music in the titles of the JVC & Playboy festivals',600.0,Jazz,Double Jeopardy!,2576,False,0.280103
1,MUSICAL STYLINGS,2011-09-22,"'Sandra Booker, Nina Simone & Esperanza Spalding are all sultry chanteuses of this 4-letter musical genre'",400.0,jazz,Double Jeopardy!,6209,False,0.293485
2,ALL THAT JAZZ,1997-10-31,"'This 1960s style pioneered by Ornette Coleman ""liberated"" improvisers from set melodies'",1000.0,free jazz,Double Jeopardy!,3030,False,0.294298
3,BOOKS & AUTHORS,1992-10-29,'Type of music that's the title of Toni Morrison's 1992 novel set in 1926 Harlem',400.0,Jazz,Double Jeopardy!,1874,False,0.299384
4,"STARTS WITH ""J""",1996-07-17,'Swing was the most popular style of this music in the 1930s',300.0,Jazz,Jeopardy!,2753,False,0.305021


In [8]:
# Ask ken() for a contestant-style response to a clue under the given category.
# The clue is double-quoted so the possessive apostrophe needs no escaping.
ken(
    category='GOOD THINGS COME IN THREES',
    clue="This legendary alchemist first mentioned the Philosopher's Stone.")

'Who is Hermes Trismegistus?'

In [9]:
# Ask ken() for a contestant-style response to a clue under the given category.
# The clue is double-quoted so the possessive apostrophe needs no escaping.
ken(
    category='NOTABLE FRENCH NAMES',
    clue="This legendary alchemist first mentioned the Philosopher's Stone.")

'Who is Nicolas Flamel?'

In [10]:
# Ask alex() to judge a contestant response against the known correct response
# for this category and clue; the recorded verdict is shown in the cell output.
alex(
    category='GOOD THINGS COME IN THREES',
    clue='This legendary alchemist is said to have originated the hermetic tradition.',
    correct_response='Hermes Trismegistus',
    contestant_response='Who is Nicolas Flamel?')

'Incorrect. The correct response is Hermes Trismegistus.'

In [11]:
# Embed the query once up front; the timing cells below pass this precomputed
# vector instead of the raw string (presumably so the embedding call is not part of the measurement).
query_vector = embed("Jazz")

In [12]:
%%timeit
# Time the index-backed search (k=5) using the precomputed query vector.
index_search(jeopardy_index, jeopardy.data, query_vector, k=5)

311 µs ± 3.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
%%timeit
# Time the brute-force search (k=5) over the raw embeddings with the same query vector.
brute_force_search(jeopardy.embeddings, jeopardy.data, query_vector, k=5)

51.6 ms ± 966 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Inspect the concrete class of the object returned by load_jeopardy_index().
type(jeopardy_index)

faiss.swigfaiss.IndexHNSWFlat

In [21]:
# Total length of all text chunks in millions of characters; the chunks are joined
# with single spaces, so the separators are included in the count.
len(" ".join(jeopardy.chunks))/1e6

37.974765

In [23]:
Total chunk size: 38 million characters (roughly 10× the complete works of Shakespeare).

jeopardy_chunks = [ jeopardy_chunk_template.format(**q) for q in jeopardy_data ]
jeopardy_database = create_embeddings_database(jeopardy_chunks)
Embedding the chunks took 30 minutes and cost $1 using text-embedding-ada-002.

%%timeit
generate_hnsw_index(jeopardy.embeddings, "temp.index")
About 1 minute (1 min 21 s measured) to create the FAISS HNSW index.
The index is about 1.3 GB on disk, basically the same as just storing the embedding vectors.
It takes up the same amount of space in memory.

Loading the index from disk takes about 0.6 seconds:
%%timeit
load_jeopardy_index()

HNSW vs. brute force:
311 microseconds vs. 51.6 milliseconds per query
That makes HNSW about 166× faster (two orders of magnitude).

1339/ 1359

0.9852832965415746

In [19]:
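# %%timeit
# generate_hnsw_index(jeopardy.embeddings, "temp.index")
# NOTE: left commented out, presumably because each build takes over a minute
# (see the recorded timing below). generate_hnsw_index is not in this notebook's
# `from jpt import ...` list, so import it before re-enabling this cell.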

1min 21s ± 887 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
