In [1]:
import pandas as pd
import openai
import numpy as np
import pickle
import requests
import json
import re
from typing import Set
from typing import List
from transformers import GPT2TokenizerFast
from nltk.tokenize import sent_tokenize
from apiclient.discovery import build

In [2]:
openai.api_key = "OPEN_AI_API_KEY"
search_api_key = "SEARCH_API_KEY"

In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """count the number of tokens in a string"""
    return len(tokenizer.encode(text))

    
def extract_text(snippet: str, title: str, link: str) -> str:
    date_regex = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b'
    date = re.search(date_regex, title)
    date = date.group() if date is not None else None
    ntitle = re.sub(date_regex,'',title)
    ntitle = re.sub(r"[\(\[].*[\)\]]|\s*\.{3,}\s*|['\"]", '', ntitle)
    nsnippet = re.sub(r"[\(\[].*[\)\]]|\s*\.{3,}\s*|['\"]", '', snippet)
    return [(ntitle, nsnippet, link, count_tokens(ntitle + " " + nsnippet), date)]
    

In [44]:
prompt_query = "What is a Von Neumann Universal Constructor?"
response = openai.Completion.create(
  model="text-davinci-003",
  prompt= prompt_query,
  temperature=0.3,
  max_tokens=200,
  top_p=0.0,
  frequency_penalty=0.0,
  presence_penalty=0.0
)
full_text = "".join(response["choices"][0]["text"].split("\n"))
print(full_text)

A Von Neumann Universal Constructor is a type of self-replicating machine that is capable of constructing copies of itself using raw materials from its environment. It was first proposed by mathematician and computer scientist John von Neumann in the 1950s. The concept has been used in various fields, including robotics, artificial intelligence, and nanotechnology.


### Preprocessing + Google Search

In [45]:
''' Decided against this because it lowers search quality and essentially makes it the same as google

# Summarize initial text prompt
response = openai.Completion.create(
  model="text-davinci-003",
  prompt= full_text + "\n\nTl;dr",
  temperature=0.7,
  max_tokens=60,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=1
)

summarized_text = "".join(response["choices"][0]["text"].split("\n"))
search_query = prompt_query + ". " + summarized_text
print(search_query)
    '''
search_query = full_text
print(search_query)

A Von Neumann Universal Constructor is a type of self-replicating machine that is capable of constructing copies of itself using raw materials from its environment. It was first proposed by mathematician and computer scientist John von Neumann in the 1950s. The concept has been used in various fields, including robotics, artificial intelligence, and nanotechnology.


In [46]:
resource = build("customsearch", 'v1', developerKey=search_api_key).cse()
result = resource.list(q=search_query,cx='CUSTOM_SEARCH_URL',highRange=60).execute()
results = [] 
results.extend(result["items"])
res = []
for items in results:
    print(items['snippet'], items['title'], items['link']) 
    res += extract_text(items['title'], items['snippet'], items['link'])
df = pd.DataFrame(res, columns=["Title", "Snippet", "Link", "Tokens", "Date"])
df = df.drop_duplicates(['Title','Snippet'])
df = df.reset_index().drop('index',axis=1) # reset index
df
    

HttpError: <HttpError 400 when requesting https://customsearch.googleapis.com/customsearch/v1?q=A+Von+Neumann+Universal+Constructor+is+a+type+of+self-replicating+machine+that+is+capable+of+constructing+copies+of+itself+using+raw+materials+from+its+environment.+It+was+first+proposed+by+mathematician+and+computer+scientist+John+von+Neumann+in+the+1950s.+The+concept+has+been+used+in+various+fields%2C+including+robotics%2C+artificial+intelligence%2C+and+nanotechnology.&cx=CUSTOM_SEARCH_URL&highRange=60&key=AIzaSyBOokM1prNl7dNYq86voWIVyGKSMA9sikk&alt=json returned "Request contains an invalid argument.". Details: "[{'message': 'Request contains an invalid argument.', 'domain': 'global', 'reason': 'badRequest'}]">

In [34]:
MODEL_NAME = "davinci"
ADA_DOC = "text-embedding-ada-doc-002"
ADA_QUERY = "text-embedding-ada-query-002"
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

In [35]:
def get_embedding(text: str, model: str):
    result = openai.Embedding.create(
        model= model, 
        input=text
    )
    return result["data"][0]["embedding"]

def get_doc_embedding(text: str):
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)  #DOC_EMBEDDINGS_MODEL

def get_query_embedding(text: str):
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL) #QUERY_EMBEDDINGS_MODEL

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_doc_embedding(r.Title + " " + r.Snippet) for idx, r in df.iterrows()
    }

In [36]:
df


Unnamed: 0,Title,Snippet,Link,Tokens,Date
0,There are numerous strategies that will help y...,5.4 Memory Techniques – Student Success,https://opentextbc.ca/studentsuccess/chapter/m...,34,
1,All it takes is trying out new memorization te...,The Science of Memory: Top 10 Proven Technique...,https://zapier.com/blog/better-memory/,39,"Jun 6, 2019"
2,"Using mnemonic devices, such as acronyms, acro...",Science-Backed Memory Tips and Recall Techniqu...,https://www.usa.edu/blog/science-backed-memory...,50,"Aug 5, 2020"
3,This method of memorization connects informati...,Elaborative Rehearsal: A Better Way to Memorize,https://www.verywellhealth.com/elaborative-reh...,38,"May 27, 2022"
4,Disadvantages of Using Mnemonics. 1. They prov...,Essential Study Skills | Kinds of Mnemonics,https://college.cengage.com/collegesurvival/wo...,40,
5,Mnemonic rhymes match information with key det...,15 Techniques You Can Try To Improve Your Memo...,https://www.indeed.com/career-advice/career-de...,40,"Dec 12, 2019"
6,We will also define active listening and the b...,5.3 Improving Listening Competence | Communica...,https://courses.lumenlearning.com/suny-realwor...,44,
7,Top students use a variety of memory technique...,The Complete List of 23 Best Memory Techniques...,https://www.daniel-wong.com/2020/08/24/best-me...,35,"Oct 21, 2021"
8,There are dozens of techniques and memory tric...,How to Remember Things: 21 Proven Memory Techn...,https://www.magneticmemorymethod.com/how-to-re...,37,"Oct 7, 2022"
9,"If you can memorize new vocabulary faster, you...",10 Proven Memory Hacks: How to Remember New Vo...,https://www.theintrepidguide.com/memory-hacks-...,40,"Aug 30, 2020"


In [25]:
context_embeddings

{0: [-0.010231759399175644,
  0.0019981428049504757,
  0.00791010819375515,
  -0.00743435975164175,
  -0.0028782770968973637,
  -0.005331552587449551,
  -0.011639975011348724,
  0.0037806129548698664,
  -3.4888205846073106e-05,
  0.0011457604123279452,
  -0.003514193929731846,
  0.0017967426683753729,
  -0.016987385228276253,
  -0.006838088855147362,
  -0.0036188585218042135,
  8.538690599380061e-05,
  0.00658435607329011,
  0.0039328522980213165,
  -0.008043318055570126,
  0.0016793914837762713,
  -0.006730252411216497,
  0.013359012082219124,
  0.003132009645923972,
  -0.017647089436650276,
  -0.0028211872559040785,
  0.01307990588247776,
  -0.0023454390466213226,
  0.0016413315897807479,
  -0.013612744398415089,
  -0.003498335834592581,
  0.0018712765304371715,
  0.003878934308886528,
  -0.01173512451350689,
  -0.0139425965026021,
  -0.011462361551821232,
  -0.007098164409399033,
  0.005172969773411751,
  -0.0002939727855846286,
  -0.007967198267579079,
  0.007282120641320944,
  -0.

In [37]:
context_embeddings = compute_doc_embeddings(df)    

In [38]:
example_entry = list(context_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

0 : [-0.003945088014006615, 0.01628735102713108, -0.002778989728540182, -0.009772410616278648, 0.002761561656370759]... (12288 entries)


In [39]:
# Make the key of the dict a tuple of the tile, row, and link
document_embeddings = dict()
for key, row in df.iterrows():
    document_embeddings[(row['Title'], row['Snippet'], row['Link'])] = context_embeddings[key]
    

In [40]:
def vector_similarity(x: List[float], y: List[float]):
    """
    We could use cosine similarity or dot product to calculate the similarity between vectors.
    In practice, we have found it makes little difference. 
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query, contexts):
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_query_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

In [41]:
#Ordering the queries based on similarity to original answer
sites = order_document_sections_by_query_similarity(search_query, context_embeddings)


In [42]:
# Final site ranking based on vector embedding distance to original answer
i = 1
for site in sites:
    print(str(i) + ":", df.iloc[site[1]]['Link'])
    i += 1

1: https://www.usa.edu/blog/science-backed-memory-tips/
2: https://opentextbc.ca/studentsuccess/chapter/memory-techniques/
3: https://www.indeed.com/career-advice/career-development/techniques-to-improve-memory
4: https://www.daniel-wong.com/2020/08/24/best-memory-techniques-for-students/
5: https://www.verywellhealth.com/elaborative-rehearsal-a-better-way-to-memorize-98694
6: https://zapier.com/blog/better-memory/
7: https://www.magneticmemorymethod.com/how-to-remember-things/
8: https://courses.lumenlearning.com/suny-realworldcomm/chapter/5-3-improving-listening-competence/
9: https://college.cengage.com/collegesurvival/wong/essential_study/6e/assets/students/protected/wong_ch06_in-depthmnemonics.html
10: https://www.theintrepidguide.com/memory-hacks-how-to-memorize-vocabulary-faster/
