In [None]:
import ast
import os

import numpy as np
import openai
import pandas as pd

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for *text* from the OpenAI embeddings endpoint.

    Newlines in *text* are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``'embedding'`` field of the first entry in the response's
        ``'data'`` list (presumably a list of floats; the response schema
        is defined by the OpenAI API, not by this file).
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(model=model, input=[single_line])
    first_entry = response['data'][0]
    return first_entry['embedding']

def cosine_similarity(A, B):
    """Return the cosine similarity between two vectors.

    Args:
        A: First vector, as any 1-D sequence or array of numbers.
        B: Second vector, with the same length as ``A``.

    Returns:
        The cosine of the angle between ``A`` and ``B`` as a plain Python
        ``float``, or ``0.0`` when either vector has zero magnitude
        (including empty input), where the cosine is undefined.

    Raises:
        ValueError: If the vectors have different lengths (raised by
            ``np.dot``).
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    # Compute the norms inside NumPy instead of a Python-level sum/lambda
    # loop over every component.
    den = np.linalg.norm(a) * np.linalg.norm(b)
    if den == 0.0:
        return 0.0
    return float(np.dot(a, b) / den)
    


# Sample facts taken from the results of the repo
# 'gpt3_parse_wikipedia_extract_facts_jupyter_test01'.
# The name "Guillermo Cabrera Infante" was added to every fact so that each
# string identifies its subject on its own.
embedding_model = 'text-embedding-ada-002'
input_text = '''
Guillermo Cabrera Infante was born in 1941 in Gibara, Cuba,
Guillermo Cabrera Infante moved to Havana with his parents at the age of six,
Guillermo Cabrera Infante Cabrera Infante studied journalism at the University of Havana,
Guillermo Cabrera Infante he became the editor of the magazine Carteles in 1957,
Guillermo Cabrera Infante  divorced his first wife in 1961 and married his second wife, Miriam Gomez, an actress,
Guillermo Cabrera Infante he served as a cultural attaché in Brussels, Belgium, from 1962 to 1965,
Guillermo Cabrera Infante he went into exile in Madrid in 1965 and then London in 1966,
Guillermo Cabrera Infante  published Tres tristes tigres, a highly experimental, Joycean novel, in 1966,
Guillermo Cabrera Infante he co-wrote the script for Richard C. Sarafian's 1971 cult film Vanishing Point under the pseudonym Guillermo Caín,
Guillermo Cabrera Infante in 1997 he received the Premio Cervantes, presented to him by King Juan Carlos of Spain,
Guillermo Cabrera Infante died on February 21, 2005, in London, of sepsis,
Guillermo Cabrera Infante he is best known for his novel Tres tristes tigres, which was later republished as Ella Cantaba Boleros,
Guillermo Cabrera Infante also wrote a number of other novels, short stories, and essays,
Guillermo Cabrera Infante his work is considered to be highly experimental and avant-garde,
Guillermo Cabrera Infante was critical of the Castro regime and its treatment of writers and intellectuals,
Guillermo Cabrera Infante was awarded the Premio Cervantes, the highest honor given to a Spanish-language writer, in 1997
Guillermo Cabrera Infante died in London in 2005 at the age of 74
'''
# Collapse the facts onto a single line of text.
input_text = input_text.replace("\n", " ")

# One-row DataFrame holding the collapsed text in a 'Text' column.
df = pd.DataFrame({'Text': [input_text]})

# Read the API key from the environment instead of hard-coding it, so the
# secret never has to be pasted into the source. Falls back to the original
# empty string when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Get keywords representing the text.
# Since texts tend to be highly verbose, GPT is used to extract keywords and
# the embedding is computed from those keywords instead of the raw text.

prompt = "Only from the following text. text: " + input_text + " Extract  basic keywords from the previous text for internet search, only list the keywords"
response = openai.Completion.create(
    engine="text-davinci-002",
    prompt=prompt,
    temperature=0.0,
    max_tokens=1256,
    top_p=1.0,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)

# Turn newlines into spaces (not into nothing) so keywords returned one per
# line are not fused together; also drop periods and surrounding whitespace.
keyword_text = response['choices'][0]['text'].replace("\n", " ").replace(".", "").strip()
df.loc[0, 'keywords'] = keyword_text

# Embed the keywords. This is the embedding of the text, not of the query,
# hence the name. It is stored in the frame as its string form and parsed
# back into a list before the similarity is computed.
text_embedding = get_embedding(keyword_text, embedding_model)
df.loc[0, 'embeddings'] = str(text_embedding)

query = "Tell me a Latin American writer"
#query = "Tell me a Cuban writer"
#query = "Tell me a Chinese writer"


# Get keywords representing the query.
# Since queries tend to be highly verbose, GPT is used to extract keywords
# that can be embedded instead of the raw query.

# The space before "Extract" keeps the instruction from being glued onto the
# last word of the query.
prompt = "Only from the following text. text: " + query + " Extract  basic keywords from the previous text for internet search, only list the keywords"
response = openai.Completion.create(
    engine="text-davinci-002",
    prompt=prompt,
    temperature=0.0,
    max_tokens=1256,
    top_p=1.0,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)

# Turn newlines into spaces (not into nothing) so keywords returned one per
# line are not fused together; also drop periods and surrounding whitespace.
keyword_query = response['choices'][0]['text'].replace("\n", " ").replace(".", "").strip()

# Two alternative ways to embed the query; the active one embeds the raw
# query, so keyword_query is only used when the first line is enabled instead.
#query_embedding = get_embedding(keyword_query, embedding_model) #Find embeddings for the keyword
query_embedding = get_embedding(query, embedding_model)  #Find embeddings directly from the query

# Obtain the similarity of each row's embedding to the query embedding.
# The embeddings are stored in the frame as strings, so parse them back first.
# ast.literal_eval accepts only Python literals, whereas eval would execute
# any code that ended up in the column.
ada_embedding = df.embeddings.apply(ast.literal_eval).apply(np.array)
df['similarity'] = [
    cosine_similarity(query_embedding, row_emb) for row_emb in ada_embedding
]

# Bare expression so a notebook displays the resulting frame.
df
