In [1]:
import json
import openai
import os
import tiktoken
import numpy as np



In [2]:
def load_html_data(fp):
  """Read the JSON file at ``fp`` and return the value under its "data" key.

  Args:
    fp: Path of the JSON file to read.

  Returns:
    Whatever the parsed document stores under the top-level "data" key.

  Raises:
    KeyError: If the parsed document has no "data" key.
  """
  with open(fp) as source:
    payload = json.load(source)
  return payload["data"]

In [3]:
# Take the API key from the environment so it is never written into the notebook.
# The value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
def get_embedding(text):
  """Request an embedding for ``text`` and return it.

  Calls ``openai.Embedding.create`` with the "text-embedding-ada-002" model
  and returns the "embedding" field of the first item in the response's
  "data" list.

  Args:
    text: The input passed to the embeddings endpoint.
  """
  request = {
      "input": text,
      "model": "text-embedding-ada-002",
      "encoding_format": "float",
  }
  result = openai.Embedding.create(**request)

  first_item = result["data"][0]
  return first_item["embedding"]

In [5]:
def get_embedding_data(fp="../data/html-data.json", max_tokens=8190):
  """Embed every question/answer record found in the FAQ JSON file.

  Each record's "question" and "answer" strings are concatenated, tokenized
  with the "cl100k_base" encoding, cut to at most ``max_tokens`` tokens,
  decoded back to text and passed to ``get_embedding``.

  Args:
    fp: Path of the JSON file read via ``load_html_data``. Defaults to the
      path that was previously hard-coded here.
    max_tokens: Maximum number of tokens kept per record before embedding.
      Defaults to the previously hard-coded limit of 8190.

  Returns:
    A list of dicts, one per record, each with the keys "embedding" (the
    value returned by ``get_embedding``) and "data" (the original record).
  """
  encoding = tiktoken.get_encoding("cl100k_base")
  embedding_data = []

  for data_obj in load_html_data(fp):
    tokens = encoding.encode(data_obj["question"] + data_obj["answer"])
    # Slicing leaves the list unchanged when it is already within the limit,
    # so no explicit length check is needed.
    text = encoding.decode(tokens[:max_tokens])

    embedding_data.append({
      "embedding": get_embedding(text),
      "data": data_obj
    })

  return embedding_data

In [6]:
def save_embedding_data(fp, embedding_data):
  """Write ``embedding_data`` to ``fp`` as JSON.

  The data is wrapped in an object under the "embedding-data" key. Any
  existing file at ``fp`` is overwritten.

  Args:
    fp: Destination path of the JSON file.
    embedding_data: JSON-serializable embedding records to store.
  """
  # Write to the caller-supplied path; the previous version ignored `fp`
  # and always wrote to a fixed location.
  with open(fp, "w") as embedding_file:
    json.dump({
      "embedding-data": embedding_data
    }, embedding_file)

In [7]:
def load_embedding_data(fp):
  """Parse the JSON file at ``fp`` and return the decoded document.

  Args:
    fp: Path of the JSON file to read.

  Returns:
    The full parsed JSON content, unmodified.
  """
  with open(fp) as source:
    return json.load(source)

In [10]:
def get_similar_faq(question, embedding_data):
  """Find the stored entry whose embedding best matches ``question``.

  The question is embedded with ``get_embedding`` and scored against each
  entry by the dot product of the two vectors. Only scores strictly greater
  than the running best (which starts at 0) replace it, so the first entry
  wins on ties.

  Args:
    question: Text to embed and compare.
    embedding_data: Iterable of dicts with "embedding" and "data" keys.

  Returns:
    A dict with "similarity" (the best score, or 0 if no entry scored above
    zero) and "data" (the matching entry's "data" value, or None).
  """
  query_vector = get_embedding(question)

  best_score = 0
  best_entry = None
  for candidate in embedding_data:
    score = np.dot(candidate["embedding"], query_vector)
    if not score > best_score:
      continue
    best_score, best_entry = score, candidate["data"]

  return {"similarity": best_score, "data": best_entry}

In [43]:
# Load the saved embeddings file and keep the value stored under "embedding-data".
ed = load_embedding_data("../data/embedding-data-with-answer.json")["embedding-data"]

In [141]:
def get_summarized_answer(question, embedding_data, min_similarity=0.75):
  """Answer ``question`` using the closest FAQ entry, summarized by a chat model.

  The best-matching entry is looked up with ``get_similar_faq``. If its
  similarity is below ``min_similarity`` (or nothing matched at all), a fixed
  apology string is returned and no chat request is made. Otherwise the
  entry's "answer" text is given to "gpt-3.5-turbo" as the only data it may
  use, and the model's reply is returned.

  Args:
    question: The user's question.
    embedding_data: Embedding records passed through to ``get_similar_faq``.
    min_similarity: Minimum similarity score required to attempt an answer.
      Defaults to the previously hard-coded 0.75.

  Returns:
    The content of the first chat completion choice, or the apology string.
  """
  match = get_similar_faq(question, embedding_data)

  # get_similar_faq returns data=None when no entry scored above zero; check
  # it explicitly so a low `min_similarity` cannot lead to indexing None.
  if match["data"] is None or match["similarity"] < min_similarity:
    return "I'm sorry, I don't quite understand the question"

  faq_data = match["data"]["answer"]

  chat_response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {
        "role": "system",
        "content": f"you are a chatbot for giving users a concise yet accurate answer. use only this data to answer the users question. you should respond it at most 3 sentences. the data is: {faq_data}"
      },
      {
        "role": "user",
        "content": f"{question}"
      }
    ]
  )

  return chat_response["choices"][0]["message"]["content"]

In [146]:
# Try the pipeline on a sample question against the loaded embeddings.
chat_response = get_summarized_answer("give me some examples of network etiquette", ed)

In [147]:
# Show the generated answer as plain text.
print(chat_response)

Examples of network etiquette include:
1. Respecting others' time and resources by not sending chain mail or irrelevant material to a widespread audience.
2. Avoiding spamming or sending irrelevant emails to mailing lists, as it wastes computer and network resources.
3. Posting articles or messages that are relevant to the topic of a newsgroup to avoid causing confusion and unnecessary reactions.
