In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [2]:
import pandas as pd
import os
import tiktoken
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

DOMAIN = "developer.mozilla.org"

def remove_newlines(series):
  series = series.str.replace('\n', ' ')
  series = series.str.replace('\\n', ' ')
  series = series.str.replace('  ', ' ')
  series = series.str.replace('  ', ' ')
  return series



In [3]:
# Create a list to store the text files
texts=[]

# Get all the text files in the text directory
for file in os.listdir("../data/paul/HowtoDoGreatWork.html" + DOMAIN + "/"):

  # Open the file and read the text
  with open("text/" + DOMAIN + "/" + file, "r", encoding="UTF-8") as f:
    text = f.read()
    # we replace the last 4 characters to get rid of .txt, and replace _ with / to generate the URLs we scraped
    filename = file[:-4].replace('_', '/')
    """
    There are a lot of contributor.txt files that got included in the scrape, this weeds them out. There are also a lot of auth required urls that have been scraped to weed out as well
    """ 
    if filename.endswith(".txt") or 'users/fxa/login' in filename:
      continue

    # then we replace underscores with / to get the actual links so we can cite contributions
    texts.append(
      (filename, text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/pauldeveloper.mozilla.org/'

In [None]:
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [None]:
chunk_size = 1000  # Max number of tokens

text_splitter = RecursiveCharacterTextSplitter(
        # This could be replaced with a token counting function if needed
    length_function = len,  
    chunk_size = chunk_size,
    chunk_overlap  = 0,  # No overlap between chunks
    add_start_index = False,  # We don't need start index in this case
)

shortened = []

for row in df.iterrows():

  # If the text is None, go to the next row
  if row[1]['text'] is None:
    continue

  # If the number of tokens is greater than the max number of tokens, split the text into chunks
  if row[1]['n_tokens'] > chunk_size:
    # Split the text using LangChain's text splitter
    chunks = text_splitter.create_documents([row[1]['text']])
    # Append the content of each chunk to the 'shortened' list
    for chunk in chunks:
      shortened.append(chunk.page_content)

  # Otherwise, add the text to the list of shortened texts
  else:
    shortened.append(row[1]['text'])

df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [None]:
df['embeddings'] = df.text.apply(lambda x: openai.embeddings.create(
    input=x, model='text-embedding-ada-002').data[0].embedding)

df.to_csv('processed/embeddings.csv')

In [None]:
# questions.py
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import List
from scipy import spatial
import os

def distances_from_embeddings(
  query_embedding: List[float],
  embeddings: List[List[float]],
  distance_metric="cosine",
) -> List[List]:
  """Return the distances between a query embedding and a list of embeddings."""
  distance_metrics = {
      "cosine": spatial.distance.cosine,
      "L1": spatial.distance.cityblock,
      "L2": spatial.distance.euclidean,
      "Linf": spatial.distance.chebyshev,
  }
  distances = [
      distance_metrics[distance_metric](query_embedding, embedding)
      for embedding in embeddings
  ]
  return distances


openai = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

In [None]:
def create_context(question, df, max_len=1800):
  """
    Create a context for a question by finding the most similar context from the dataframe
    """
  # Get the embeddings for the question
  q_embeddings = openai.embeddings.create(
      input=question, model='text-embedding-ada-002').data[0].embedding

  # Get the distances from the embeddings
  df['distances'] = distances_from_embeddings(q_embeddings,
                                              df['embeddings'].values,
                                              distance_metric='cosine')

  returns = []
  cur_len = 0

  # Sort by distance and add the text to the context until the context is too long
  for i, row in df.sort_values('distances', ascending=True).iterrows():
    # Add the length of the text to the current length
    cur_len += row['n_tokens'] + 4

    # If the context is too long, break
    if cur_len > max_len:
      break

    # Else add it to the text that is being returned
    returns.append(row["text"])

  # Return the context
  return "\n\n###\n\n".join(returns)

In [None]:
def answer_question(df,
                    model="gpt-3.5-turbo-1106",
                    question="What is the meaning of life?",
                    max_len=1800,
                    debug=False,
                    max_tokens=150,
                    stop_sequence=None):
  """
    Answer a question based on the most similar context from the dataframe texts
    """
  context = create_context(
      question,
      df,
      max_len=max_len,
  )
  # If debug, print the raw model response
  if debug:
    print("Context:\n" + context)
    print("\n\n")

  try:
    # Create a completions using the question and context
    response = openai.chat.completions.create(
        model=model,
        messages=[{
            "role":
            "user",
            "content":
            f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know.\" Try to site sources to the links in the context when possible.\n\nContext: {context}\n\n---\n\nQuestion: {question}\nSource:\nAnswer:",
        }],
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=stop_sequence,
    )
    return response.choices[0].message.content
  except Exception as e:
    print(e)
    return ""

# Telegram bot main

Congrats! You’ve successfully generated your own embeds, and created a way to ask questions about it. You could use this to generate summaries, or ask questions on just about any documents now. This is the process that a lot of companies that have “GPT Powered Docs” are doing under the hood.

In [None]:
import pandas as pd
import numpy as np
from questions import answer_question

df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

In [None]:
async def mozilla(update: Update, context: ContextTypes.DEFAULT_TYPE):
      answer = answer_question(df, question=update.message.text, debug=True)
      await context.bot.send_message(chat_id=update.effective_chat.id, text=answer)

# Own implementation

Aim: generate embeds for a given text and ask questions about it.

## Embeds
tokenize with tiktoken and save the number of tokens

In [None]:
from langchain_community.document_loaders import UnstructuredHTMLLoader