In [5]:
!poetry add pandas tiktoken langchain openai  

The following packages are already present in the pyproject.toml and will be skipped:

  • [36mpandas[39m
  • [36mtiktoken[39m
  • [36mlangchain[39m
  • [36mopenai[39m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [60]:
import os

import pandas as pd
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

pd.set_option("display.max.columns", None)

openai = OpenAI(api_key="your_api_key")

DOMAIN = "developer.mozilla.org"

def remove_newlines(series):
    series = series.str.replace("\n", " ")
    series = series.str.replace("\\n", " ")
    series = series.str.replace("  ", " ")
    series = series.str.replace("  ", " ")
    return series

In [61]:
texts = []

# Get all the text files in the text directory
for file in os.listdir("text/" + DOMAIN + "/"):

    # Open the file and read the text
    with open("text/" + DOMAIN + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()
        filename = file[:-4].replace("_", "/")
        """
        There are a lot of contributor.txt files that got included in the scrape, this weeds them out. There are also a lot of auth required urls that have been scraped to weed out as well
        """
        if filename.endswith(".txt") or "users/fxa/login" in filename:
            continue

        # then we replace underscores with / to get the actual links so we can cite contributions
        texts.append((filename, text))


df = pd.DataFrame(texts, columns=["fname", "text"])
display(df)

Unnamed: 0,fname,text
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,HTML attribute reference - HTML: HyperText Mar...
1,developer.mozilla.org/en-US/docs/Web/JavaScrip...,Expressions and operators - JavaScript | MDNSk...
2,developer.mozilla.org/en-US/docs/Web/JavaScrip...,Lexical grammar - JavaScript | MDNSkip to main...
3,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,HTMLElement: dragover event - Web APIs | MDNSk...
4,developer.mozilla.org/en-US/docs/Web/Security/...,Referer header: privacy and security concerns ...
...,...,...
263,developer.mozilla.org/en-US/docs/Web/API/Docum...,DocumentFragment: getElementById() method - We...
264,developer.mozilla.org/en-US/docs/Web/API/Docum...,DocumentType: name property - Web APIs | MDNSk...
265,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,HTMLTableElement: createTBody() method - Web A...
266,developer.mozilla.org/en-US/docs/Web/JavaScrip...,let - JavaScript | MDNSkip to main contentSkip...


Great, now we have the file names, and all of the text for that file. However when we do our chunking and embedding, we want to have the filename in the actual text so that we can cite where we got the text from.

In [62]:
df["text"] = df.fname + ". " + remove_newlines(df.text)
display(df)

Unnamed: 0,fname,text
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,developer.mozilla.org/en-US/docs/Web/HTML/Attr...
1,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...
2,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...
3,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,developer.mozilla.org/en-US/docs/Web/API/HTMLE...
4,developer.mozilla.org/en-US/docs/Web/Security/...,developer.mozilla.org/en-US/docs/Web/Security/...
...,...,...
263,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...
264,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...
265,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,developer.mozilla.org/en-US/docs/Web/API/HTMLT...
266,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...


We can now go through and tokenize all of our text, and save the nubmer of tokens to a new column

In [63]:
tokenizer = tiktoken.get_encoding("cl100k_base")
df["n_tokens"] = df.text.apply(lambda x: len(tokenizer.encode(x)))
display(df)

Unnamed: 0,fname,text,n_tokens
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,5361
1,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,3708
2,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,8240
3,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,1132
4,developer.mozilla.org/en-US/docs/Web/Security/...,developer.mozilla.org/en-US/docs/Web/Security/...,1245
...,...,...,...
263,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...,1308
264,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...,615
265,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,920
266,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,3838


Now we will want to go through and chunk everything that has more than 1000 tokens

In [64]:
chunk_size = 1000  # Max number of tokens

text_splitter = RecursiveCharacterTextSplitter(
    # This could be replaced with a token counting function if needed
    length_function=len,
    chunk_size=chunk_size,
    chunk_overlap=0,  # No overlap between chunks, you can play with this if you want to have overlapping context
    add_start_index=False,  # We don't need start index in this case
)

shortened = []

for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]["text"] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]["n_tokens"] > chunk_size:
        # Split the text using LangChain's text splitter
        chunks = text_splitter.create_documents([row[1]["text"]])
        # Append the content of each chunk to the 'shortened' list
        for chunk in chunks:
            shortened.append(row[1]["fname"] + chunk.page_content)

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(row[1]["fname"] + row[1]["text"])

Now, we have chunked everything, so item should have less than 1000 tokens, as well as the file name at the beginning (for citation purposes)

We will load that into a new dataframe with just these new array, and then run the tokenizer on it again

In [67]:
df = pd.DataFrame(shortened, columns=["text"])
df["n_tokens"] = df.text.apply(lambda x: len(tokenizer.encode(x)))
display(df)

Unnamed: 0,text,n_tokens
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,186
1,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,182
2,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,277
3,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,235
4,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,202
...,...,...
2132,developer.mozilla.org/en-US/docs/Web/JavaScrip...,257
2133,developer.mozilla.org/en-US/docs/Web/JavaScrip...,263
2134,developer.mozilla.org/en-US/docs/Web/JavaScrip...,264
2135,developer.mozilla.org/en-US/docs/Web/JavaScrip...,176


Now with all of these smaller chunks, we can embed them, and add that as a column to the new dataframe. After that, we will write it all to a CSV so we can load it into our application easily.

In [70]:
df = df[1:10]
new_df = df.iloc[:10]


new_df["embeddings"] = df.text.apply(
    lambda x: openai.embeddings.create(input=x, model="text-embedding-ada-002")
    .data[0]
    .embedding
)
display(new_df)

df.to_csv("processed/embeddings.csv")

Unnamed: 0,text,n_tokens,embeddings
3,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,235,"[-0.00908578559756279, 0.014887887053191662, 0..."
4,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,202,"[-0.006047440692782402, 0.021770786494016647, ..."
5,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,243,"[-0.01809210330247879, 0.02606789954006672, 0...."
6,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,243,"[-0.02236572839319706, 0.03484515845775604, 0...."
7,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,226,"[-0.0037865217309445143, 0.030482348054647446,..."
8,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,228,"[-0.014814070425927639, 0.036394719034433365, ..."
9,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,245,"[-0.02875336818397045, 0.03071197122335434, 0...."
