In [1]:
import pandas as pd
import os
import tiktoken
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

In [2]:
load_dotenv() # take environment variables from .env. 
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
DOMAIN = "developer.mozilla.org" 

In [3]:
def remove_newlines(series): 
    series = series.str.replace('\n', ' ') 
    series = series.str.replace('\\n', ' ') 
    series = series.str.replace('  ', ' ') 
    series = series.str.replace('  ', ' ') 
    return series

In [4]:
# Create a list to store the text files 
texts=[]

In [7]:
# Get all the text files in the text directory 
for file in os.listdir("../text/" + DOMAIN + "/"): 
    # Open the file and read the text 
    with open("../text/" + DOMAIN + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()
        # we replace the last 4 characters to get rid of .txt, and replace _ with / to generate the URLs we scraped 
        filename = file[:-4].replace('_', '/') 
        """ 
        There are a lot of contributor.txt files that got included in the scrape, this weeds them out. There are also a lot of auth required urls that have been scraped to weed out as well 
        """  
    if filename.endswith(".txt") or 'users/fxa/login' in filename: 
        continue 

    # then we replace underscores with / to get the actual links so we can cite contributions 
    texts.append( 
      (filename, text))

In [8]:
# Create a dataframe from the list of texts 
df = pd.DataFrame(texts, columns=['fname', 'text']) 

In [9]:
df

Unnamed: 0,fname,text
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,HTML attribute reference - HTML: HyperText Mar...
1,developer.mozilla.org/en-US/docs/Web/JavaScrip...,Expressions and operators - JavaScript | MDNSk...
2,developer.mozilla.org/en-US/docs/Web/JavaScrip...,Lexical grammar - JavaScript | MDNSkip to main...
3,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,HTMLElement: dragover event - Web APIs | MDNSk...
4,developer.mozilla.org/en-US/docs/Web/Security/...,Referer header: privacy and security concerns ...
...,...,...
263,developer.mozilla.org/en-US/docs/Web/API/Docum...,DocumentFragment: getElementById() method - We...
264,developer.mozilla.org/en-US/docs/Web/API/Docum...,DocumentType: name property - Web APIs | MDNSk...
265,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,HTMLTableElement: createTBody() method - Web A...
266,developer.mozilla.org/en-US/docs/Web/JavaScrip...,let - JavaScript | MDNSkip to main contentSkip...


In [11]:
# Set the text column to be the raw text with the newlines removed 
df['text'] = df.fname + ". " + remove_newlines(df.text) 
df.to_csv('../processed/scraped.csv')

In [15]:
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model 
tokenizer = tiktoken.get_encoding("cl100k_base")
df = pd.read_csv('../processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [16]:
df

Unnamed: 0,title,text,n_tokens
0,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,developer.mozilla.org/en-US/docs/Web/HTML/Attr...,5065
1,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,3714
2,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,8226
3,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,developer.mozilla.org/en-US/docs/Web/API/HTMLE...,1141
4,developer.mozilla.org/en-US/docs/Web/Security/...,developer.mozilla.org/en-US/docs/Web/Security/...,1267
...,...,...,...
263,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...,1315
264,developer.mozilla.org/en-US/docs/Web/API/Docum...,developer.mozilla.org/en-US/docs/Web/API/Docum...,628
265,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,developer.mozilla.org/en-US/docs/Web/API/HTMLT...,932
266,developer.mozilla.org/en-US/docs/Web/JavaScrip...,developer.mozilla.org/en-US/docs/Web/JavaScrip...,3837


In [17]:
# Tokenizing the CSV
chunk_size = 1000  # Max number of tokens 

text_splitter = RecursiveCharacterTextSplitter( 
    # This could be replaced with a token counting function if needed 
    length_function = len,
    chunk_size = chunk_size,
    chunk_overlap  = 0,  # No overlap between chunks 
    add_start_index = False,  # We don't need start index in this case 
) 

shortened = [] 

In [18]:
for row in df.iterrows(): 
    # If the text is None, go to the next row 
    if row[1]['text'] is None: 
        continue 

  # If the number of tokens is greater than the max number of tokens, split the text into chunks 
    if row[1]['n_tokens'] > chunk_size: 
        # Split the text using LangChain's text splitter 
        chunks = text_splitter.create_documents([row[1]['text']]) 

        # Append the content of each chunk to the 'shortened' list 
        for chunk in chunks:
            shortened.append(chunk.page_content)

  # Otherwise, add the text to the list of shortened texts 
    else: 
        shortened.append(row[1]['text']) 

In [19]:
df = pd.DataFrame(shortened, columns=['text']) 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [20]:
df['embeddings'] = df.text.apply(lambda x: openai.embeddings.create(
    input=x, model='text-embedding-ada-002').data[0].embedding)

In [28]:
df.to_csv('../processed/embeddings.csv')