In [1]:
import pandas as pd
import os
import tiktoken
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import faiss

In [2]:
load_dotenv() # take environment variables from .env. 
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 

In [3]:
def remove_newlines(series): 
    series = series.str.replace('\n', ' ') 
    series = series.str.replace('\\n', ' ') 
    series = series.str.replace('  ', ' ') 
    series = series.str.replace('  ', ' ') 
    return series

In [4]:
# Create a list to store the text files 
texts=[]

In [5]:
# Get all the text files in the text directory 
for file in os.listdir("../text/"): 
    # Open the file and read the text 
    with open("../text/" + file, "r", encoding="UTF-8") as f:
        text = f.read()
        # we replace the last 4 characters to get rid of .txt, and replace _ with / to generate the URLs we scraped 
        filename = file[:-4].replace('_', '/') 
        """ 
        There are a lot of contributor.txt files that got included in the scrape, this weeds them out. There are also a lot of auth required urls that have been scraped to weed out as well 
        """  
    if filename.endswith(".txt") or 'users/fxa/login' in filename: 
        continue

    
    
    # then we replace underscores with / to get the actual links so we can cite contributions 
    texts.append( 
      (filename, text))

In [6]:
# Create a dataframe from the list of texts 
df = pd.DataFrame(texts, columns=['fname', 'text']) 

In [7]:
df

Unnamed: 0,fname,text
0,collections/video-games?page/num=18,Video Games – Page 18 – Gamers Hideout\nLog in...
1,collections/video-games?page/num=24,Video Games – Page 24 – Gamers Hideout\nLog in...
2,collections/video-games?page/num=30,Video Games – Page 30 – Gamers Hideout\nLog in...
3,collections/-for-product-listing-changing-nint...,Nintendo Switch Games – Page 28 – Gamers Hideo...
4,collections/-for-product-listing-changing-nint...,Nintendo Switch Games – Page 14 – Gamers Hideo...
...,...,...
431,collections/-for-product-listing-changing-nint...,Nintendo Switch Games – Page 25 – Gamers Hideo...
432,collections/playstation-5,Playstation 5 – Gamers Hideout\nLog in\nSHOP A...
433,collections/-for-product-listing-changing-nint...,Nintendo Switch Games – Page 31 – Gamers Hideo...
434,collections/video-games?page/num=29,Video Games – Page 29 – Gamers Hideout\nLog in...


In [8]:
# Set the text column to be the raw text with the newlines removed 
df['text'] = df.fname + ". " + remove_newlines(df.text) 


In [9]:
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model 
tokenizer = tiktoken.get_encoding("cl100k_base")
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [10]:
# Tokenizing the CSV
chunk_size = 700  # Max number of tokens 

text_splitter = RecursiveCharacterTextSplitter( 
    # This could be replaced with a token counting function if needed 
    length_function = len,
    chunk_size = chunk_size,
    chunk_overlap  = 100,  # No overlap between chunks 
    add_start_index = False,  # We don't need start index in this case 
) 

shortened = [] 

In [11]:
for row in df.iterrows(): 
    # If the text is None, go to the next row 
    if row[1]['text'] is None: 
        continue 

  # If the number of tokens is greater than the max number of tokens, split the text into chunks 
    if row[1]['n_tokens'] > chunk_size: 
        # Split the text using LangChain's text splitter 
        chunks = text_splitter.create_documents([row[1]['text']]) 

        # Append the content of each chunk to the 'shortened' list 
        for chunk in chunks:
            shortened.append(chunk.page_content)

  # Otherwise, add the text to the list of shortened texts 
    else: 
        shortened.append(row[1]['text']) 

In [12]:
df = pd.DataFrame(shortened, columns=['text']) 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [13]:
df['embeddings'] = df.text.apply(lambda x: openai.embeddings.create(
    input=x, model='text-embedding-ada-002').data[0].embedding)

In [15]:
df.to_csv('embeddings.csv')

## Bringing to FAISS

In [16]:
import numpy as np
vectors = np.vstack(df['embeddings'].values).astype(np.float32)

In [17]:
# Create the FAISS index
d = vectors.shape[1]  # Dimensionality of the vectors
index = faiss.IndexFlatL2(d)

# Add vectors to the index
index.add(vectors)

In [18]:
id_to_text = {i: text for i, text in enumerate(df['text'])}
id_to_text

{0: 'collections/video-games?page/num=18. Video Games – Page 18 – Gamers Hideout Log in SHOP ALL MONTHLY SPECIAL NINTENDO Switch Consoles Switch Games Switch Accessories Nintendo Digital PLAYSTATION Playstation 5 Consoles & VR2 Playstation 5 Accessories PlayStation 5 Games Playstation 4 Consoles Playstation 4 Games Playstation 4 Accessories Playstation Digital XBOX Series X|S XBOX X|S Accessories XBOX Consoles XBOX Digital GAMING ACCESSORIES Case & Pouch Controllers Charger & Stand Skin & Screen Protector Fighting stick Memory Thumbgrip Racing Simulator TRADING CARD GAME Pokemon TCG One Piece TCG TCG Accessories PC Headsets Keyboards Mouse Mousepads Speakers Gaming Controller VIRTUAL REALITY VR',
 1: 'TCG Accessories PC Headsets Keyboards Mouse Mousepads Speakers Gaming Controller VIRTUAL REALITY VR Consoles VR Accessories DIGITAL CODE ☎\u2002016-2991681 Enter here and click search SHOP ALL MONTHLY SPECIAL NINTENDO Switch Consoles Switch Games Switch Accessories Nintendo Digital PLAYST

In [19]:
faiss.write_index(index, '../faiss_index.index')

In [20]:
import pickle 
with open('../id_to_text.pkl', 'wb') as f:
    pickle.dump(id_to_text, f)