In [1]:
import pandas as pd
import os
import tiktoken
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import faiss

In [2]:
load_dotenv() # take environment variables from .env. 
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 

In [3]:
def remove_newlines(series): 
    series = series.str.replace('\n', ' ') 
    series = series.str.replace('\\n', ' ') 
    series = series.str.replace('  ', ' ') 
    series = series.str.replace('  ', ' ') 
    return series

In [4]:
# Create a list to store the text files 
texts=[]

In [6]:
def read_file(file_path):
    encodings = ['utf-8', 'iso-8859-1', 'windows-1252']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise UnicodeDecodeError(f"Unable to decode the file {file_path} with the attempted encodings")

texts = []

# Get all the text files in the text directory 
for file in os.listdir("../text/"): 
    try:
        # Open the file and read the text 
        text = read_file("../text/" + file)
        
        # we replace the last 4 characters to get rid of .txt, and replace _ with / to generate the URLs we scraped 
        filename = file[:-4].replace('_', '/') 
        
        if filename.endswith(".txt") or 'users/fxa/login' in filename: 
            continue
        
        # then we replace underscores with / to get the actual links so we can cite contributions 
        texts.append((filename, text))
    
    except UnicodeDecodeError as e:
        print(f"Error reading file {file}: {str(e)}")
        continue

In [7]:
# Create a dataframe from the list of texts 
df = pd.DataFrame(texts, columns=['fname', 'text']) 

In [8]:
df

Unnamed: 0,fname,text
0,collections/video-games?page/num=18,Video Games\nVideo Games\nFilter\nFilter\nAvai...
1,collections/video-games?page/num=24,Video Games\nVideo Games\nFilter\nFilter\nAvai...
2,collections/video-games?page/num=30,Video Games\nVideo Games\nFilter\nFilter\nAvai...
3,collections/-for-product-listing-changing-nint...,Nintendo Switch Games\nNintendo Switch Games\n...
4,collections/-for-product-listing-changing-nint...,Nintendo Switch Games\nNintendo Switch Games\n...
...,...,...
432,collections/-for-product-listing-changing-nint...,Nintendo Switch Games\nNintendo Switch Games\n...
433,collections/playstation-5,Playstation 5\nPlaystation 5\nFilter\nFilter\n...
434,collections/-for-product-listing-changing-nint...,Nintendo Switch Games\nNintendo Switch Games\n...
435,collections/video-games?page/num=29,Video Games\nVideo Games\nFilter\nFilter\nAvai...


In [9]:
# Set the text column to be the raw text with the newlines removed 
df['text'] = df.fname + ". " + remove_newlines(df.text) 

In [10]:
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model 
tokenizer = tiktoken.get_encoding("cl100k_base")
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [12]:
# Tokenizing the CSV
chunk_size = 1000  # Max number of tokens 

text_splitter = RecursiveCharacterTextSplitter( 
    # This could be replaced with a token counting function if needed 
    length_function = len,
    chunk_size = chunk_size,
    chunk_overlap  = 100,  # No overlap between chunks 
    add_start_index = False,  # We don't need start index in this case 
) 

shortened = [] 

In [13]:
for row in df.iterrows(): 
    # If the text is None, go to the next row 
    if row[1]['text'] is None: 
        continue 

  # If the number of tokens is greater than the max number of tokens, split the text into chunks 
    if row[1]['n_tokens'] > chunk_size: 
        # Split the text using LangChain's text splitter 
        chunks = text_splitter.create_documents([row[1]['text']]) 

        # Append the content of each chunk to the 'shortened' list 
        for chunk in chunks:
            shortened.append(chunk.page_content)

  # Otherwise, add the text to the list of shortened texts 
    else: 
        shortened.append(row[1]['text']) 

In [14]:
df = pd.DataFrame(shortened, columns=['text']) 
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

In [15]:
df['embeddings'] = df.text.apply(lambda x: openai.embeddings.create(
    input=x, model='text-embedding-ada-002').data[0].embedding)

## Bringing to FAISS

In [16]:
import numpy as np
vectors = np.vstack(df['embeddings'].values).astype(np.float32)

In [17]:
# Create the FAISS index
d = vectors.shape[1]  # Dimensionality of the vectors
index = faiss.IndexFlatL2(d)

# Add vectors to the index
index.add(vectors)

In [18]:
id_to_text = {i: text for i, text in enumerate(df['text'])}
id_to_text

{0: 'collections/video-games?page/num=18. Video Games Video Games Filter Filter Availability In stock only Price Maximum price is RM998.90 RM RM 1536 items Clear Confirm Sort Sort Recommended Top sellers New arrivals Price low to high Price high to low Recommended Recommended Top sellers New arrivals Price low to high Price high to low 1536 items Clear all 1536 items Mon-Yu - Nintendo Switch Sold out Mon-Yu - Nintendo Switch From RM169.90 RM228.90 Pikmin 1+2 Bundle - Nintendo Switch Sold out Pikmin 1+2 Bundle - Nintendo Switch From RM139.90 RM160.00 Virche Evermore -Error: Salvation - Nintendo Switch Virche Evermore -Error: Salvation - Nintendo Switch From RM179.90 Stardew Valley - Nintendo Switch SAVE MYR\xa070.00 Stardew Valley - Nintendo Switch From RM99.90 Fate/Samurai Remnant - Nintendo Switch Sold out Fate/Samurai Remnant - Nintendo Switch From RM149.90 RM228.90 EA Sports FC 24 - Nintendo Switch EA Sports FC 24 - Nintendo Switch From RM139.90 Jujutsu Kaisen: Cursed Clash - Ninten

In [19]:
faiss.write_index(index, '../faiss_index.index')

In [20]:
import pickle 
with open('../id_to_text.pkl', 'wb') as f:
    pickle.dump(id_to_text, f)