# Generate Embeddings

🎯 Clean text data, generate embeddings and store them in Chroma DB

In [57]:
# Show the path of the Python interpreter backing this kernel
import sys
sys.executable

'/Users/panos/Projects/my-ventures/learn-openai/.venv/bin/python'

In [3]:
import os 
import re
from tqdm import tqdm

## Clean files

In [None]:
# Clean Data
def clean_webpage_data(file_path, encoding="utf-8"):
    """Strip website boilerplate from every ``.txt`` file in a folder, in place.

    For each file the text is whitespace-normalised, trimmed to the region that
    starts at the "OTHER PROJECTS" marker and ends just before the "LATEST NEWS"
    marker (each only when present), and then cleared of the logo / header /
    menu / footer phrases listed below. The result overwrites the original file.

    Args:
        file_path: Directory containing the ``.txt`` files to clean. Entries
            whose name does not end in ".txt" are ignored.
        encoding: Encoding used to read and write the files. Defaults to
            UTF-8 so the Greek text does not depend on the platform default.

    Returns:
        None
    """

    # Common words to drop
    logo = ["A.C. LASKARIDIS CHARITABLE FOUNDATION", "ΚΟΙΝΩΦΕΛΕΣ ΙΔΡΥΜΑ ΑΘΑΝΑΣΙΟΣ Κ. ΛΑΣΚΑΡΙΔΗΣ"]

    header = ["THE FOUNDATION", "ΤΟ ΙΔΡΥΜΑ",
              "SEA", "ΘΑΛΑΣΣΑ",
              "ENTREPRENEURSHIP", "ΕΠΙΧΕΙΡΗΜΑΤΙΚΟΤΗΤΑ",
              "EDUCATION", "ΕΚΠΑΙΔΕΥΣΗ",
              "EVENTS", "ΕΚΔΗΛΩΣΕΙΣ",
              "COLLABORATIONS", "ΣΥΝΕΡΓΑΣΙΕΣ",
              "OTHER PROJECTS", "ΑΛΛΑ ΠΡΟΓΡΑΜΜΑΤΑ"]

    # Longer phrases come before the shorter ones they contain; otherwise
    # removing "NEWS" first would turn "LATEST NEWS" into a stray "LATEST ".
    menu = ["OUR TEAM", "Η ΟΜΑΔΑ ΜΑΣ",
            "SUPPORTERS", "ΥΠΟΣΤΗΡΙΚΤΕΣ",
            "LATEST NEWS", "ΤΕΛΕΥΤΑΙΑ ΝΕΑ",
            "NEWS", "ΝΕΑ"]

    footer = ["CONTACT", "ΕΠΙΚΟΙΝΩΝΙΑ",
              "EVENTS", "ΕΚΔΗΛΩΣΕΙΣ",
              "AWARDS", "ΒΡΑΒΕΙΑ", "2020 aclcfWebsite by Cloudevo"]

    for file_name in tqdm(os.listdir(file_path)):
        if not file_name.endswith(".txt"):
            continue

        file = os.path.join(file_path, file_name)
        with open(file, "r", encoding=encoding) as f:
            text = f.read()

        # Collapse whitespace first so multi-word markers and phrases still
        # match when the raw text breaks them across lines.
        text = re.sub(r'\s+', ' ', text).strip()

        # Trim to the content region BEFORE dropping words: both markers are
        # also in the drop lists, so trimming afterwards could not find them.
        # NOTE: both helpers cut at the FIRST occurrence of their marker.
        text = drop_before(text, "OTHER PROJECTS")  # drop everything before OTHER PROJECTS
        text = drop_after(text, "LATEST NEWS")      # drop everything after LATEST NEWS

        # Drop words
        for phrases in (logo, header, menu, footer):
            text = drop_words(text, phrases)

        # Removing phrases leaves gaps behind; collapse them again
        text = re.sub(r'\s+', ' ', text).strip()

        with open(file, "w", encoding=encoding) as f:
            f.write(text)

    return None

In [None]:
def drop_words(text, words):
    """Remove every occurrence of each phrase in ``words`` from ``text``.

    Phrases are removed longest-first so that a short phrase which is part of
    a longer one (e.g. "NEWS" inside "LATEST NEWS") cannot break the longer
    phrase before it has been matched. Matching is plain, case-sensitive
    substring replacement.

    Args:
        text: The string to clean.
        words: Iterable of phrases to delete from ``text``.

    Returns:
        ``text`` with all listed phrases removed; surrounding whitespace is
        left untouched.
    """
    # sorted() is stable, so phrases of equal length keep the caller's order
    for word in sorted(words, key=len, reverse=True):
        text = text.replace(word, "")

    return text

In [None]:
def drop_before(text, keyword):
    """Return ``text`` starting at the first occurrence of ``keyword``.

    The keyword itself is kept. When ``keyword`` does not occur, ``text`` is
    returned unchanged.
    """
    if keyword in text:
        return text[text.index(keyword):]
    return text

In [None]:
def drop_after(text, keyword):
    """Return the part of ``text`` that precedes the first ``keyword``.

    The keyword and everything following it are discarded. When ``keyword``
    does not occur, ``text`` is returned unchanged.
    """
    if keyword in text:
        return text[:text.index(keyword)]
    return text

In [None]:
# Provide the path to the folder containing the text files.
# NOTE: clean_webpage_data writes the cleaned text back over each .txt file.
folder_path = "../data/"
clean_webpage_data(folder_path)

## Generate Embeddings

In [52]:
# Load all txt files from a directory
# TextLoader must be imported as well: it is passed as loader_cls below and
# was previously undefined on a fresh kernel (NameError).
from langchain.document_loaders import DirectoryLoader, TextLoader
loader = DirectoryLoader('../data/', glob = "./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [53]:
# Split the loaded documents into chunks (chunk_size=500, no overlap)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
print(f'There are {len(texts)} documents')

There are 497 documents


In [54]:
## Step 2: Load the API key and initialize the embeddings engine
from dotenv import dotenv_values
from langchain.embeddings.openai import OpenAIEmbeddings

# The key lives in the local .env file one level up, not in the notebook
secrets = dotenv_values("../.env")
OPENAI_API_KEY = secrets['OPENAI_API_KEY']

# Embeddings client authenticated with the key read above
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [55]:
## Step 3: Initialize and persist Chroma DB with embeddings
from langchain.vectorstores import Chroma

# Supplying a persist_directory will store the embeddings on disk
persist_directory = '../db'

# Index the split chunks (`texts`), not the whole files (`documents`);
# otherwise the splitting step above has no effect on what gets embedded.
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
vectordb.persist()
# Drop the reference once the store has been persisted
vectordb = None

Using embedded DuckDB with persistence: data will be stored in: ../db
