Installation of packages

Import of packages

In [1]:
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import itertools
import re
from concurrent.futures import ThreadPoolExecutor
import yaml
import pickle
import random
import numpy as np
import time

# for synthetic testset
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

OPENAI_KEY = os.getenv("OPENAI_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_KEY

Reading yaml configuration

In [14]:
# Read YAML configuration
def read_config(filename):
    with open(filename, 'r') as file:
        config = yaml.safe_load(file)
    return config

config = read_config("config.yaml")

In [15]:
def search_pdf_files(directory):
    # List to store found files
    found_files = []

    # Walk through the directory and its subdirectories
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                # save path to list
                found_files.append(file_path)

    return found_files

def count_files_in_folders(directory):
    # Iterate over each directory in the given directory
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        # Check if the current item is a directory
        if os.path.isdir(folder_path):
            # Count the number of files in the directory
            num_files = len([filename for filename in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, filename))])
            print(f"Folder '{foldername}' contains {num_files} file(s).")

# Specify the directory path
directory_path = r"c:\BUPL/pdffiles/" 

# Search for PDF files containing a keyword
found_files = search_pdf_files(directory_path)

print(f'Number of pdf\'s found: {len(found_files)}')


count_files_in_folders(directory_path)

Number of pdf's found: 82


In [16]:
print(found_files) # Finanskalender --- Forløb af generalforsamling --- Indkaldelse til generalforsamling --- Intern viden --- Selskabsvægter --- , Storaktionærmeddelser --- Årsrapport

['c:\\BUPL/pdffiles/05.86 O.13 Rammeaftale om medindflydelse og medbestemmelse - med underskrifter.pdf', 'c:\\BUPL/pdffiles/21-0287-232-med-h-ndbog-til-nettet-rltn-teknisk-opdateret-28-11-2022.pdf', 'c:\\BUPL/pdffiles/22-0195-28-med-h-ndbog-2021-som-webfil-endelig.pdf', 'c:\\BUPL/pdffiles/Aftale om arbejdstidsregler for pædagogisk personale.pdf', 'c:\\BUPL/pdffiles/Aftale om fravær af familiemæssige årsager pr. 1. april 2021.pdf', 'c:\\BUPL/pdffiles/Aftale om TR, samarbejde og samarbejdsudvalg.pdf', 'c:\\BUPL/pdffiles/AMR_folder.indd.pdf', 'c:\\BUPL/pdffiles/April 2021 Introduktion til valg af TR på en privat arbejdsplads.pdf', 'c:\\BUPL/pdffiles/Arbejdsmiljørepræsentantens opgaver_BUPL.pdf', 'c:\\BUPL/pdffiles/Barsel _ BUPL.pdf', 'c:\\BUPL/pdffiles/Beskyttelse af tillidsrepræsentanten _ BUPL.pdf', 'c:\\BUPL/pdffiles/Bilag til Eksempler og tekster, Tjekliste til MED og TR på skoleniveau.pdf', 'c:\\BUPL/pdffiles/Bliv arbejdsmiljørepræsentant (AMR) _ BUPL.pdf', 'c:\\BUPL/pdffiles/BUPL sa

In [17]:
# Shuffle list of files to get a random subset óf embeddings
random.seed(42)

shuffled_list = random.sample(found_files, len(found_files))
print(shuffled_list)

['c:\\BUPL/pdffiles/Vejledning i valg af TR på private overenskomster.pdf', 'c:\\BUPL/pdffiles/BUPL Samarbejdsportal _ BUPL.pdf', 'c:\\BUPL/pdffiles/Aftale om arbejdstidsregler for pædagogisk personale.pdf', 'c:\\BUPL/pdffiles/Kontingent _ BUPL.pdf', 'c:\\BUPL/pdffiles/Hvorfor_skal_kollegaen_vaere_medlem.pdf', 'c:\\BUPL/pdffiles/Hvem er medlem_ _ BUPL.pdf', 'c:\\BUPL/pdffiles/DialogPaaArbejdspladsen_Acc-Fobu-LDD-DLO-FDDB.pdf', 'c:\\BUPL/pdffiles/BUPL saa mange gode grunde.pdf', 'c:\\BUPL/pdffiles/Tillidsrepræsentantens materialer til organisering og relationsstyrkende indsats på arbejdspladsen _ BUPL.pdf', 'c:\\BUPL/pdffiles/Bilag til Eksempler og tekster, Tjekliste til MED og TR på skoleniveau.pdf', 'c:\\BUPL/pdffiles/Samarbejdet med tillids- og arbejdsmiljørepræsentanten _ BUPL.pdf', 'c:\\BUPL/pdffiles/Aftale om fravær af familiemæssige årsager pr. 1. april 2021.pdf', 'c:\\BUPL/pdffiles/Uddannelse af tillidsrepræsentanter _ BUPL.pdf', 'c:\\BUPL/pdffiles/Tjenstlige_samtaler_Juni23 pri

### Process docs

In [18]:
# Set hyperparameters
CHUNKSIZE = 1000 
CHUNKOVERLAP = 100

In [19]:
def preprocessing(doc, use_meta=False):
    if use_meta:
        page_content, metadata = doc.page_content, doc.metadata

        #some processing
        page_content_processed = re.sub(r'\s+', ' ', page_content.replace("\\n", " "))


        # one could do the following instead>
        # doc.page_content = page_content_processed
        # return doc
        dict_data = {'metadata': metadata}
        dict_data['metadata']['text'] = page_content_processed
        
        return dict_data
    else:
        doc = str(doc)    
        #Remove whitespace
        full_text = re.sub(r'\s+', ' ', doc.replace("\\n", " ")) 

    return full_text

In [20]:
def langchain_loader_splitter(list_pdf_paths, chunk_size, overlap):

    list_of_docs_non_processed = []     # used for synthetic dataset
    list_of_docs_processed = []
    for pdf_path in tqdm(list_pdf_paths):    
        try:
            # Use load_and_split() to split the document into sentences
            loader = PyPDFLoader(pdf_path)
            data = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
            # will create the chunks in "documents"
            documents = text_splitter.split_documents(data)
            list_of_docs_non_processed.append(documents)
            # One can have lists of shape [[..],[..], ..., [..]] for number of chunks created 
            documents = [preprocessing(doc, True) for doc in documents]  
            list_of_docs_processed.append(documents)
                
        except Exception as e:
            print(f"Error loading {pdf_path}: {e}")

    # can have shape [ [..], [[..], [..]] ]. Need to unpack
    list_of_docs__non_processed_unpacked = list(itertools.chain(*list_of_docs_non_processed))
    list_of_docs_processed_unpacked = list(itertools.chain(*list_of_docs_processed))

    return list_of_docs_processed_unpacked, list_of_docs__non_processed_unpacked


# Write the list of strings to a file
def write_dicts_to_pickle(dicts, pickle_file_path):
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(dicts, f)


# ONLY RUN IF THE dataload configuration says so
if config['run_langchain']:
    # First half
    docs1, pure_docs1 = langchain_loader_splitter(shuffled_list[:50], CHUNKSIZE, CHUNKOVERLAP)
    write_dicts_to_pickle(docs1, "data/BUPL/dicts1.pkl")
    print(f"{len(docs1)} dicts have been written to 'dicts1.pkl'")

    # Second half
    docs2, pure_docs2 = langchain_loader_splitter(shuffled_list[50:], CHUNKSIZE, CHUNKOVERLAP)
    write_dicts_to_pickle(docs2, "data/BUPL/dicts2.pkl")
    print(f"{len(docs2)} dicts have been written to 'dicts2.pkl'")

  0%|          | 0/50 [00:00<?, ?it/s]

1279 dicts have been written to 'dicts1.pkl'


  0%|          | 0/32 [00:00<?, ?it/s]

1322 dicts have been written to 'dicts2.pkl'


In [31]:
if config['run_create_synthetic_testset']:
    # generator with openai models
    documents = pure_docs1 + pure_docs2

    # cannot change chunk_overlap!

    generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
    critic_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )

    # generate testset in splits
    testset = generator.generate_with_langchain_docs(documents[:1300], test_size=20, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    df_synthetic_testset = testset.to_pandas()
    # save synthetic data 
    df_synthetic_testset.to_csv('data/BUPL/synthetic_testset/synthetic_data1.csv', index=False)

    time.sleep(60)

    testset = generator.generate_with_langchain_docs(documents[1300:], test_size=20, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    df_synthetic_testset = testset.to_pandas()
    # save synthetic data 
    df_synthetic_testset.to_csv('data/BUPL/synthetic_testset/synthetic_data2.csv', index=False)

embedding nodes:   0%|          | 0/2600 [00:00<?, ?it/s]

In [27]:
# Read the data dicts
with open('data/BUPL/dicts1.pkl', 'rb') as file:
    dicts1 = pickle.load(file)

with open('data/BUPL/dicts2.pkl', 'rb') as file:
    dicts2 = pickle.load(file)


print(f"{len(dicts1)} dicts have been read from dicts1.pkl.")
print(f"{len(dicts2)} dicts have been read from dicts2.pkl.")

# Concate all strings
docs = dicts1 + dicts2

1279 dicts have been read from dicts1.pkl.
1322 dicts have been read from dicts2.pkl.


## Move to Pinecone

https://app.pinecone.io/organizations/-NxMHT03GCEQHVgGQcfo/projects/b8d42a36-6186-413b-9291-64d9b3e9f6a9/indexes

https://docs.pinecone.io/guides/getting-started/quickstart


Other approaches were considered. One was from: https://medium.com/@varsha.rainer/building-a-rag-application-from-scratch-using-langchain-openais-whisper-pinecone-6b2fbf22f77f
- However, giving the vectors id's and saving intermediate results was made complicated
- The code skeleton is outcommented below:

In [None]:
#from langchain_openai.embeddings import OpenAIEmbeddings



#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#documents = text_splitter.split_documents(text_documents)

#embeddings = OpenAIEmbeddings(api_key=os.environ.get("OPENAI_KEY"))

#pinecone = PineconeVectorStore.from_documents(
#    documents, embeddings, index_name=index_name
#)

### Create pinecone index

Create index

In [2]:
pc = Pinecone(api_key=os.environ.get("PINECONE_KEY"))

INDEX_NAME = "bupl-index"
DIM_OF_VECTOR = 1536
SIM_METRIC = 'cosine'


def create_pinecone_index(bool):
    if bool == True:
        pc.create_index(
            name=INDEX_NAME,
            dimension=DIM_OF_VECTOR, 
            metric=SIM_METRIC, 
            spec=ServerlessSpec(
                cloud="aws",
                region="eu-west-1"
            ) 
        )
    else:   
        print('Passing the creation of the pinecone index, since it already exists')
        pass

create_pinecone_index(False)

Passing the creation of the pinecone index, since it already exists


### Upsert embeddings in the pinecone index

In [32]:
OPENAI_KEY = os.getenv("OPENAI_KEY")
client = OpenAI(api_key=OPENAI_KEY)

MODEL = 'text-embedding-3-small' #"text-embedding-ada-002" 


In [36]:

def process_text(dicto):
        res = client.embeddings.create(input=[dicto['metadata']['text']], model=MODEL)
        embedding = res.data[0].embedding

        # insert the embedding in dictionary with key 'values'
        dicto['values'] = embedding
        return dicto


# PARALLEL FUNCTION
def create_embeddings_parallel(dicts):
    embeddings_list = []

    # Do parallel for-loop (improves performance more than x5)
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_text, dicto) for dicto in dicts]
        for future in tqdm(futures, total=len(dicts)):
            embeddings_list.append(future.result())

    return embeddings_list #(Then here will be a list of dicts)


# define function for saving embeddings as pickle

def get_interval_borders_from_name(interval_name):
    start_n, end_n = interval_name.split('-')
    start_n = int(start_n)
    end_n = int(end_n)
    return start_n, end_n

def run_and_save_embedding_list(interval_name, docs):
    # get interval
    start_n, end_n = get_interval_borders_from_name(interval_name)
    # run
    embeddings_list = create_embeddings_parallel(docs[start_n : end_n])
    # save
    filename = f'new_embeddings_{interval_name}.pkl'
    with open('data/BUPL/new_embeddings/' + filename, 'wb') as file:
        pickle.dump(embeddings_list, file)





In [37]:
embedding_interval1 = '0-5000'
#embedding_interval2 = '5000-10000'
#embedding_interval3 = '10000-15000'


if config['run_create_embeddings']:

    run_and_save_embedding_list(embedding_interval1, docs)

    #run_and_save_embedding_list(embedding_interval2, docs)

    #run_and_save_embedding_list(embedding_interval3, docs)

else:
    print('Not running embeddings...')

  0%|          | 0/2601 [00:00<?, ?it/s]

In [38]:
# open embeddings

# Open the file in read-binary mode and use pickle to load the data
with open('data/BUPL/new_embeddings/' + f'new_embeddings_{embedding_interval1}.pkl', 'rb') as file:
    embeddings1 = pickle.load(file)

#with open('data/BUPL/new_embeddings/' + f'new_embeddings_{embedding_interval2}.pkl', 'rb') as file:
#    embeddings2 = pickle.load(file)

#with open('data/BUPL/new_embeddings/' + f'new_embeddings_{embedding_interval3}.pkl', 'rb') as file:
#    embeddings3 = pickle.load(file)

embeddings_list = embeddings1 #+ embeddings2 + embeddings3

### Upsert in Pinecone

In [39]:
# Retrieve key
PINECONE_KEY = os.getenv("PINECONE_KEY")
pc = Pinecone(api_key=PINECONE_KEY)

DIM_OF_VECTOR = len(embeddings_list[0]) #1536
SIM_METRIC = 'cosine'

index = pc.Index(INDEX_NAME)

In [40]:
def upsert_embeddings_to_pinecone(index, embeddings_list, bool_add):

    # Will first create ids according to if we want to overwrite or add to pinecone database
    
    if bool_add:    # will start id at last id value in database
        nr_embeddings_exists = index.describe_index_stats()['total_vector_count']
        ids = [str(x) for x in range(nr_embeddings_exists, len(embeddings_list) + 1 + nr_embeddings_exists)]
        for id, dicto in tqdm(zip(ids, embeddings_list)):
            dicto.update({'id': id})
        #embeddings_list_w_ids = [(dicto |= {'id': id}) for id, dicto in tqdm(zip(ids, embeddings_list))]
        #print(embeddings_list)
        
    else:           # will start id from 0 (and overwrite)
        ids = [str(x) for x in range(len(embeddings_list) + 1)]
        for id, dicto in tqdm(zip(ids, embeddings_list)):
            dicto.update({'id': id})
        #print(embeddings_list)
    
    # Pinecone can only handle 2MB at a time. 
        # We divide into batches
    batch_size = 253 
    lower_b = 0
    upper_b = batch_size
    while lower_b < len(embeddings_list):

        # send batch size of embeddings to database
        index.upsert(vectors=[dicto  for dicto in tqdm(embeddings_list[lower_b:upper_b])])   # ADDED THE TEXT

        # increase lower and upper
        lower_b += batch_size
        upper_b += batch_size


In [41]:
if config['run_upsert_embeddings']:
    upsert_embeddings_to_pinecone(index, embeddings_list, bool_add=False)
else:
    print('Will not upsert embeddings.')

0it [00:00, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

In [4]:
[index['name'] for index in pc.list_indexes()]

['tester', 'index-cas-onboarding', 'bupl-index']