Installation of packages

Import of packages

In [None]:
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import itertools
import re
from concurrent.futures import ThreadPoolExecutor
import yaml
import pickle
import random

import numpy as np

Reading yaml configuration

In [None]:
# Read YAML configuration
def read_config(filename):
    with open(filename, 'r') as file:
        config = yaml.safe_load(file)
    return config

config = read_config("config.yaml")

In [None]:
def search_pdf_files(directory):
    # List to store found files
    found_files = []

    # Walk through the directory and its subdirectories
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                # save path to list
                found_files.append(file_path)

    return found_files

def count_files_in_folders(directory):
    # Iterate over each directory in the given directory
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        # Check if the current item is a directory
        if os.path.isdir(folder_path):
            # Count the number of files in the directory
            num_files = len([filename for filename in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, filename))])
            print(f"Folder '{foldername}' contains {num_files} file(s).")

# Specify the directory path
directory_path = r"C:\omx_data"

# Search for PDF files containing a keyword
found_files = search_pdf_files(directory_path)

print(f'Number of pdf\'s found: {len(found_files)}')


count_files_in_folders(directory_path)

Number of pdf's found: 761
Folder 'Finanskalender' contains 102 file(s).
Folder 'Forløb af generalforsamling' contains 124 file(s).
Folder 'Indkaldelse til generalforsamling' contains 147 file(s).
Folder 'Intern viden' contains 98 file(s).
Folder 'Selskabsvedtægter' contains 100 file(s).
Folder 'Storaktionærmeddelelser' contains 102 file(s).
Folder 'Årsrapport' contains 88 file(s).


In [None]:
print(found_files) # Finanskalender --- Forløb af generalforsamling --- Indkaldelse til generalforsamling --- Intern viden --- Selskabsvægter --- , Storaktionærmeddelser --- Årsrapport

['C:\\omx_data\\Finanskalender\\06306376.pdf', 'C:\\omx_data\\Finanskalender\\06306377.pdf', 'C:\\omx_data\\Finanskalender\\07_2023 Green Hydrogen Systems - Financial calendar 2023_updated v1.pdf', 'C:\\omx_data\\Finanskalender\\08_2022_Financial Calendar 2022-23 - Coloplast BV.pdf', 'C:\\omx_data\\Finanskalender\\08_2022_Finanskalender 2022-23.pdf', 'C:\\omx_data\\Finanskalender\\09_2023_Finanskalender_2023-24.pdf', 'C:\\omx_data\\Finanskalender\\11280986.pdf', 'C:\\omx_data\\Finanskalender\\11_2023 Finanskalender 2023.pdf', 'C:\\omx_data\\Finanskalender\\15-2022 Finanskalender 2023 Djurslands Bank.pdf', 'C:\\omx_data\\Finanskalender\\17 Announcement_16122022_2023 Financial Calendar.pdf', 'C:\\omx_data\\Finanskalender\\18_2022 Green Hydrogen Systems - Financial calendar 2023.pdf', 'C:\\omx_data\\Finanskalender\\18_2023 Company Announcement.pdf', 'C:\\omx_data\\Finanskalender\\1_2023 Company Announcement.pdf', 'C:\\omx_data\\Finanskalender\\2022-12 Finanskalender 2023.pdf', 'C:\\omx_da

In [None]:
# Shuffle list of files to get a random subset óf embeddings
random.seed(42)

shuffled_list = random.sample(found_files, len(found_files))
print(shuffled_list)

['C:\\omx_data\\Storaktionærmeddelelser\\NKT notification - signed.pdf', 'C:\\omx_data\\Forløb af generalforsamling\\06_Proceedings_AGM_29062023.pdf', 'C:\\omx_data\\Finanskalender\\Announcement337_Changes to the Financial Calendar_final.pdf', 'C:\\omx_data\\Indkaldelse til generalforsamling\\Bilag 3 - CV.pdf', 'C:\\omx_data\\Indkaldelse til generalforsamling\\236 - Indkaldelse ordinr GF 2023.pdf', 'C:\\omx_data\\Indkaldelse til generalforsamling\\03a_Annex_1_Notice_AGM_06062023.pdf', 'C:\\omx_data\\Forløb af generalforsamling\\20230427 AB Sveriges Sakerstallda Obligationer publ arsstamma_2023.pdf', 'C:\\omx_data\\Forløb af generalforsamling\\03109509.pdf', 'C:\\omx_data\\Årsrapport\\Arsrapport 2022 - Vrdipapirfonden Sydinvest ny.pdf', 'C:\\omx_data\\Selskabsvedtægter\\Vedtgter 2022 v2 DK endelig.pdf', 'C:\\omx_data\\Finanskalender\\Ress Life Investments AS - Company Announcement financial calendar 2023.pdf', 'C:\\omx_data\\Storaktionærmeddelelser\\21_2023 Green Hydrogen Systems - Majo

### Process docs

In [None]:
# Set hyperparameters
CHUNKSIZE = 1000 
CHUNKOVERLAP = 100

In [None]:
def preprocessing(doc, use_meta=False):
    if use_meta:
        page_content, metadata = doc.page_content, doc.metadata

        #some processing
        page_content_processed = re.sub(r'\s+', ' ', page_content.replace("\\n", " "))

        dict_data = {'metadata': metadata}
        dict_data['metadata']['text'] = page_content_processed
        
        return dict_data
    else:
        doc = str(doc)    
        #Remove whitespace
        full_text = re.sub(r'\s+', ' ', doc.replace("\\n", " ")) 

    return full_text

In [None]:
def langchain_loader_splitter(list_pdf_paths, chunk_size, overlap):

    list_of_docs = []
    for pdf_path in tqdm(list_pdf_paths):    
        try:
            # Use load_and_split() to split the document into sentences
            loader = PyPDFLoader(pdf_path)
            data = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
            # will create the chunks in "documents"
            documents = text_splitter.split_documents(data)
            #print(documents)
            # One can have lists of shape [[..],[..], ..., [..]] for number of chunks created 
            documents = [preprocessing(doc, True) for doc in documents]  
            # append to list of all documents
            list_of_docs.append(documents)
                
        except Exception as e:
            print(f"Error loading {pdf_path}: {e}")

    # can have shape [ [..], [[..], [..]] ]. Need to unpack
    list_of_docs_unpacked = list(itertools.chain(*list_of_docs))
    return list_of_docs_unpacked


# Write the list of strings to a file
def write_dicts_to_pickle(dicts, pickle_file_path):
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(dicts, f)


# ONLY RUN IF THE dataload configuration says so
if config['run_langchain']:
    # First half
    docs1 = langchain_loader_splitter(shuffled_list[:350], CHUNKSIZE, CHUNKOVERLAP)
    write_dicts_to_pickle(docs1, "data/finance/dicts1.pkl")
    print(f"{len(docs1)} dicts have been written to 'dicts1.pkl'")

    # Second half
    docs2 = langchain_loader_splitter(shuffled_list[350:], CHUNKSIZE, CHUNKOVERLAP)
    write_dicts_to_pickle(docs2, "data/finance/dicts2.pkl")
    print(f"{len(docs2)} dicts have been written to 'dicts2.pkl'")

In [None]:
# Read the data dicts
with open('data/dicts1.pkl', 'rb') as file:
    dicts1 = pickle.load(file)

with open('data/dicts2.pkl', 'rb') as file:
    dicts2 = pickle.load(file)


print(f"{len(dicts1)} dicts have been read from dicts1.pkl.")
print(f"{len(dicts2)} dicts have been read from dicts2.pkl.")

# Concate all strings
docs = dicts1 + dicts2

22480 dicts have been read from dicts1.pkl.
26358 dicts have been read from dicts2.pkl.


## Move to Pinecone

https://app.pinecone.io/organizations/-NxMHT03GCEQHVgGQcfo/projects/b8d42a36-6186-413b-9291-64d9b3e9f6a9/indexes

https://docs.pinecone.io/guides/getting-started/quickstart


Other approaches were considered. One was from: https://medium.com/@varsha.rainer/building-a-rag-application-from-scratch-using-langchain-openais-whisper-pinecone-6b2fbf22f77f
- However, giving the vectors id's and saving intermediate results was made complicated
- The code skeleton is outcommented below:

In [None]:
#from langchain_openai.embeddings import OpenAIEmbeddings


#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
#documents = text_splitter.split_documents(text_documents)

#embeddings = OpenAIEmbeddings(api_key=os.environ.get("OPENAI_KEY"))

#pinecone = PineconeVectorStore.from_documents(
#    documents, embeddings, index_name=index_name
#)

### Create pinecone index

Create index

In [None]:
pc = Pinecone(api_key=os.environ.get("PINECONE_KEY"))

INDEX_NAME = "cas-index-new"
DIM_OF_VECTOR = 1536
SIM_METRIC = 'cosine'


def create_pinecone_index(bool):
    if bool == True:
        pc.create_index(
            name=INDEX_NAME,
            dimension=DIM_OF_VECTOR, 
            metric=SIM_METRIC, 
            spec=ServerlessSpec(
                cloud="aws",
                region="eu-west-1"
            ) 
        )
    else:   
        print('Passing the creation of the pinecone index, since it already exists')
        pass

create_pinecone_index(False)

Passing the creation of the pinecone index, since it already exists


### Upsert embeddings in the pinecone index

In [None]:
OPENAI_KEY = os.getenv("OPENAI_KEY")
client = OpenAI(api_key=OPENAI_KEY)

MODEL = 'text-embedding-3-small' #"text-embedding-ada-002" 


In [None]:

def process_text(dicto):
        res = client.embeddings.create(input=[dicto['metadata']['text']], model=MODEL)
        embedding = res.data[0].embedding

        # insert the embedding in dictionary with key 'values'
        dicto['values'] = embedding
        return dicto


# PARALLEL FUNCTION
def create_embeddings_parallel(dicts):
    embeddings_list = []

    # Do parallel for-loop (improves performance more than x5)
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_text, dicto) for dicto in dicts]
        for future in tqdm(futures, total=len(dicts)):
            embeddings_list.append(future.result())

    return embeddings_list #(Then here will be a list of dicts)


# NON-PARALLEL FUNCTION
def create_embeddings(texts):
    embeddings_list = []
    for text in tqdm(texts):
        res = client.embeddings.create(input=[text], model=MODEL)
        #print(f'Response from API: {res}')
        embeddings_list.append((res.data[0].embedding, text))
    return embeddings_list


# define function for saving embeddings as pickle

def get_interval_borders_from_name(interval_name):
    start_n, end_n = interval_name.split('-')
    start_n = int(start_n)
    end_n = int(end_n)
    return start_n, end_n

def run_and_save_embedding_list(interval_name, docs):
    # get interval
    start_n, end_n = get_interval_borders_from_name(interval_name)
    # run
    embeddings_list = create_embeddings_parallel(docs[start_n : end_n])
    # save
    filename = f'new_embeddings_{interval_name}.pkl'
    with open('data/finance/new_embeddings/' + filename, 'wb') as file:
        pickle.dump(embeddings_list, file)





In [None]:
embedding_interval1 = '0-5000'
embedding_interval2 = '5000-10000'
embedding_interval3 = '10000-15000'


if config['run_create_embeddings']:

    run_and_save_embedding_list(embedding_interval1, docs)

    run_and_save_embedding_list(embedding_interval2, docs)

    run_and_save_embedding_list(embedding_interval3, docs)

else:
    print('Not running embeddings...')

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
# open embeddings

# Open the file in read-binary mode and use pickle to load the data
with open('data/finance/new_embeddings/' + f'new_embeddings_{embedding_interval1}.pkl', 'rb') as file:
    embeddings1 = pickle.load(file)

with open('data/finance/new_embeddings/' + f'new_embeddings_{embedding_interval2}.pkl', 'rb') as file:
    embeddings2 = pickle.load(file)

with open('data/finance/new_embeddings/' + f'new_embeddings_{embedding_interval3}.pkl', 'rb') as file:
    embeddings3 = pickle.load(file)

embeddings_list = embeddings1 + embeddings2 + embeddings3

### Upsert in Pinecone

In [None]:
# Retrieve key
PINECONE_KEY = os.getenv("PINECONE_KEY")
pc = Pinecone(api_key=PINECONE_KEY)

#INDEX_NAME = "index-cas-onboarding"
DIM_OF_VECTOR = len(embeddings_list[0]) #1536
SIM_METRIC = 'cosine'

index = pc.Index(INDEX_NAME)

In [38]:
def upsert_embeddings_to_pinecone(index, embeddings_list, bool_add):

    # Will first create ids according to if we want to overwrite or add to pinecone database
    
    if bool_add:    # will start id at last id value in database
        nr_embeddings_exists = index.describe_index_stats()['total_vector_count']
        ids = [str(x) for x in range(nr_embeddings_exists, len(embeddings_list) + 1 + nr_embeddings_exists)]
        for id, dicto in tqdm(zip(ids, embeddings_list)):
            dicto.update({'id': id})
        #embeddings_list_w_ids = [(dicto |= {'id': id}) for id, dicto in tqdm(zip(ids, embeddings_list))]
        #print(embeddings_list)
        
    else:           # will start id from 0 (and overwrite)
        ids = [str(x) for x in range(len(embeddings_list) + 1)]
        for id, dicto in tqdm(zip(ids, embeddings_list)):
            dicto.update({'id': id})
        #print(embeddings_list)
    
    # Pinecone can only handle 2MB at a time. 
        # We divide into batches
    batch_size = 253 
    lower_b = 0
    upper_b = batch_size
    while lower_b < len(embeddings_list):

        # send batch size of embeddings to database
        index.upsert(vectors=[dicto  for dicto in tqdm(embeddings_list[lower_b:upper_b])])   # ADDED THE TEXT

        # increase lower and upper
        lower_b += batch_size
        upper_b += batch_size


In [39]:
if config['run_upsert_embeddings']:
    upsert_embeddings_to_pinecone(index, embeddings_list, bool_add=False)
else:
    print('Will not upsert embeddings.')

0it [00:00, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]