In [75]:
import ast
import os

import numpy as np
import pandas as pd
import PyPDF2
from openai import OpenAI
# Shared OpenAI client used by the embedding helpers in this notebook. The API key is read
# from the OPENAI_API_KEY environment variable (os.environ.get returns None when it is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

Embedding code

In [184]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced with spaces, then one request is sent
    through the notebook-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector (any array-like accepted by NumPy).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector has
        zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction. Dividing would give nan/inf plus a
        # RuntimeWarning, so report "no similarity" instead.
        return 0.0
    return np.dot(a, b) / norm_product
 
def get_embeddings(
    list_of_text: list[str], model="text-embedding-3-small", **kwargs) -> list[list[float]]:
    """Return one embedding per input text, in input order.

    This is the batch counterpart of ``get_embedding``: newlines in every text
    are replaced with spaces and the texts are sent through the notebook-level
    ``client``. Requests carry at most 2048 texts each, so longer inputs are
    split across several requests instead of being rejected.

    Args:
        list_of_text: Iterable of strings to embed (a list or a pandas Series).
        model: Name of the embedding model to request.
        **kwargs: Forwarded unchanged to ``client.embeddings.create``.

    Returns:
        A list with the ``embedding`` of each response item; empty when
        ``list_of_text`` is empty (no request is sent in that case).
    """
    max_batch_size = 2048
    cleaned = [text.replace("\n", " ") for text in list_of_text]
    embeddings: list[list[float]] = []
    for start in range(0, len(cleaned), max_batch_size):
        batch = cleaned[start:start + max_batch_size]
        data = client.embeddings.create(input=batch, model=model, **kwargs).data
        embeddings.extend(d.embedding for d in data)
    return embeddings

def create_embeddings_data(file, embedded_name=None, model="text-embedding-3-small"):
    """Embed the ``text`` column of a CSV file and save the result as a new CSV.

    Args:
        file: Path of the input CSV; it must contain a ``text`` column.
        embedded_name: Path of the CSV to write. When omitted, ``-embeddings``
            is inserted before the extension of ``file``
            (``fed-speech.csv`` -> ``fed-speech-embeddings.csv``).
        model: Name of the embedding model passed to ``get_embeddings``.

    The output holds every input column plus an ``embeddings`` column; no index
    column is written.
    """
    if embedded_name is None:
        stem, extension = os.path.splitext(file)
        embedded_name = f"{stem}-embeddings{extension}"
    df = pd.read_csv(file)
    df['embeddings'] = get_embeddings(df['text'].tolist(), model=model)
    df.to_csv(embedded_name, index=False)
        
def load_embeddings_data(file):
    """Load a CSV with an ``embeddings`` column and parse that column into arrays.

    Args:
        file: Path of a CSV whose ``embeddings`` cells hold the text form of a
            list of numbers (e.g. ``"[0.1, 0.2]"``).

    Returns:
        The DataFrame read from ``file`` with each ``embeddings`` cell converted
        to a NumPy array.

    Raises:
        ValueError, SyntaxError: If a cell is not a plain Python literal.
    """
    df = pd.read_csv(file)
    # ast.literal_eval only accepts Python literals, so a tampered CSV cannot
    # run code the way the built-in eval() would.
    df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)
    return df

def get_most_similar(text, df, n=3):
    """Print the ``n`` rows of ``df`` whose embeddings are closest to ``text``.

    Args:
        text: Query string; it is embedded with ``get_embedding``.
        df: DataFrame with ``text`` and ``embeddings`` columns.
        n: Maximum number of suggestions to print. If ``df`` has fewer rows,
            every row is printed instead of raising an error.

    Side effect: a ``similarities`` column (cosine similarity to the query) is
    written onto the ``df`` object passed in. Nothing is returned.
    """
    search_embedded = get_embedding(text)
    df['similarities'] = df['embeddings'].apply(lambda x : cosine_similarity(x,search_embedded))
    ranked = df.sort_values("similarities", ascending=False).reset_index(drop=True)
    # head() stops at the end of the frame, so n larger than len(df) is safe;
    # max() keeps a negative n from selecting rows (head(-k) drops the last k).
    for i, suggestion in enumerate(ranked["text"].head(max(n, 0))):
        print(f'Suggestion {i}: {suggestion}')

def get_most_similar_unique(text, df, n=1):
    """Run ``get_most_similar`` once for every distinct word in ``text``.

    Args:
        text: Query string; it is split on whitespace.
        df: DataFrame passed through to ``get_most_similar``.
        n: Number of suggestions requested per word.

    Words are processed in order of first appearance, so the printed output is
    the same on every run.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order; a set of
    # strings would iterate in an order that can differ between interpreter runs.
    for word in dict.fromkeys(text.split()):
        get_most_similar(word, df, n)
        
def jaccard_similarity(embedding1, embedding2):
    """Return the Jaccard index of the two arguments viewed as sets.

    Each argument is turned into a ``set`` of its elements (for a string that
    means its characters). The result is the number of shared elements divided
    by the number of distinct elements overall, or ``0.0`` when both are empty.
    """
    left, right = set(embedding1), set(embedding2)
    combined = left | right
    if not combined:
        return 0.0
    return len(left & right) / len(combined)

Test example

In [77]:
# Example usage: optionally embed fed-speech.csv, then run a few searches against the result
answer = input("Do you want to create embeddings for the text in fed-speech.csv? (y/any key): ")
if answer == "y":
    # Pass the output path explicitly so the file written here is the one loaded on the next line.
    create_embeddings_data("fed-speech.csv", "fed-speech-embeddings.csv") # creates embeddings for the text in fed-speech.csv and saves it to fed-speech-embeddings.csv
    df = load_embeddings_data('fed-speech-embeddings.csv') # loads the embeddings from fed-speech-embeddings.csv
    search = "PCE fomc august prices" # search query
    get_most_similar(search,df) # get the most similar text to the search query
    get_most_similar_unique(search,df) # get the most similar text to the search query for each unique word
    get_most_similar("the federal open market comittee",df,n=3) # less specific search query

Split the PDF text into blank-line-separated chunks, then split by contract number

In [175]:
import re

def split_pdf_by_page_to_csv(pdf_file):
    """Extract the text of a PDF and split it into blank-line-delimited chunks.

    Despite the name, no CSV is written and rows do not correspond to pages.
    The extracted lines of all pages are read in order, and a new chunk starts
    whenever an empty line is met, so a chunk may continue across a page
    boundary. Within a chunk every line is followed by a single space.

    Args:
        pdf_file: Anything accepted by ``PyPDF2.PdfReader``.

    Returns:
        A DataFrame with one ``text`` column and one row per non-empty chunk,
        including the text that follows the last empty line.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    chunks = []
    chunk = ""
    for page in pdf.pages:
        # Treat a page with no extractable text as empty rather than failing on None.
        text = page.extract_text() or ""
        for line in text.split("\n"):
            if line != "":
                chunk += line + " "
            elif chunk:
                # Only close a chunk that has content, so runs of blank lines
                # do not produce empty rows.
                chunks.append(chunk)
                chunk = ""
    # Keep whatever follows the last blank line; without this the tail of the
    # document would be dropped.
    if chunk:
        chunks.append(chunk)
    return pd.DataFrame(chunks, columns=["text"])


def insert_newline_before_contract_number(text,pattern):
    """Replace every match of ``pattern`` in ``text`` with a newline plus group 1.

    ``pattern`` must define at least one capture group. Anything the pattern
    matches outside that group (such as surrounding spaces) is removed.
    """
    matcher = re.compile(pattern)
    return matcher.sub(r'\n\1', text)

def chunk_by_contract_number(pdf_file, csv_file):
    """Split a PDF's text at contract numbers and save one chunk per CSV row.

    Args:
        pdf_file: PDF to read; passed to ``split_pdf_by_page_to_csv``.
        csv_file: Path of the CSV to write. It gets a single ``text`` column
            and no index column.
    """
    df = split_pdf_by_page_to_csv(pdf_file)
    # Contract numbers look like X., X.X., X.XX. or XX.XX. with a space on both sides.
    pattern = r' (\d{1,2}(\.\d{1,2})?\.) '

    # Put each contract number at the start of its own line, then join all rows
    # into one newline-separated string.
    all_text = '\n'.join(
        insert_newline_before_contract_number(chunk, pattern) for chunk in df['text']
    )

    # One output row per non-blank line.
    split_sentences = [line for line in all_text.split('\n') if line.strip()]
    new_df = pd.DataFrame(split_sentences, columns=['text'])

    # Write to the path the caller asked for rather than a fixed file name.
    new_df.to_csv(csv_file, index=False)

In [186]:
# Two sentences built from the same words in a different order.
str1 = "The fox jumped over the cat"
str2 = "The cat jumped over the fox"

emb1 = get_embedding(str1)
emb2 = get_embedding(str2)

print(cosine_similarity(emb1, emb2))
# The raw strings (not the embeddings) are passed here; jaccard_similarity builds a set from
# each argument, so for strings it compares the sets of characters.
print(jaccard_similarity(str1, str2))

0.9713423673818518
1.0


In [187]:
# Embed the text column of NS-test-split.csv and write the result to NS-test-split-embeddings.csv.
create_embeddings_data("NS-test-split.csv", "NS-test-split-embeddings.csv")

In [188]:
# Reload the saved embeddings; load_embeddings_data turns the stored text back into NumPy arrays.
df = load_embeddings_data("NS-test-split-embeddings.csv")

Ref. CCB krav: "OBI-CON-036 Bygg F 1 etasje - merarbeid lime plater"

In [189]:
# Query text for the similarity search below; it refers to "punkt 3.18".
search = "Iht. punkt 3.18 mener vi at merkostnad for tilpasning til nevnte forhold ovenfor berettiger tilleggskostnadene."

In [190]:
# Print the three rows of df whose embeddings are most similar to the query.
get_most_similar(search,df,n=3)

Suggestion 0: 3.30.  Prising av endringer   Alle priser i endringsmeldinger skal gjenspeile prisnivå på poster i postoppsettet.     Dette gjelder også for R&D  der 5% tillegges som fast sats.   Dersom det er spesielle forhold, f.eks. inntransport etter byggheis er demontert, er UE  berettiget til ytterlig ere kompensasjon for dette .    
Suggestion 1: 3.31. Fordelingskostnader   
Suggestion 2: 3.18.  Hulltaking i himling :  OBI har t ilbudt h ulltagning og kantforsegling himlinger kr 40, - pr m² .     Prisen inkluderer alle tilpasninger i himling . Blant annet, men ikke begrenset til:   hulltaking, tilpasning grid for lamper, tilpasning søyler, tilpasning til vegger,  kantforsegling av plater som kappes for både hull og tilpasninger. Prisen gjelde r for alle  typer himlinger.     Opplimt e akustisk plater (post 12 -13-14-15) omfattes ikke av d ette punkt. Plater  monteres etter teknisk inkl. i enhetspriser. Nødvendig tilpasning av plater som følge av  krasj med teknisk medfører tillegg

In [84]:
# Print the single most similar row for the bare query string "3.3.".
get_most_similar("3.3.",df,n=1)

Suggestion 0: 3.29.   Rigg og drift   Rigg og drift  (R&D) , post 1 i tilbud for systeminnredning og post 1 i tilbud for himling.     R&D honoreres med 5% av etter faktisk utførte mengder.       3.30.   Prising av endringer   Alle priser i endringsmeldinger skal gjenspeile prisnivå på poster i postoppsettet.     Dette gjelder også for R&D  der 5% tillegges som fast sats.   Dersom det er spesielle forhold, f.eks. inntransport etter byggheis er demontert, er UE  berettiget til ytterlig ere kompensasjon for dette .     3.31.


Create meaningful chunks with a semantic text splitter

In [113]:
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer

# Read the whole file as raw text (it is not parsed as CSV), so any header line or quoting
# in the file ends up in `content` unchanged.
# NOTE(review): no encoding is given, so the platform default is used — confirm it matches the file.
with open("NS-test.csv", "r") as file:
    content = file.read()

# Passed to the splitter below; presumably the per-chunk token limit — TODO confirm against
# the semantic_text_splitter documentation.
max_tokens = 200
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, max_tokens)
chunks = splitter.chunks(content)
# Create a DataFrame from the chunks
# NOTE: this rebinds the notebook-level name `df`, which earlier cells use for the embeddings data.
df = pd.DataFrame({'text': chunks})
# Save the DataFrame to a CSV file
df.to_csv('NS-chunk.csv', index=False)