In [None]:
import os
import json
import os
import openai
import pandas as pd
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PDFMinerLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, AzureOpenAI





In [None]:


# --- Azure OpenAI connection settings (read from the environment) ----------
# Each value is None when the corresponding variable is not set.
API_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
API_VERSION = os.environ.get("OPENAI_API_VERSION")
API_TYPE = os.environ.get("OPENAI_API_TYPE")
API_KEY = os.environ.get("OPENAI_API_KEY")

# --- Model / deployment names ----------------------------------------------
DEPLOYMENT_MODEL = "gpt-35-turbo-16k"       # used as deployment name and model name
COMPLETIONS_MODEL = "gpt-3.5-turbo-16k"     # passed as tiktoken_model_name
EMBEDDING_MODEL = "text-embedding-ada-002"  # used as embedding deployment and model
TEMP = 0.3  # sampling temperature for the chat model

# Chat model used for the chunk-similarity prompts.
llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_MODEL,
    model_name=DEPLOYMENT_MODEL,
    tiktoken_model_name=COMPLETIONS_MODEL,
    temperature=TEMP,
    azure_endpoint=API_ENDPOINT,
    openai_api_version=API_VERSION,
    openai_api_key=API_KEY,
    openai_api_type=API_TYPE,
)

# Embedding client configured against the same endpoint and credentials.
embeddings = AzureOpenAIEmbeddings(
    deployment=EMBEDDING_MODEL,
    model=EMBEDDING_MODEL,
    azure_endpoint=API_ENDPOINT,
    openai_api_version=API_VERSION,
    openai_api_key=API_KEY,
    openai_api_type=API_TYPE,
)

In [None]:
def doc_creator_recusplit(file: str, chunk_size: int, chunk_overlap: int, source_column = None) -> list:
    """
        Load a pdf file and return it as a list of documents split into chunks of text.

        Arguments:
            file: path of the pdf file to load
            chunk_size: the maximum size of each chunk produced by the splitter
            chunk_overlap: the overlap between consecutive chunks
            source_column: currently unused; accepted only so existing callers keep working

        Returns:
            The list of chunked documents produced by
            RecursiveCharacterTextSplitter.split_documents.
    """
    # create the pdf loader and load the documents
    loader = PDFMinerLoader(file)
    documents = loader.load()

    # create the text splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # split the documents into chunks
    return text_splitter.split_documents(documents)

In [None]:
def get_txt_summary(llm, text1, text2, score_threshold=0.5):
    """
    Ask the language model whether two texts belong to the same context and
    merge them when the score it returns is above ``score_threshold``.

    Args:
    llm (AzureChatOpenAI): The language model used for comparison. Any object
        with an ``invoke(prompt)`` method whose result has a ``content``
        attribute works.
    text1 (str): The first text.
    text2 (str): The second text.
    score_threshold (float): The texts are merged only when the score is
        strictly greater than this value.

    Returns:
    list: A one-element list holding both texts joined by a newline when the
        score exceeds the threshold, otherwise the two-element list
        ``[text1, text2]``.

    Raises:
    ValueError: If no numeric score can be extracted from the model reply.
    """

    prompt = f"""
    I want you to compare the two texts below and tell me if the second text completes the first text’s context or if they can be split apart,

    I want you to return score between 0 and 1 where 0 they don’t belong to the same context and should be split, and 1 they definatly belong to the same context

    please only tell me the score value and nothing else. For example “0.75”

    text 1: {text1}

    text 2: {text2}

    Think about your answer, does it only contain a float value and nothing else? if not, please correct that and only return the float value
    """
    txt_summary = llm.invoke(prompt)
    raw_reply = str(txt_summary.content)
    print(raw_reply)

    # The prompt shows its example wrapped in curly quotes, so the model may
    # echo quotes or padding; strip them before parsing.
    candidate = raw_reply.strip().strip("\"'“”‘’").strip()
    try:
        score = float(candidate)
    except ValueError:
        # Fall back to the first number in the reply (e.g. "Score: 0.75").
        number = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", candidate)
        if number is None:
            raise ValueError(
                f"The return value cannot be casted into float: {raw_reply!r}"
            ) from None
        score = float(number.group())

    if score > score_threshold:
        return [text1 + "\n" + text2]
    return [text1, text2]

    





In [None]:
def parser(pdf_path, chunk_size=300, chunk_overlap=60):
    """
    Split a pdf into chunks, normalise their whitespace and drop chunks that
    look like table-of-contents filler.

    Args:
    pdf_path (str): Path of the pdf file to parse.
    chunk_size (int): Chunk size forwarded to doc_creator_recusplit.
    chunk_overlap (int): Chunk overlap forwarded to doc_creator_recusplit.

    Returns:
    list: The kept chunk documents, each with every run of whitespace in
        ``page_content`` collapsed to a single space.
    """
    # Markers are lower-case because they are matched against lower-cased text.
    patterns = ("table of contents", "......", ". . . . . . .")
    documents = doc_creator_recusplit(file=pdf_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    processed_documents = []
    for document in documents:
        # Raw string: '\s' in a normal string literal is an invalid escape sequence.
        clean_text = re.sub(r'\s+', ' ', document.page_content)

        # Skip table-of-contents headers and dotted leader lines.
        if any(pat in clean_text.lower() for pat in patterns):
            continue

        document.page_content = clean_text
        processed_documents.append(document)
    return processed_documents


In [None]:

# The pdf location can be overridden with the RESUME_PDF_PATH environment
# variable; the original local path stays as the default so that existing
# setups keep working unchanged.
pdf_path = os.getenv("RESUME_PDF_PATH", 'C:/github/Resume chatbot/test_data/example_cv.pdf')

# Cleaned, chunked documents used by the merging step.
docs = parser(pdf_path=pdf_path)



In [None]:

# Greedily merge consecutive chunks that the llm judges to share a context.
new_docs = []
txt_list = []  # result of the most recent comparison

# txt1 is the running text: either a single chunk or the union of the chunks
# merged so far. It is None only when there are no chunks at all.
txt1 = docs[0].page_content if docs else None

for next_doc in docs[1:]:
    txt2 = next_doc.page_content
    txt_list = get_txt_summary(llm = llm, text1=txt1, text2=txt2, score_threshold=0.4)

    if len(txt_list) == 2: # no uniting took place, texts are not related, appending txt1
        print("<<---------------------------------------------------------------->>")
        print(txt1)
        print("<<---------------------------------------------------------------->>")
        new_docs.append(txt_list[0])
        txt1 = txt_list[1]  # the next chunk starts a new running text
    else:
        txt1 = txt_list[0]  # texts were united, keep comparing against the united text

# Flush the text still pending after the last comparison; without this the
# final chunk (or final merged group) would never reach new_docs.
if txt1 is not None:
    new_docs.append(txt1)

    


    


    