In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
import openai
import faiss
from openai import OpenAI
import fitz
import numpy as np

In [2]:
from dotenv import load_dotenv
import os
# load_dotenv() (python-dotenv) runs first so the lookup below can see a key
# supplied through a .env file; the key is then set on the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")


In [3]:
# Explicit client instance. The other cells visible here call the module-level
# `openai.*` helpers instead, so `client` is not referenced again in this view.
client = OpenAI()

In [4]:
def pagelevel_pdf_process(pdf_path):
    """Extract the text of a PDF, one string per page.

    Args:
        pdf_path: Location of the PDF file; passed straight to ``fitz.open``.

    Returns:
        list[str]: ``page.get_text()`` for every page, in page order.
    """
    # Open through a context manager so the document (and its underlying file
    # handle) is closed even when text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        return [
            document.load_page(page_num).get_text()
            for page_num in range(len(document))
        ]

In [5]:
def paralevel_pdf_process(pdf_path):
    """Split a PDF's text into paragraph-sized chunks.

    Each page's text (as returned by ``pagelevel_pdf_process``) is cut wherever
    two consecutive newlines occur. Every piece is stripped of surrounding
    whitespace, and pieces that are empty after stripping are discarded.

    Args:
        pdf_path: Location of the PDF file, forwarded to ``pagelevel_pdf_process``.

    Returns:
        list[str]: The stripped paragraphs, in document order.
    """
    return [
        piece.strip()
        for page_text in pagelevel_pdf_process(pdf_path)
        for piece in page_text.split('\n\n')
        if piece.strip()
    ]

In [6]:
# Single definition of the input document so both chunkers read the same file.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory.
PDF_PATH = "/Users/mohammedjohnyshaik/Documents/ML_Projects/ML/LLM_Summarization/data/Generative_AI.pdf"

Chunks_1 = pagelevel_pdf_process(PDF_PATH)  # one string per page
Chunks_2 = paralevel_pdf_process(PDF_PATH)  # one string per paragraph

In [7]:
# Character count of every paragraph chunk, then the overall total
# (the bare name on the last line is what the cell displays).
chunk_size = list(map(len, Chunks_2))
total_size = sum(chunk_size)
total_size

7498

In [8]:
# List the models returned by the API and print each entry.
models_ = openai.models.list()
for model in models_:
    print(model)

Model(id='dall-e-3', created=1698785189, object='model', owned_by='system')
Model(id='text-embedding-3-large', created=1705953180, object='model', owned_by='system')
Model(id='text-embedding-3-small', created=1705948997, object='model', owned_by='system')
Model(id='gpt-4-0125-preview', created=1706037612, object='model', owned_by='system')
Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal')
Model(id='dall-e-2', created=1698798177, object='model', owned_by='system')
Model(id='tts-1', created=1681940951, object='model', owned_by='openai-internal')
Model(id='tts-1-hd-1106', created=1699053533, object='model', owned_by='system')
Model(id='tts-1-1106', created=1699053241, object='model', owned_by='system')
Model(id='tts-1-hd', created=1699046015, object='model', owned_by='system')
Model(id='babbage-002', created=1692634615, object='model', owned_by='system')
Model(id='gpt-4-turbo-preview', created=1706037777, object='model', owned_by='system')


In [9]:
def tokenize_chunk(chunk, model="gpt-3.5-turbo"):
    """Encode ``chunk`` into tokens for ``model`` and return them.

    NOTE(review): this relies on ``openai.Tokenizer``, which does not appear to
    exist in the ``openai`` package as it is used elsewhere in this notebook, so
    calling this function would presumably raise ``AttributeError``. It is not
    called in any visible cell. TODO: confirm and switch to a real tokenizer.
    """
    tokens = openai.Tokenizer.encode(chunk, model=model)
    return tokens

In [10]:
def get_embeddings(tokenize_chunk, model="text-embedding-3-small"):
    """Return one embedding vector per input text.

    Args:
        tokenize_chunk: Iterable of text chunks to embed. A single ``str`` is
            treated as one chunk instead of being iterated character by
            character. (The parameter name is kept for backward compatibility;
            it holds raw text, not tokens.)
        model: Embedding model name passed to ``openai.embeddings.create``.

    Returns:
        list: ``response.data[0].embedding`` for each chunk, in input order.
        An empty input yields ``[]`` without any API request.
    """
    # Normalise the input once, under a name that shadows neither the
    # parameter nor the module-level ``tokenize_chunk`` function.
    if isinstance(tokenize_chunk, str):
        texts = [tokenize_chunk]
    else:
        texts = list(tokenize_chunk)

    embeddings = []
    for text in texts:
        # One request per chunk, matching the original request pattern.
        response = openai.embeddings.create(input=[text], model=model)
        embeddings.append(response.data[0].embedding)
    return embeddings

In [11]:
# Re-extract paragraph chunks from the same PDF path that produced Chunks_2,
# then embed every chunk (get_embeddings issues one API request per chunk).
pdf_chunks = paralevel_pdf_process('/Users/mohammedjohnyshaik/Documents/ML_Projects/ML/LLM_Summarization/data/Generative_AI.pdf')
embeddings = get_embeddings(pdf_chunks)

In [12]:
def store_embeddings(embeddings):
    """Build a FAISS ``IndexFlatL2`` containing the given embedding vectors.

    Args:
        embeddings: Non-empty sequence of equal-length numeric vectors.

    Returns:
        The ``faiss.IndexFlatL2`` index with every vector added, in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or is not a rectangular 2-D
            collection of vectors.
    """
    # FAISS works on C-contiguous float32 matrices; convert once, up front.
    embedding_array = np.ascontiguousarray(embeddings, dtype='float32')
    if embedding_array.ndim != 2 or 0 in embedding_array.shape:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got array of shape {embedding_array.shape}"
        )

    dim = embedding_array.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embedding_array)
    return index
# Index the paragraph embeddings so they can be searched by L2 distance.
index = store_embeddings(embeddings)


In [13]:
# Disabled helper kept as a bare string literal, so none of this code runs.
# NOTE(review): the literal opens with four quote characters, so the stored
# text begins with a stray `'`; delete this cell or restore a real function.
''''def truncate_text(text, max_tokens):
    tokens = text.split()[:max_tokens]
    return ' '.join(tokens)'''

In [13]:
def generate_response(query, index, chunks, temperature=0.5, top_p=1.0,
                      k=5, model="gpt-3.5-turbo", max_tokens=150):
    """Answer ``query`` using the stored chunks closest to it as context.

    The query is embedded with ``get_embeddings``, its nearest neighbours are
    looked up in ``index``, the matching entries of ``chunks`` are joined with
    single spaces, and that context plus the question is sent to the chat model.

    Args:
        query: The user question.
        index: Vector index exposing ``ntotal`` and ``search(x, k=...)``,
            such as the one built by ``store_embeddings``.
        chunks: Texts in the same order as the vectors stored in ``index``.
        temperature: Sampling temperature forwarded to the chat completion.
        top_p: Nucleus-sampling value forwarded to the chat completion.
        k: Maximum number of chunks to retrieve as context.
        model: Chat model name.
        max_tokens: Upper bound on the length of the generated answer.

    Returns:
        str: The assistant's reply with surrounding whitespace removed
        (an empty string when the API returns no text content).
    """
    # Never request more neighbours than exist: FAISS pads missing results
    # with -1, which would otherwise silently select ``chunks[-1]``.
    n_neighbors = min(k, index.ntotal, len(chunks))

    relevant_chunks = []
    if n_neighbors > 0:
        # FAISS expects a 2-D float32 matrix holding one row per query.
        query_matrix = np.asarray([get_embeddings([query])[0]], dtype="float32")
        _, top_indices = index.search(query_matrix, k=n_neighbors)
        relevant_chunks = [
            chunks[i] for i in top_indices[0] if 0 <= i < len(chunks)
        ]
    context = " ".join(relevant_chunks)

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": context + "\n\nQ: " + query},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # ``content`` can be None; fall back to an empty string before stripping.
    return (response.choices[0].message.content or "").strip()

In [14]:
# Ask for a whole-document summary, retrieving context from the index and
# paragraph chunks built in the earlier cells.
sample_query = "Summarize the document"
response = generate_response(sample_query, index, pdf_chunks, temperature=0.5, top_p=0.9)
print("Response:", response)


Response: Generative AI, a transformative technology, mimics human-created data and has diverse applications in content creation, art, NLP, and healthcare. Key technologies like neural networks, GANs, transformers, and VAEs drive its development. However, ethical challenges such as bias, deepfakes, IP rights, and privacy must be addressed. The future of generative AI includes enhanced realism, integration with AR/VR/IoT, human-AI collaboration, and a focus on ethical development. Overall, generative AI has vast potential to reshape industries but requires responsible use and ethical considerations.
