In [20]:
from pdfminer.high_level import extract_text
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_extracter(path):
    """Return the text that pdfminer extracts from the file at ``path``.

    Args:
        path: Location of the PDF, passed unchanged to
            ``pdfminer.high_level.extract_text``.

    Returns:
        Whatever ``extract_text`` returns for that file.
    """
    return extract_text(path)

def text_chunk(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks with a recursive character splitter.

    Args:
        text: The full document text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as the
            target chunk size. Defaults to 1000, the value that used to be
            hard-coded here.
        chunk_overlap: Forwarded to the splitter as the overlap between
            neighbouring chunks. Defaults to 200, the previous hard-coded value.

    Returns:
        The result of ``splitter.split_text(text)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Coarsest separator first: paragraph breaks, then line breaks,
        # sentence ends, spaces, and finally the empty string.
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return splitter.split_text(text)

In [21]:
import spacy

# Load the spaCy pipeline once at module level; text_cleaning and
# lemmatize_text both reuse this shared `nlp` object instead of reloading it.
nlp = spacy.load('en_core_web_sm')

def text_cleaning(chunks):
    """Remove stop words and punctuation from every chunk.

    Each chunk is run through the module-level spaCy pipeline ``nlp``; the
    surviving tokens' text is re-joined with single spaces.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list with one cleaned string per input chunk, in the same order.
    """
    def _strip_noise(chunk):
        # Keep a token only if it is neither a stop word nor punctuation.
        kept = (tok.text for tok in nlp(chunk) if not (tok.is_stop or tok.is_punct))
        return " ".join(kept)

    return [_strip_noise(chunk) for chunk in chunks]

def lemmatize_text(cleaned_chunks):
    """Replace every token in every chunk with its lemma.

    Each chunk is parsed with the module-level spaCy pipeline ``nlp`` and the
    tokens' ``lemma_`` values are re-joined with single spaces.

    Args:
        cleaned_chunks: Iterable of text chunks.

    Returns:
        A list with one lemmatized string per input chunk, in the same order.
    """
    return [
        " ".join(tok.lemma_ for tok in nlp(piece))
        for piece in cleaned_chunks
    ]

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import pickle
import os

def embed_store(lemma_chunks, storage_dir="/Users/pmanthan/Desktop/tomo.ai/faiss_storage"):
    """Embed the chunks, build a FAISS index from them, and persist both to disk.

    Args:
        lemma_chunks: Iterable of text chunks; each one becomes a ``Document``.
        storage_dir: Directory that receives the saved FAISS index and the
            pickled embeddings object. It is created if it does not exist.
            Defaults to the location that was previously hard-coded, so
            existing callers are unaffected.

    Returns:
        The FAISS vector store built from ``lemma_chunks``.
    """
    docs = [Document(page_content=chunk) for chunk in lemma_chunks]
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)  # FAISS = Facebook AI Similarity Search

    os.makedirs(storage_dir, exist_ok=True)
    db.save_local(storage_dir)

    # Derive the pickle path from storage_dir so the index location and the
    # embeddings location cannot drift apart.
    embeddings_path = os.path.join(storage_dir, "faiss_storage.pkl")
    with open(embeddings_path, "wb") as f:
        pickle.dump(embeddings, f)

    return db


def load_store(storage_dir="/Users/pmanthan/Desktop/tomo.ai/faiss_storage"):
    """Load the pickled embeddings object and the saved FAISS index from disk.

    Args:
        storage_dir: Directory containing the saved FAISS index and
            ``faiss_storage.pkl``. Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The FAISS vector store returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If ``faiss_storage.pkl`` is missing from
            ``storage_dir`` (raised by ``open``).
    """
    embeddings_path = os.path.join(storage_dir, "faiss_storage.pkl")

    # SECURITY: pickle.load can execute arbitrary code from the file it reads,
    # and allow_dangerous_deserialization=True opts in to the same risk for
    # the index files. Only point storage_dir at data this project wrote itself.
    with open(embeddings_path, "rb") as f:
        embeddings = pickle.load(f)

    # The file handle is no longer needed once the embeddings are in memory,
    # so the index is loaded after the with-block has closed it.
    return FAISS.load_local(storage_dir, embeddings, allow_dangerous_deserialization=True)

In [58]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq

def rag_pipeline(pdf_path="/Users/pmanthan/Desktop/attentionisalluneed.pdf"):
    """Run the full ingestion pipeline for one PDF and return its vector store.

    Steps, in order: extract text, split into chunks, strip stop words and
    punctuation, lemmatize, embed and persist, then reload from disk.

    Args:
        pdf_path: PDF to ingest. Defaults to the path that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The vector store returned by ``load_store()``.
    """
    extracted_text = text_extracter(pdf_path)
    chunked_text = text_chunk(extracted_text)
    cleaned_text = text_cleaning(chunked_text)
    lemma_text = lemmatize_text(cleaned_text)
    # Called for its side effect of writing the index to disk; the store it
    # returns is not used because the persisted copy is reloaded below.
    embed_store(lemma_text)
    return load_store()


In [59]:
# The Groq API key is read from the GROQ_API_KEY environment variable rather
# than being written into the notebook. A missing variable raises KeyError
# here instead of failing later on the first request.
# NOTE(review): the key that used to be hard-coded on this line has been
# exposed in the notebook source and should be revoked and replaced.
llm = ChatGroq(model="llama3-70b-8192", api_key=os.environ["GROQ_API_KEY"])

In [None]:
# Build (and persist) the vector store, then expose it as a retriever that
# is configured with k=5.
vector_store = rag_pipeline()
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
)

# Conversation memory stored under "chat_history"; output_key="answer" names
# which of the chain's outputs the memory should keep.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

# Retrieval chain used by hybrid_answer; it also returns the source documents.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
)

In [62]:
def hybrid_answer(question):
    """Answer ``question`` from the retrieval chain, falling back to the bare LLM.

    The module-level ``qa_chain`` is asked first. If it reports no source
    documents, the question is sent directly to the module-level ``llm``.

    Args:
        question: The user's question.

    Returns:
        The answer text: ``response["answer"]`` on the normal path, or the
        ``content`` of the LLM's reply on the fallback path.
    """
    response = qa_chain.invoke({"question": question})

    if response["source_documents"]:
        return response["answer"]

    # Nothing was retrieved, so ask the model directly. A chat model's
    # invoke() returns a message object, not a string; return its text so
    # both paths hand the caller the same kind of value.
    fallback_answer = llm.invoke(question)
    return fallback_answer.content

def input_output(question):
    """Print the answer that ``hybrid_answer`` produces for ``question``.

    Args:
        question: The user's question, forwarded unchanged.

    Returns:
        None; the answer is written to standard output.
    """
    print(hybrid_answer(question))


# Demo query: ask the pipeline for a summary and print the answer.
input_output("can u summarize the pdf for me")


Unfortunately, the provided text appears to be a collection of fragments from a research paper, including tables, figures, and references. It does not form a coherent narrative, making it challenging to provide a concise summary.

The text seems to be related to natural language processing, machine translation, and attention mechanisms in neural networks. There are snippets about model architectures, training regimes, and experimental results, but the context is lacking.

If you could provide more context or clarify what specific aspects of the paper you would like me to summarize, I'll do my best to assist you.
