In [1]:
from IPython.display import clear_output

In [2]:
!pip install pandas urllib3 feedparser PyPDF2
clear_output()

In [3]:
# Import libraries
import pandas as pd
import urllib.request as libreq
import feedparser

# Display setting to show more characters in column
pd.set_option('display.max_colwidth', None)

# Accessing arXiv articles using search query, render as ParserDictionary, create PDF link for each article
https://info.arxiv.org/help/api/examples/python_arXiv_parsing_example.txt

## Search query and Render as Parser Dictionary

In [4]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

# Search parameters
# to search for articles that contain at least one of multiple keywords (using "OR" logic) in all fields 
search_query = "all:llms"
# search_query = 'all:electron' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 5

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# perform a GET request using the base_url and query
with libreq.urlopen(base_url+query) as url:
      response = url.read()

# parse the response using feedparser
feed = feedparser.parse(response)

In [5]:
# Create a DataFrame from the list of dictionaries
entries_df = pd.DataFrame(feed.entries)

# Extract the arxiv_id from the URL
def extract_arxivId_from_url(link):
    parts = link.split('/')
    return '/'.join(parts[4:])

# Extract and add the PDF URL
entries_df['arxiv_id'] = entries_df['link'].apply(extract_arxivId_from_url)
entries_df['pdf_url'] = entries_df['arxiv_id'].apply(lambda arxiv_id: f'http://arxiv.org/pdf/{arxiv_id}.pdf')

## Get PDF read

In [6]:
import requests
from PyPDF2 import PdfReader
from io import BytesIO

In [7]:
def get_pdf_text(pdf_url):
    text = ""
    # Download the PDF content from the URL
    response = requests.get(pdf_url)
    if response.status_code == 200:
        # Create a BytesIO object from the response content
        pdf_data = BytesIO(response.content)
        pdf_reader = PdfReader(pdf_data)
        for page in pdf_reader.pages:
            text += page.extract_text()
        return text
    else:
        print(f"Failed to retrieve PDF from URL: {pdf_url}")
        return None

entries_df['pdf_content'] = entries_df.pdf_url.apply(get_pdf_text)

# Creating RAG Model

### -- Following this
- https://github.com/Just-A-Dash/RAGwithLLAMAv2 
- https://github.com/alejandro-ao/ask-multiple-pdfs/blob/main/app.py


In [8]:
#install langchain , openai
!pip install langchain  openai

clear_output()

In [9]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

 You should use double backslashes or a raw string (with an 'r' prefix) for file paths in Windows

In [10]:
import os
filePath = r"C:\Users\swornm\Documents\Sworna Vidhya\Gen AI\Code\genai-openaikey.txt"
with open(filePath,"r") as f:
  os.environ["OPENAI_API_KEY"] = " ".join(f.readlines())

## Following for text splitting
- https://medium.com/@gustavo-espindola/%EF%B8%8F-%EF%B8%8F-text-splitters-smart-text-division-with-langchain-1fa8ac09eb3c


### Define RecursiveCharacterTextSplitter

Following for text splitting - create_document:
- https://github.com/gustavoespindola/chunkerizer/blob/main/chunkerizer.py

- Explanation between using split_text & create_documents from text_splitter
https://www.reddit.com/r/LangChain/comments/137pv5q/when_to_use_split_text_vs_create_documents/

## Storing Chunks, Text, Tokens-length, Characters-length

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 100,
    chunk_overlap  = 0
)

# Create a function to process each row of the DataFrame
def process_row(row):
    file_content = row['pdf_content']
    chunks = text_splitter.create_documents([file_content])
    
    # Create new columns with the required information
    row['chunks'] = chunks
    
    return row

# Apply the function to each row of the DataFrame
entries_df = entries_df.apply(process_row, axis=1)

## Create Embeddings and Vectorstore

In [12]:
!pip install faiss-cpu sentence-transformers

clear_output()

In [13]:
# Define a function to compute the vectorstore for a given text
def compute_vectorstore(text, embeddings):
    # Your code to compute the vectorstore using FAISS
    # You may need to adapt this part based on your specific implementation
    # The result should be a vectorstore for the given text
    # For example, you can use the following as a placeholder:
    vectorstore = FAISS.from_documents(text, embeddings)
    return vectorstore

# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})

# Apply the function to each row of the DataFrame and store the vectorstore in a new column
entries_df['Vectorstore'] = entries_df.apply(lambda row: compute_vectorstore(row['chunks'], embeddings), axis=1)

  from .autonotebook import tqdm as notebook_tqdm


# QA with LLamma2 from PDF

## Question Answering using CTransformers as LLM and RetrievalQA
### Following for creating LLM
- https://github.com/Just-A-Dash/RAGwithLLAMAv2/blob/main/src/llm.py

In [14]:
from langchain.llms import CTransformers

In [15]:
# Local CTransformers model
llm = CTransformers(model="TheBloke/Llama-2-7B-Chat-GGML", model_file='llama-2-7b-chat.ggmlv3.q5_K_M.bin', model_type='llama', config={'max_new_tokens': 4000, 'temperature': 0.2})

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 970.45it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 979.98it/s]


#### Following to overcome the above runtime error and load llama model 
- https://github.com/marella/ctransformers - Documentation on ctransformers
- https://stackoverflow.com/a/77015576

#### Download GGML - Quantised Text Generation LLamma2 model with model filename from HF

### Following 
- https://www.youtube.com/watch?v=lbFmceo4D5E


### Following for creating pdf_query function
- https://medium.com/@ahmed.mohiuddin.architecture/using-ai-to-chat-with-your-documents-leveraging-langchain-faiss-and-openai-3281acfcc4e9

In [16]:
from langchain.chains import RetrievalQA

In [17]:
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

def model_memory():
    # Adding history to the model.
    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer,\
    just say that you don't know, don't try to make up an answer.

    {context}

    {history}
    Question: {question}
    Helpful Answer:"""

    prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    memory = ConversationBufferMemory(input_key="question", memory_key="history")

    return prompt, memory

prompt, memory = model_memory()

In [18]:
persisted_vectorstore = entries_df['Vectorstore'][0]

# Use RetrievalQA chain for orchestration
def query_pdf(query):
    qa = RetrievalQA.from_chain_type(llm=llm, 
                                 chain_type="stuff", 
                                 retriever=persisted_vectorstore.as_retriever(
                                     search_kwargs={'k': 2},
                                     return_source_documents=True,
                                     chain_type_kwargs={"prompt": prompt, "memory": memory}
                                    ))
    
    
    result = qa.run(query)
    return result

In [19]:
list_of_questions = [
    "What is the objective of the study",
    "What are ths scope of the study",
    "What is the Main Aim of the proposed Research?",
    "What question or problem is author addressing?",
    "What are the key theories, models and methods?",
    "What is the evaluation metric used in the paper",
    "Is it an established approach/innovative approach",
    "What are the results and conclusions of the study?",
    "What are the key insights and arguments?",
    "Does it confirm, add to, or challenge established knowledge?",
    "What are the strengths of the research?",
    "What is the limitations of this study",
    "What are the future recommendations of the study",
    "What are the weakness of the research?"
]

qa_ans = []

# Use a lambda function to call query_pdf for each question and append the result to qa_ans
qa_ans.extend(map(lambda question: query_pdf(question), list_of_questions))

print(qa_ans)

[" The objective of the study is to develop a comprehensive and systematic approach to organize, classify, and understand these LLMs.\n\nI don't know the answer to the question at the end of the passage.", ' The scope of this study is to explore the potential of LLMs in various contexts, including but not limited to:\n\n* Training and sharing LLMs among smaller research groups and individuals.\n* Organizing LLMs through Hugging Face.\n* Few attempts have been made to organize these LLMs, perhaps due to the immense number of models available.', ' The main aim of the proposed research is to keep pace with developments in the field of Large Language Models (LLMs) and to encourage more systematic and informed engagement with these models.', ' The author is addressing the problem of how to keep up with the rapid pace of technological advancements in the field of artificial intelligence (AI).', ' The key theories, models, and methods in AI research include deep learning, reinforcement learni

In [20]:
qa_ans

[" The objective of the study is to develop a comprehensive and systematic approach to organize, classify, and understand these LLMs.\n\nI don't know the answer to the question at the end of the passage.",
 ' The scope of this study is to explore the potential of LLMs in various contexts, including but not limited to:\n\n* Training and sharing LLMs among smaller research groups and individuals.\n* Organizing LLMs through Hugging Face.\n* Few attempts have been made to organize these LLMs, perhaps due to the immense number of models available.',
 ' The main aim of the proposed research is to keep pace with developments in the field of Large Language Models (LLMs) and to encourage more systematic and informed engagement with these models.',
 ' The author is addressing the problem of how to keep up with the rapid pace of technological advancements in the field of artificial intelligence (AI).',
 ' The key theories, models, and methods in AI research include deep learning, reinforcement le

In [21]:
# Strip each value and join them with space
raw_text = ' '.join(map(str.strip, qa_ans))
print(f'Lenght of raw_text : {len(raw_text)}')
raw_text

Lenght of raw_text : 3797


"The objective of the study is to develop a comprehensive and systematic approach to organize, classify, and understand these LLMs.\n\nI don't know the answer to the question at the end of the passage. The scope of this study is to explore the potential of LLMs in various contexts, including but not limited to:\n\n* Training and sharing LLMs among smaller research groups and individuals.\n* Organizing LLMs through Hugging Face.\n* Few attempts have been made to organize these LLMs, perhaps due to the immense number of models available. The main aim of the proposed research is to keep pace with developments in the field of Large Language Models (LLMs) and to encourage more systematic and informed engagement with these models. The author is addressing the problem of how to keep up with the rapid pace of technological advancements in the field of artificial intelligence (AI). The key theories, models, and methods in AI research include deep learning, reinforcement learning, natural langua

# Creating Literature Review (Summarisation)

In [22]:
def split_text(text):
    # Get segments from txt by splitting on .
    segments =  text.split('.')

    # Put the . back in
    segments = [segment + '.' for segment in segments]

    # Further split by comma
    segments = [segment.split(',') for segment in segments]

    # Flatten
    segments = [item for sublist in segments for item in sublist]

    # Further split by comma
    segments = [segment.split('\n') for segment in segments]

    # Flatten
    segments = [item for sublist in segments for item in sublist]
    
    return segments

In [23]:
# function splits the raw text into a list of values with specific conditions
pdf_text = split_text(raw_text)

# Remove empty strings from pdf_text
pdf_text_list = list(filter(None, pdf_text))

pdf_text = ' '.join(map(str.strip, pdf_text_list))
print(len(pdf_text_list))
pdf_text

66


"The objective of the study is to develop a comprehensive and systematic approach to organize classify and understand these LLMs. I don't know the answer to the question at the end of the passage. The scope of this study is to explore the potential of LLMs in various contexts including but not limited to: * Training and sharing LLMs among smaller research groups and individuals. * Organizing LLMs through Hugging Face. * Few attempts have been made to organize these LLMs perhaps due to the immense number of models available. The main aim of the proposed research is to keep pace with developments in the field of Large Language Models (LLMs) and to encourage more systematic and informed engagement with these models. The author is addressing the problem of how to keep up with the rapid pace of technological advancements in the field of artificial intelligence (AI). The key theories models and methods in AI research include deep learning reinforcement learning natural language processing co

In [24]:
!pip -q install tiktoken

In [25]:
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter

model_name = "gpt-3.5-turbo"

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    model_name=model_name
)

texts = text_splitter.split_text(pdf_text)

docs = [Document(page_content=t) for t in texts]
print(len(docs))
     

1


## Following the blog
- https://medium.com/@Ahmed-Haytham/google-palm2-api-with-langcahin-%EF%B8%8F-c55abd1d1651

In [26]:
!pip install -q google-generativeai

## Using PALM 2 model directly from Google Generative AI

Followed links
- https://developers.generativeai.google/examples/text_calculator?authuser=1

In [32]:
import pprint
import google.generativeai as palm

In [33]:
api_key = 'AIzaSyAxAG0Sae73vYMHCOwIhRdj-1QJagrtQio' # get this free api key from https://makersuite.google.com/

palm.configure(api_key=api_key)

In [34]:
models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]
model = models[0].name
print(model)

models/text-bison-001


In [37]:
prompt =  """Summarize this text: 
SUMMARIZED TEXT MUST BE WITH MAXIMUM 600 WORDS AND NOT MORE THAN THAT. GIVE IT AS A PARAGRAPH
The content should start with In this study, the author

{pdf_text}

Summary:"""

completion = palm.generate_text(
    model=model,
    prompt=prompt,
    temperature=0,
    # The maximum length of the response
    max_output_tokens=1024,
)

In [38]:
literature_review = completion.result
print(len(literature_review))
literature_review

770


'In this study, the author investigated the effects of mindfulness meditation on pain perception and tolerance in patients with chronic pain. The author conducted a randomized controlled trial with 60 participants who were randomly assigned to either an eight-week mindfulness meditation intervention or a control group. The results showed that participants in the mindfulness meditation group had significantly reduced pain intensity, pain unpleasantness, and pain-related anxiety compared to the control group. Additionally, participants in the mindfulness meditation group had significantly increased pain tolerance. The author concluded that mindfulness meditation is an effective intervention for reducing pain perception and tolerance in patients with chronic pain.'