In [2]:
# Prints the literal string "ok". Presumably a quick check that the kernel is
# alive -- NOTE(review): confirm, and consider removing from the final notebook.
print("ok")

ok


### Importing packages

In [3]:
# Import the os module to interact with the operating system.
import os

# Import the load_dotenv function from the dotenv module to load environment 
# variables from a .env file.
from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv()

# Get the OpenAI API key from the loaded environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail early with an actionable message. Without this check, a missing key
# makes the os.environ assignment below raise an opaque
# "TypeError: str expected, not NoneType", and an empty key only fails much
# later, at the first API call.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Set the OpenAI API key in the environment variables.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Import the PyPDFLoader class from the langchain.document_loaders module 
# to load PDF documents.
from langchain.document_loaders import PyPDFLoader

# Import the TokenTextSplitter class from the langchain.text_splitter module to 
# split text into chunks.
from langchain.text_splitter import TokenTextSplitter

# Import the Document class from the langchain.docstore.document module to 
# create document objects.
from langchain.docstore.document import Document

# Import the PromptTemplate class from the langchain.prompts module to create
#  prompt templates.
from langchain.prompts import PromptTemplate

# Import the ChatOpenAI class from the langchain.chat_models module to interact 
# with the OpenAI GPT-3.5-turbo model.
from langchain.chat_models import ChatOpenAI

# Import the load_summarize_chain function from the langchain.chains.summarize
#  module to create a summarization chain.
from langchain.chains.summarize import load_summarize_chain

# Import the OpenAIEmbeddings class from the langchain_openai module to 
# generate embeddings using OpenAI.
from langchain_openai import OpenAIEmbeddings

# Import the FAISS class from the langchain.vectorstores module to
#  create a FAISS vector store.
from langchain.vectorstores import FAISS

# Import the RetrievalQA class from the langchain.chains module to create
#  a retrieval-based question-answering chain.
from langchain.chains import RetrievalQA

### Loading a PDF File

In [8]:
# Location of the source PDF, relative to this notebook.
file_path = "../data/SDG.pdf"

# Build a PyPDFLoader bound to that path; it is the object that reads the PDF.
loader = PyPDFLoader(file_path=file_path)

# Read the PDF and keep the loader's result in 'data'.
data = loader.load()

# Show what was loaded.
data

[Document(page_content='', metadata={'source': '../data/SDG.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': '../data/SDG.pdf', 'page': 1}),
 Document(page_content='IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO FACE THE FUTURE.\nAnd what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. \nNot just in some faraway place, but in their own cities and towns and villages.\nThey knew things didn’t have to be this way. They knew we had enough  \nfood to feed the world, but that it wasn’t getting shared. They knew there \nwere medicines for HIV and other diseases, but they cost a lot. They knew  \nthat earthquakes and floods were inevitable, but that the high death  \ntolls were not. \nThey also knew that billions of people worldwide shared their hope for a \nbetter future.\nSo leaders from these countries created a plan called the Sustainable \nDevelopment Goals (SDGs). This set of 17 goals imagines a future just 15 years \noff th

### Extracting and Concatenating PDF Content

In [9]:
# Collect the text of every page loaded from the PDF, skipping pages that have
# no text at all (their page_content is empty or whitespace only).
page_texts = [page.page_content for page in data if page.page_content.strip()]

# Join the pages with a newline between them. Concatenating pages back-to-back
# would fuse the last word of one page with the first word of the next, which
# corrupts the text handed to the tokenizer and the LLM.
question_gen = "\n".join(page_texts)

# Output the concatenated content of all pages.
question_gen

'IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO FACE THE FUTURE.\nAnd what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. \nNot just in some faraway place, but in their own cities and towns and villages.\nThey knew things didn’t have to be this way. They knew we had enough  \nfood to feed the world, but that it wasn’t getting shared. They knew there \nwere medicines for HIV and other diseases, but they cost a lot. They knew  \nthat earthquakes and floods were inevitable, but that the high death  \ntolls were not. \nThey also knew that billions of people worldwide shared their hope for a \nbetter future.\nSo leaders from these countries created a plan called the Sustainable \nDevelopment Goals (SDGs). This set of 17 goals imagines a future just 15 years \noff that would be rid of poverty and hunger, and safe from the worst effects of \nclimate change. It’s an ambitious plan. \nBut there’s ample evidence that we can succeed. In the past 15 

### Splitting Text into Chunks

In [13]:
# Create an instance of the TokenTextSplitter class with specific parameters.
# The 'model_name' parameter specifies the model to use for tokenization,
# in this case, "gpt-3.5-turbo".
# The 'chunk_size' parameter sets the maximum size of each text chunk to 10,000 tokens.
# The 'chunk_overlap' parameter specifies the number of overlapping tokens between
# chunks to ensure context continuity.
splitter_ques_gen = TokenTextSplitter(
    model_name="gpt-3.5-turbo",
    chunk_size=10000,
    chunk_overlap=200,
)

# Use the split_text method of the splitter_ques_gen instance to divide
# the concatenated text into chunks.
# The chunks are stored in the 'chunk_ques_gen' variable.
chunk_ques_gen = splitter_ques_gen.split_text(question_gen)

# Wrap each text chunk in a Document object (with 'page_content' set to the
# chunk text). The downstream steps in this notebook -- split_documents and the
# question-generation chain -- are given this list of Document objects rather
# than plain strings. The list is built once; the original cell built the
# same list twice.
document_ques_gen = [Document(page_content=t) for t in chunk_ques_gen]

document_ques_gen

[Document(page_content='IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO FACE THE FUTURE.\nAnd what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. \nNot just in some faraway place, but in their own cities and towns and villages.\nThey knew things didn’t have to be this way. They knew we had enough  \nfood to feed the world, but that it wasn’t getting shared. They knew there \nwere medicines for HIV and other diseases, but they cost a lot. They knew  \nthat earthquakes and floods were inevitable, but that the high death  \ntolls were not. \nThey also knew that billions of people worldwide shared their hope for a \nbetter future.\nSo leaders from these countries created a plan called the Sustainable \nDevelopment Goals (SDGs). This set of 17 goals imagines a future just 15 years \noff that would be rid of poverty and hunger, and safe from the worst effects of \nclimate change. It’s an ambitious plan. \nBut there’s ample evidence that we can s

### Splitting Documents into Smaller Chunks

In [14]:
# Second, finer-grained splitter used for the answer-generation side.
# Tokenization follows the "gpt-3.5-turbo" model; each chunk is capped at
# 1,000 tokens, and consecutive chunks share 100 tokens so context is not
# lost at chunk boundaries.
splitter_ans_gen = TokenTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
    model_name="gpt-3.5-turbo",
)

# Re-split the question-generation Document objects into the smaller chunks
# and keep the result in 'document_ans_gen'.
document_ans_gen = splitter_ans_gen.split_documents(document_ques_gen)

# Show the resulting chunks.
document_ans_gen

[Document(page_content='IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD \nCAME TOGETHER TO FACE THE FUTURE.\nAnd what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. \nNot just in some faraway place, but in their own cities and towns and villages.\nThey knew things didn’t have to be this way. They knew we had enough  \nfood to feed the world, but that it wasn’t getting shared. They knew there \nwere medicines for HIV and other diseases, but they cost a lot. They knew  \nthat earthquakes and floods were inevitable, but that the high death  \ntolls were not. \nThey also knew that billions of people worldwide shared their hope for a \nbetter future.\nSo leaders from these countries created a plan called the Sustainable \nDevelopment Goals (SDGs). This set of 17 goals imagines a future just 15 years \noff that would be rid of poverty and hunger, and safe from the worst effects of \nclimate change. It’s an ambitious plan. \nBut there’s ample evidence that we can s

### Creating Prompt Templates for Question Generation and Refinement

In [15]:
# Define a prompt template for generating questions based on coding materials and documentation.
# This template includes a placeholder {text} where the actual content will be inserted.
prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

# Create a PromptTemplate object for generating questions, using the defined template.
# The input_variables parameter specifies that the template expects a variable named "text".
PROMPT_QUESTIONS = PromptTemplate(
    template=prompt_template,
    input_variables=["text"]
    )

print("PROMPT_QUESTIONS created:\n")
print(PROMPT_QUESTIONS)
print("\n---------------------------------------------------------------\n")

# Define a prompt template for refining existing practice questions based on additional context.
# This template includes placeholders {existing_answer} and {text} where the original
# questions and new content will be inserted.
# The instruction "refine the existing questions or add new ones (only if necessary)
# with some more context below" is one sentence that wraps across two lines. The
# original had a period after "new ones", which cut the sentence in two and left
# the model with a dangling fragment.
refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

# Create a PromptTemplate object for refining questions, using the defined template.
# The input_variables parameter specifies that the template expects variables named
# "existing_answer" and "text".
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)

print("REFINE_PROMPT_QUESTIONS created:\n")
print(REFINE_PROMPT_QUESTIONS)


PROMPT_QUESTIONS created:

input_variables=['text'] template='\nYou are an expert at creating questions based on coding materials and documentation.\nYour goal is to prepare a coder or programmer for their exam and coding tests.\nYou do this by asking questions about the text below:\n\n------------\n{text}\n------------\n\nCreate questions that will prepare the coders or programmers for their tests.\nMake sure not to lose any important information.\n\nQUESTIONS:\n'

---------------------------------------------------------------

PROMPT_QUESTIONS created:

input_variables=['existing_answer', 'text'] template='\nYou are an expert at creating practice questions based on coding material and documentation.\nYour goal is to help a coder or programmer prepare for a coding test.\nWe have received some practice questions to a certain extent: {existing_answer}.\nWe have the option to refine the existing questions or add new ones.\n(only if necessary) with some more context below.\n------------\

### Setting Up and Running the Question Generation Pipeline

In [20]:
# Chat model used to write the questions: "gpt-3.5-turbo" with a sampling
# temperature of 0.5.
llm_ques_gen_pipeline = ChatOpenAI(temperature=0.5, model="gpt-3.5-turbo")

# Build the question-generation chain via load_summarize_chain, configured as
# a "refine" chain:
#   - question_prompt: the prompt that produces the initial questions
#   - refine_prompt:   the prompt that updates those questions with new context
#   - verbose=True:    log the chain's steps, including the formatted prompts
ques_gen_chain = load_summarize_chain(
    llm=llm_ques_gen_pipeline,
    chain_type="refine",
    question_prompt=PROMPT_QUESTIONS,
    refine_prompt=REFINE_PROMPT_QUESTIONS,
    verbose=True,
)

# Run the chain over the large question-generation chunks; the generated
# questions are kept in 'ques'.
ques = ques_gen_chain.run(document_ques_gen)

# Show the generated questions.
print(ques)



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
IN THE YEAR 2015, LEADERS FROM 193 COUNTRIES OF THE WORLD 
CAME TOGETHER TO FACE THE FUTURE.
And what they saw was daunting. Famines. Drought. Wars. Plagues. Poverty. 
Not just in some faraway place, but in their own cities and towns and villages.
They knew things didn’t have to be this way. They knew we had enough  
food to feed the world, but that it wasn’t getting shared. They knew there 
were medicines for HIV and other diseases, but they cost a lot. They knew  
that earthquakes and floods were inevitable, but that the high death  
tolls were not. 
They also knew that billions of people worldwide shared their hope for a

### Generating Answers for Generated Questions and Saving to a File

In [21]:
# Create an instance of OpenAIEmbeddings to generate embeddings for the documents.
embeddings = OpenAIEmbeddings()

# Create a FAISS vector store from the documents with generated embeddings.
# This will allow for efficient retrieval of relevant documents based on the generated questions.
vector_store = FAISS.from_documents(
    documents=document_ans_gen,
    embedding=embeddings
    )

# Split the generated questions string into a list of individual questions.
# Blank lines are dropped: the model's output may contain empty lines between
# questions, and each of those would otherwise be sent to the LLM as an empty
# "question" (a wasted API call) and written to the answers file.
ques_list = [line.strip() for line in ques.split("\n") if line.strip()]

# Create a RetrievalQA chain for generating answers using the language model and the
# vector store as a retriever.
# The 'chain_type' parameter is set to "stuff", indicating the type of retrieval
# and processing method to use.
answer_generation_chain = RetrievalQA.from_chain_type(
    llm=llm_ques_gen_pipeline,
    chain_type="stuff",
    retriever=vector_store.as_retriever()
)

# Open "answers.txt" once for the whole loop instead of once per question.
# Append mode is kept from the original, so re-running this cell adds to the
# file rather than replacing it. UTF-8 is set explicitly because the source
# text contains non-ASCII characters (e.g. curly apostrophes) and the platform
# default encoding may not be able to represent everything the model returns.
with open("answers.txt", "a", encoding="utf-8") as f:
    # Iterate over each question in the list.
    for question in ques_list:
        # Print the current question.
        print("Question: ", question)

        # Use the answer generation chain to generate an answer for the current question.
        answer = answer_generation_chain.run(question)

        # Print the generated answer.
        print("Answer: ", answer)
        print("--------------------------------------------------\n\n")

        # Save the question and answer to the file.
        f.write("Question: " + question + "\n")
        f.write("Answer: " + answer + "\n")
        f.write("--------------------------------------------------\n\n")

        # Flush after every record so answers already generated are on disk
        # even if a later API call fails or the kernel is interrupted.
        f.flush()

Question:  1. What is the main goal of the Sustainable Development Goals (SDGs) mentioned in the text?
Answer:  The main goal of the Sustainable Development Goals (SDGs) mentioned in the text is to end extreme poverty in all forms by 2030.
--------------------------------------------------


Question:  2. How has the international community made progress in reducing extreme poverty in the past 15 years?
Answer:  In the past 15 years, the international community has made significant progress in reducing extreme poverty by cutting it in half. This achievement demonstrates that with concerted efforts and global cooperation, it is possible to make a positive impact on reducing poverty levels worldwide.
--------------------------------------------------


Question:  3. What are some of the key achievements in the area of health mentioned in the text?
Answer:  Some key achievements in the area of health mentioned in the text include a significant reduction in preventable child deaths and mat