In [5]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain.document_loaders import PyPDFLoader
from langchain import OpenAI, PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
import openai
import PyPDF2
import glob
import os
from dotenv import load_dotenv

from typing import Optional

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
    create_openai_fn_runnable,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Configuration: read secrets from a local .env file and configure the openai client.
# Equivalent IPython-magic form, kept for reference:
# %load_ext dotenv
# %dotenv ./.env
# NOTE: load_dotenv() returns a bool (whether any variable was set), not a
# configuration object, so `config` is only a success flag.
config = load_dotenv('.env')

# os.getenv() returns None when the variable is unset; nothing in this cell
# checks for that.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY
# Name of the OpenAI chat model.
model = "gpt-3.5-turbo"

In [6]:
def extract_pdf_text(filename):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # Use a context manager so the handle is closed even if parsing raises;
    # previously the file was opened and never closed.
    with open(filename, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe should a page yield no text at all.
        return ''.join(page.extract_text() or '' for page in reader.pages)

In [10]:
def extract_pdf_text_from_dir(dir):
    """Concatenate the extracted text of every ``*.pdf`` file directly inside a directory.

    Args:
        dir: Directory to scan (not recursive). The name shadows the builtin
            ``dir`` but is kept so existing keyword callers keep working.

    Returns:
        The text of all matching PDFs joined with no separator, or ``''``
        when no file matches.
    """
    # glob.glob() returns matches in arbitrary, filesystem-dependent order;
    # sorting makes the combined text reproducible from run to run.
    pdf_paths = sorted(glob.glob(os.path.join(dir, '*.pdf')))
    return ''.join(extract_pdf_text(path) for path in pdf_paths)

In [13]:
# Extract the text of every PDF in the input directory and save it as one file.
# NOTE(review): the input path is under ./data while the output path is under
# ../data -- confirm which working directory this notebook is meant to run from.
text = extract_pdf_text_from_dir(os.path.join('./data', 'pdfs'))
# Write as UTF-8 explicitly: the extracted text contains non-ASCII characters
# (e.g. ligatures) that a platform-default encoding may fail to encode.
with open('../data/texts/exams.txt', 'w', encoding='utf-8') as f:
    f.write(text)


In [23]:
# Extract the text of the single exam PDF into the global `text`.
text = extract_pdf_text('../data/pdfs/exam.pdf')

In [24]:
# Preview the first 100 characters to sanity-check the extraction.
print(text[:100])

CS230: Deep Learning
Fall Quarter 2018
Stanford University
Midterm Examination
180 minutes
Problem F


In [25]:
# gpt-3.5-turbo is a chat model, and create_structured_output_chain relies on
# OpenAI function calling, so use the ChatOpenAI wrapper instead of the
# completion-style OpenAI class. The model name comes from the config cell.
llm = ChatOpenAI(temperature=0.2, openai_api_key=OPENAI_API_KEY, model=model)

In [26]:
class Question(BaseModel):
    """ An Exam question """
    # The docstring and the Field descriptions are left byte-identical on
    # purpose: pydantic copies them into the generated JSON schema.
    question_id: int = Field(..., description="The question id")
    # Spelling fixed (was `qustion_type`) so the schema key matches the
    # `question_type` name used by the prompt tip and the `schema` dict.
    question_type: str = Field(..., description="The type of question, e.g. multiple choice, short answer, long answer")
    question: str = Field(..., description="The question text")
    # Only multiple-choice questions have choices, so default to an empty list
    # rather than requiring the field (the `schema` dict does not require it).
    choices: list[str] = Field(default_factory=list, description="The choices for the question")
    answer: str = Field(..., description="The answer to the question")
    # Optional, matching the `required` list of the `schema` dict.
    topic: Optional[str] = Field(None, description="The topic of the question")

    @property
    def qustion_type(self) -> str:
        # Read-only alias so code that still reads the old misspelled
        # attribute keeps working.
        return self.question_type

In [27]:
class Questions(BaseModel):
    """ A collection of questions """
    # Wrapper model holding a list of Question items; it is the output schema
    # passed to create_structured_output_chain in extract_question_answer_data.
    # The docstring is left untouched because pydantic copies it into the
    # generated schema description.
    questions: list[Question] = Field(..., description="The questions")

In [40]:
# Hand-written JSON Schema describing one extracted exam question.
# NOTE(review): question_id is a string here but an int in the Question
# model -- confirm which type is intended.
schema = {
    # Declare the object type explicitly so `properties` and `required`
    # are unambiguous to any consumer of the schema.
    "type": "object",
    "properties": {
        "question_id": {"type": "string"},
        "question": {"type": "string"},
        # Element types of an array are declared with `items`; the former
        # `option` key is not a JSON Schema keyword and was ignored.
        "choices": {"type": "array", "items": {"type": "string"}},
        "answer": {"type": "string"},
        "question_type": {"type": "string"},
        "topic": {"type": "string"},
    },
    "required": ["question_id", "question", "answer", "question_type"]
}

In [30]:
def extract_question_answer_data(doc, llm):
    """Extract structured question/answer data from exam text with an LLM.

    Args:
        doc: The exam text. A plain string is used as-is; an object exposing
            a ``page_content`` attribute (such as the documents produced by
            ``PyPDFLoader``) contributes only that attribute, so its repr
            and metadata do not end up in the prompt.
        llm: Language model handed to ``create_structured_output_chain``.

    Returns:
        Whatever ``chain.run`` returns for the ``Questions`` output schema.
    """
    # Accept both raw strings and document-like objects.
    exam_text = getattr(doc, "page_content", doc)

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a world class algorithm for extracting questions and answers from exams, and putting them in a structured format.",
            ),
            (
                "human",
                "Use the following exam text to extract the questions and answers. \n\n {exam_text}",
            ),
            (
                "human", "Tip:  Make sure to include the question id, question, answer, and question type in the output. Also make sure to use the correct format"
            )
        ]
    )
    chain = create_structured_output_chain(Questions, llm, prompt, verbose=True)
    # The prompt has a single input variable ({exam_text}), so the text can be
    # passed positionally.
    return chain.run(exam_text)

In [38]:
# response = extract_question_answer_data(text, llm)

In [None]:
# print(response)

In [48]:
# Load the exam PDF with PyPDFLoader and split it into documents.
# NOTE(review): this path is relative to the current directory, while other
# cells read '../data/pdfs/exam.pdf' -- confirm the intended working directory.
loader = PyPDFLoader('data/pdfs/exam.pdf')
docs = loader.load_and_split()
print(len(docs))
# Inspect one document; assumes the split produced at least four of them.
print(docs[3])

25
page_content='CS230\nSolution: (iii)\n(e)(1 point) Consider the model deﬁned in question (d) with parameters initialized with\nzeros.W[1]denotes the weight matrix of the ﬁrst layer. You forward propagate a batch\nof examples, and then backpropagate the gradients and update the parameters. Which\nof the following statements is true?\n(i) Entries of W[1]may be positive or negative\n(ii) Entries of W[1]are all negative\n(iii) Entries of W[1]are all positive\n(iv) Entries of W[1]are all zeros\nSolution: (i)\n(f)(2 points) Consider the layers landl−1 in a fully connected neural network:\nThe forward propagation equations for these layers are:\nz[l−1]=W[l−1]a[l−2]+b[l−1]\na[l−1]=g[l−1](z[l−1])\nz[l]=W[l]a[l−1]+b[l]\na[l]=g[l](z[l])\nWhich of the following propositions is true? Xavier initialization ensures that :\n(i)Var(W[l−1]) is the same as Var(W[l]).\n(ii)Var(b[l]) is the same as Var(b[l−1]).\n(iii)Var(a[l]) is the same as Var(a[l−1]), at the end of training.\n(iv)Var(a[l]) is the sam

In [49]:
# text = docs[3].page_content
# text = """
# Question 3 (Loss Functions, 17 points + 3 bonus points)
# Equipped with cutting-edge Deep Learning knowledge, you are working with a biology lab.
# Specifically, you're asked to build a classifier that predicts the animal type from a given
# image into four ( ny= 4) classes: dog, cat, iguana, mouse . There's always exactly one
# animal per image. You decide to use cross-entropy loss to train your network. Recall that
# the cross-entropy (CE) loss for a single example is defined as follows:
# LCE(^y;y) =nyP
# i=1yilog ^yi
# where ^y= (^y1;:::; ^yny)>represents the predicted probability distribution over the classes
# andy= (y1;:::;yny)>is the ground truth vector, which is zero everywhere except for the
# correct class (e.g. y= (1;0;0;0)>fordog, andy= (0;0;1;0)>foriguana ).
# (a)(2 points) Suppose you're given an example image of an iguana. If the model correctly
# predicts the resulting probability distribution as ^ y= (0:25;0:25;0:3;0:2)>, what is the
# value of the cross-entropy loss? You can give an answer in terms of logarithms.
# Solution:log 0:3
# (b)(2 points) After some training, the model now incorrectly predicts mouse with distri-
# butionh0:0;0:0;0:4;0:6ifor the same image. What is the new value of the cross-entropy
# loss for this example?
# Solution:log 0:4
# (c)(2 points) Suprisingly, the model achieves lower loss for a misprediction than for a
# correct prediction. Explain what implementation choices led to this phenomenon.
# Solution: This is because our objective is to minimize CE-loss, rather than to
# directly maximize accuracy. While CE-loss is a reasonable proxy to accuracy, there
# is no guarantee that a lower CE loss will lead to higher accuracy.
# (d)(2 points) Given your observation from question (c), you decide to train your neural
# network with the accuracy as the objective instead of the cross-entropy loss. Is this a
# good idea? Give one reason. Note that the accuracy of a model is defined as
# Accuracy =(Number of correctly-classified examples)
# (Total number of examples)
# """


In [50]:
# prompt = PromptTemplate.from_template(
#     "Read the following exam paper and extract the information requested in the schema for each question \n\n{exam_text}\n\n\n\n"
# )

In [51]:
# chain = create_structured_output(schema, llm, prompt)
# input = prompt.format_prompt(exam_text=text)

In [52]:
# output = runnable.invoke(input)

In [53]:
# questions_json = extract_question_answer_data(docs[3], schema, llm)
# with open('data/exams.json', 'w') as f:
#     f.write(questions_json)

In [55]:
# Sample JSON Schema describing a person (name, age, favourite food).
# NOTE(review): none of these fields relate to exam questions -- presumably a
# leftover example; confirm whether it is still needed.
json_schema = {
    "title": "Person",
    "description": "Identifying information about a person.",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "description": "The person's name",
            "type": "string",
        },
        "age": {
            "title": "Age",
            "description": "The person's age",
            "type": "integer",
        },
        "fav_food": {
            "title": "Fav Food",
            "description": "The person's favorite food",
            "type": "string",
        },
    },
    # fav_food is the only optional property.
    "required": ["name", "age"],
}

In [36]:
# prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are a world class algorithm for extracting information in structured formats.",
#         ),
#         (
#             "human",
#             "Use the given format to extract information from the following input: {input}",
#         ),
#         ("human", "Tip: Make sure to answer in the correct format"),
#     ]
# )

# runnable = create_structured_output_chain_runnable(json_schema, llm, prompt)
# runnable.invoke({"input": "Sally is 13"})

In [41]:
def aiprocessor(page_no, text, json_schema, llm=llm):
    """Ask an OpenAI chat model to turn one page of raw PDF text into structured JSON.

    The reply is streamed and echoed to stdout as it arrives.

    Args:
        page_no: Page number; used only in the progress and error messages.
        text: Raw text extracted from the page.
        json_schema: Schema the model should follow. A string is embedded
            as-is; anything else is serialized with ``json.dumps``.
        llm: Model to call. Either a model-name string or an object exposing
            a ``model_name`` attribute (assumed to be how the LangChain
            wrappers expose it -- TODO confirm). Defaults to the module-level
            ``llm``.

    Returns:
        The text streamed back by the model. If the API call fails, the error
        is printed and whatever was received before the failure (possibly
        ``''``) is returned, so callers always get a string.
    """
    import json  # local import: the notebook's import cell does not import json

    print(f"\n\n..AI processing page {page_no}")

    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        schema_text = json.dumps(json_schema, indent=2, default=str)

    # The request's "model" field must be a model name, not a wrapper object.
    model_name = llm if isinstance(llm, str) else getattr(llm, "model_name", llm)

    messages = [
        {
            "role": "system",
            "content": """You are a world class algorithm for extracting questions and answers from exams, and putting them in a structured format.
- User input is messy raw text extracted from a PDF page by PyPDF2.
- The goal is to identify each question and extract the details cleanly as json.
- Make sure you get every question and answer, and use the correct format."""
        },
        {
            "role": "user",
            # f-string: previously the literal text "{json_schema}" was sent
            # and the schema itself never reached the model.
            "content": f"""use the following schema to extract the questions and answers. \n\n {schema_text}"""
        },
        {
            "role": "user",
            # f-string: previously the literal text "{text}" was sent instead
            # of the page contents.
            "content": f"""raw pdf text; extract and format tables: {text}"""
        }
    ]

    api_params = {"model": model_name, "messages": messages, "stream": True}
    reply = ""
    try:
        api_response = openai.ChatCompletion.create(**api_params)
        for chunk in api_response:
            choice = chunk['choices'][0]
            if choice['finish_reason']:
                continue
            # Some chunks (e.g. a role-only first chunk) carry no 'content'.
            word = choice['delta'].get('content') or ""
            reply += word
            print(word, end="")
    except Exception as err:
        error_message = f"API Error page {page_no}: {str(err)}"
        print(error_message)
    # Returned on the error path too, so callers never receive None.
    return reply

In [32]:
# Run each page of the exam PDF through aiprocessor, dumping the raw and the
# AI-processed text of every page to disk, then save the combined AI output.
ai_processed_text_list = []

pdf_path = "../data/pdfs/exam.pdf"
# The handle keeps the name `pdf_file` because `pdf_file.name` is read again
# after the `with` block (here and in later cells) to build output file names.
with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Iterate through each page and extract text
    for page_num, page in enumerate(pdf_reader.pages):
        page_text = page.extract_text()

        # Pages with 20 characters or fewer are skipped.
        if len(page_text) > 20:
            # Dump unprocessed pages if desired
            page_text_file = pdf_file.name + "-extractedpage" + str(page_num) + ".txt"
            with open(page_text_file, 'w', encoding='utf-8') as output_file:
                output_file.write(page_text)

            # Process with AI. aiprocessor requires a schema argument (it was
            # missing from this call, which raised TypeError); the
            # exam-question `schema` is passed, together with the model *name*
            # because the request's "model" field is filled from `llm`.
            ai_processed_text = aiprocessor(page_num, page_text, schema, llm=model)
            if ai_processed_text is None:
                # Guard in case a failed call is reported as None.
                continue

            # Dump AI pages if desired (previously this wrote the raw
            # page_text again instead of the AI output).
            page_text_file = pdf_file.name + "-AIpage" + str(page_num) + ".txt"
            with open(page_text_file, 'w', encoding='utf-8') as output_file:
                output_file.write(ai_processed_text)

            # Append the AI-processed text to the list
            ai_processed_text_list.append(ai_processed_text)

# Combine all AI-processed text into a single string
combined_text = "\n".join(ai_processed_text_list)

# Define the output text file name (same root name as the PDF)
output_text_file = pdf_file.name + "-AI-all.txt"

# Save the combined text into a .txt file
with open(output_text_file, 'w', encoding='utf-8') as output_file:
    output_file.write(combined_text)

print(f"AI-processed text saved to {output_text_file}")



..AI processing page 0
{
  "question": "Problem 1",
  "points": 10,
  "type": "Multiple Choice"
}

..AI processing page 1
[
  {
    "question_number": "1",
    "question_type": "Multiple Choice Questions",
    "question_text": "Which of the following techniques does NOT prevent a model from overfitting?",
    "choices": [
      "Data augmentation",
      "Dropout",
      "Early stopping",
      "None of the above"
    ],
    "correct_answer": "None of the above",
    "score": "10"
  },
  {
    "question_number": "2",
    "question_type": "Multiple Choice Questions",
    "question_text": "Consider the following data sets: Xtrain = (x(1);x(2);:::;x(mtrain)); Ytrain = (y(1);y(2);:::;y(mtrain)) Xtest = (x(1);x(2);:::;x(mtest)); Ytest = (y(1);y(2);:::;y(mtest)) You want to normalize your data before training your model. Which of the following propositions are true? (Circle all that apply.)",
    "choices": [
      "The normalizing mean and variance computed on the training set, and used t

KeyboardInterrupt: 

In [33]:
# Rebuild the combined text from whatever pages have been collected so far.
# None entries are skipped so that join() cannot raise TypeError.
combined_text = "\n".join(page for page in ai_processed_text_list if page is not None)


In [34]:
# Write the combined AI output next to the source PDF as "<pdf path>-AI-all.txt".
# Relies on `pdf_file` still being bound to the file object opened by the
# page-processing cell; only its `.name` attribute is read here.
output_text_file = pdf_file.name + "-AI-all.txt"

# Save the combined text into a .txt file
with open(output_text_file, 'w', encoding='utf-8') as output_file:
    output_file.write(combined_text)

print(f"AI-processed text saved to {output_text_file}")

AI-processed text saved to ../data/pdfs/exam.pdf-AI-all.txt
