In [1]:
import warnings
warnings.filterwarnings('ignore')
import unstructured
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from filetype import guess
import pandas as pd
import os


# Function to detect document type
def detect_document_type(document_path):
    guess_file = guess(document_path)
    file_type = ""
    image_types = ['jpg', 'jpeg', 'png', 'gif']
    
    if guess_file is not None:
        if(guess_file.extension.lower() == "pdf"):
            file_type = "pdf"
        elif(guess_file.extension.lower() in image_types):
            file_type = "image"
        else:
            file_type = "unknown"
    else:
        file_type = "unknown"
            
    return file_type

# Function to extract document content
def extract_file_content(file_path):
    file_type = detect_document_type(file_path)
    
    if file_type == "pdf":
        loader = UnstructuredFileLoader(file_path)
    elif file_type == "image":
        loader = UnstructuredImageLoader(file_path)
    else:
        raise ValueError("Unsupported file type")
        
    document = loader.load()
    documents_content = '\n'.join(doc.page_content for doc in document)
    
    return documents_content

# Create the text splitter
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len)

# Create embeddings
os.environ["OPENAI_API_KEY"] = "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"
embeddings = OpenAIEmbeddings()

# Create a vector index
def get_doc_search(text_splitter):
    return FAISS.from_texts(text_splitter, embeddings)

# Load the question-answering chain
chain = load_qa_chain(OpenAI(), chain_type="map_rerank", return_intermediate_steps=True)

# Function to chat with a single file and questions
def chat_with_file(file_path, query, text_splitter, chain):
    file_content = extract_file_content(file_path)
    file_splitter = text_splitter.split_text(file_content)
    
    document_search = get_doc_search(file_splitter)
    documents = document_search.similarity_search(query)
    
    results = chain({
        "input_documents": documents,
        "question": query
    }, return_only_outputs=True)
    results = results['intermediate_steps'][0]
    
    return results

# Ask the user how many PDFs they want to process
num_pdfs = int(input("How many PDFs do you want to process? "))

# Collect the paths to the PDFs
pdf_paths = []
for i in range(num_pdfs):
    pdf_path = input(f"Enter the path to PDF {i + 1}: ")
    pdf_paths.append(pdf_path)

# Ask the user how many questions they want to ask (ask only once as you mentioned)
num_questions = int(input("How many questions do you want to ask? "))
questions = []

# Collect the questions from the user
for i in range(num_questions):
    question = input(f"Enter question {i + 1}: ")
    questions.append(question)

result_df = pd.DataFrame(columns=["PDF Path"] + questions)

# Process each PDF and apply the same set of questions
for pdf_path in pdf_paths:
    print(f"Processing PDF: {pdf_path}")
    pdf_results = {"PDF Path": pdf_path}
    for i, question in enumerate(questions):
        results = chat_with_file(pdf_path, question, text_splitter, chain)
        answer = results["answer"]
        confidence_score = results["score"]
        pdf_results[question] = answer, confidence_score
    # Append the results for this PDF to the DataFrame
    result_df = result_df.append(pdf_results, ignore_index=True)

# Export the result DataFrame to an Excel spreadsheet
desktop_path = "/Users/lukegeel/Desktop/"
result_df.to_excel(desktop_path + "research_paper_annotater_result.xlsx", index=False)

Processing PDF: /Users/lukegeel/Desktop/research/Thomas/Preventing undesirable behavior of intelligent machines _ Science.pdf


Created a chunk of size 1288, which is longer than the specified 1000
Created a chunk of size 1339, which is longer than the specified 1000
Created a chunk of size 1579, which is longer than the specified 1000
Created a chunk of size 1288, which is longer than the specified 1000
Created a chunk of size 1339, which is longer than the specified 1000
Created a chunk of size 1579, which is longer than the specified 1000


Processing PDF: /Users/lukegeel/Desktop/research/Miller/Quasi_convexity_of_Certain_Multinomial_sums.pdf


In [3]:
%%bash
pip -q install langchain faiss-cpu unstructured
pip -q install openai tiktoken
pip -q install pytesseract pypdf
pip install filetype
pip install langchain faiss-cpu unstructured
pip install openai tiktoken
pip install pytesseract pypdf
pip install "unstructured[pdf]"
pip install unstructured[pdf]
pip install protobuf==3.20
pip install openpyxl



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip


Collecting pdf2image (from unstructured[pdf])
  Using cached pdf2image-1.16.3-py3-none-any.whl (11 kB)
Collecting pdfminer.six (from unstructured[pdf])
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.6/5.6 MB 473.9 kB/s eta 0:00:00
Collecting unstructured-inference (from unstructured[pdf])
  Downloading unstructured_inference-0.5.25-py3-none-any.whl (51 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.2/51.2 kB 442.3 kB/s eta 0:00:00
Collecting cryptography>=36.0.0 (from pdfminer.six->unstructured[pdf])
  Downloading cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl (2.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.8/2.8 MB 214.4 kB/s eta 0:00:00
Collecting layoutparser[layoutmodels,tesseract] (from unstructured-inference->unstructured[pdf])
  Using cached layoutparser-0.3.4-py3-none-any.whl (19.2 MB)
Collecting python-multipart (from unstructured-inference->unstructured[pdf])
  Using cached python_multipart-0.


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip


Collecting protobuf==3.20
  Downloading protobuf-3.20.0-cp310-cp310-macosx_10_9_universal2.whl (962 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 962.3/962.3 kB 291.6 kB/s eta 0:00:00
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.24.3
    Uninstalling protobuf-4.24.3:
      Successfully uninstalled protobuf-4.24.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
onnx 1.14.1 requires protobuf>=3.20.2, but you have protobuf 3.20.0 which is incompatible.


Successfully installed protobuf-3.20.0



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: pip install --upgrade pip


Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
