In [240]:
import os
import openai

# Read the OpenAI API key from the environment instead of embedding it in the
# notebook, so the secret never ends up in saved cells or version control.
# Export OPENAI_API_KEY before starting the kernel.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    # Fail early with a clear message rather than on the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the kernel."
    )
openai.api_key = _api_key

from llama_index import (
    load_index_from_storage, SimpleDirectoryReader, StorageContext, 
    ServiceContext, GPTVectorStoreIndex, LLMPredictor, PromptHelper
)
from langchain import OpenAI
import os
from tqdm import tqdm
import shutil

In [241]:
def create_index(file_path, file='doc.pdf'):
    """Build and persist a vector index for a single document.

    The document at ``file_path`` is copied into a freshly emptied
    ``tmp_index`` working directory under the name ``file``, loaded with
    ``SimpleDirectoryReader``, indexed, and the index is persisted to
    ``tmp_index/index``.

    Args:
        file_path: Path of the source document to index.
        file: File name given to the copy placed inside ``tmp_index``.

    Returns:
        The ``GPTVectorStoreIndex`` built from the document.
    """
    work_dir = 'tmp_index'

    # Start from an empty working directory so files from a previous call
    # are never picked up by the directory reader below.
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    os.mkdir(work_dir)

    # Stream the copy instead of reading the whole file into memory.
    shutil.copyfile(file_path, os.path.join(work_dir, file))

    # set maximum input size
    max_input_size = 4096
    # set number of output tokens
    num_outputs = 256
    # set maximum chunk overlap
    chunk_overlap_ratio = 0.1
    # set chunk size limit
    chunk_size_limit = 600

    # define LLM
    llm_predictor = LLMPredictor(
        llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=num_outputs)
    )
    prompt_helper = PromptHelper(
        max_input_size, num_outputs, chunk_overlap_ratio, chunk_size_limit=chunk_size_limit
    )

    # LLM and prompt settings are handed to the index through a ServiceContext,
    # the documented configuration route, rather than as constructor kwargs.
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    documents = SimpleDirectoryReader(work_dir).load_data()

    # from_documents is the documented entry point for building an index from
    # loaded documents.
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    index.storage_context.persist(os.path.join(work_dir, 'index'))

    return index

In [242]:
# Prompt passed to the query engine for each indexed PDF. It asks for the top 10
# issues in a stock assessment, each with Specification / Evidence / Resolution /
# Impact entries, under a heading that names the stock.
base_thought = """
    I'd like you to help me identify specific issues in the present assessment and how they could be resolved using better
    instrumentation and data. Include such things as requested updates to reduce bias, reduce uncertainty, or improve accuracy. 
    If there are specific unknowns or missing features that currently hamper the quality or capabilities of the assessment include
    those as well. Also determine if there are outcomes that are just impossible to determine with the present data
    that the authors believe would be beneficial for management of the fishery. Please format the document as follows:

    Instrumentation to improve the quality of the assessment of the <stock name> stock

    <Issue>
    - Specification: <To remove any uncertainty about what the issue is>
    - Evidence: <The data that supports this actually being an issue>
    - Resolution: <A resolution in the form of new data or improvement to current data - be specific>
    - Impact: <The impact that this resolution would have on the assessment outcomes>

    Please give me the top 10 issues identified.
"""

# Generate one text report per PDF found in chatbot_data.
data_dir = 'chatbot_data'
reports_dir = 'reports'

# Create the output directory up front so the first report write cannot fail.
os.makedirs(reports_dir, exist_ok=True)

# Filter before handing the list to tqdm so the progress bar counts only PDFs;
# sort because os.listdir returns entries in arbitrary order.
pdf_names = sorted(
    name for name in os.listdir(data_dir) if name.lower().endswith('.pdf')
)

for path in tqdm(pdf_names):
    file_path = os.path.join(data_dir, path)

    # Rebuilds tmp_index from scratch and persists the index to tmp_index/index.
    create_index(file_path)

    # NOTE(review): the index returned by create_index is discarded and the
    # persisted copy is reloaded with a separate service context (chunk_size=512);
    # confirm this is intentional.
    service_context = ServiceContext.from_defaults(chunk_size=512)
    storage_context = StorageContext.from_defaults(persist_dir="tmp_index/index")
    # Despite the name, `index` holds the query engine derived from the loaded index.
    index = load_index_from_storage(
        storage_context, service_context=service_context
    ).as_query_engine()
    index_response = index.query(base_thought).response

    # splitext keeps dots inside the stem (e.g. "stock.2020.pdf" -> "stock.2020"),
    # so reports for similarly named PDFs do not overwrite each other, and it
    # does not depend on the platform's path separator.
    report_path = os.path.join(reports_dir, f"{os.path.splitext(path)[0]}_report.txt")
    # Explicit encoding: model output may contain non-ASCII characters.
    with open(report_path, 'w', encoding='utf-8') as fh:
        fh.write(index_response)


100%|██████████| 18/18 [09:09<00:00, 30.51s/it]
