In [141]:
import os
import openai

# Never paste the key into the notebook: reuse the value already exported in
# the environment and only prompt (without echoing) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

from llama_index import (
    load_index_from_storage, SimpleDirectoryReader, StorageContext, 
    ServiceContext, GPTVectorStoreIndex, LLMPredictor, PromptHelper
)
from langchain import OpenAI
import os

# The Idea

The whole idea behind Network Earth is to provide the instrumentation updates that catalyze new and tremendous science. However, a catalyst needs to bring together all the constituent parts in the right way. And that means it's not enough for us to just build the instrumentation. We also need to pull those who would use it into the equation. What we are looking for specifically are those who are at the edge of what science can do and are looking to do more but just don't quite have the tools they need to make it happen. We're looking for the trailblazers.

However, identifying just one or two trailblazers is not enough, because technology really brings benefits when it gives us generalized solutions to problems that can then form a foundation for many things to come. Therefore, we want to find trailblazers who are all more or less seeing the same things so that we can find generalized rather than bespoke solutions to problems.

Finding all of these trailblazers and organizing them into groups and categories would mean an absurd amount of reading on my part, so let's see if there's a way to use some of the latest and greatest tech to reduce the effort on my end, so I can focus on the more creative parts of this problem.

In [142]:
# Directory whose files are indexed by construct_index(); the same value is
# reused later in the notebook as the stem of the output file ('<REPORT>.txt').
REPORT = 'chatbot_data'

def construct_index(directory_path, persist_dir='index.json'):
    """Build a vector index over the files in ``directory_path`` and persist it.

    Parameters
    ----------
    directory_path : str
        Directory handed to ``SimpleDirectoryReader`` to load the documents.
    persist_dir : str, optional
        Location passed to ``index.storage_context.persist``. Defaults to
        ``'index.json'``, the value that used to be hard-coded here, so
        existing callers behave exactly as before.

    Returns
    -------
    The freshly built ``GPTVectorStoreIndex``.
    """
    # Sizing knobs forwarded to PromptHelper (and, for num_outputs, to the LLM).
    max_input_size = 4096       # maximum input size
    num_outputs = 256           # number of output tokens; also the LLM's max_tokens
    chunk_overlap_ratio = 0.1   # overlap expressed as a ratio, not a token count
    chunk_size_limit = 600      # chunk size limit

    # define LLM
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=num_outputs))
    prompt_helper = PromptHelper(max_input_size, num_outputs, chunk_overlap_ratio, chunk_size_limit=chunk_size_limit)

    documents = SimpleDirectoryReader(directory_path).load_data()

    index = GPTVectorStoreIndex(
        documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )

    # Save the index so later cells can reload it instead of rebuilding it.
    index.storage_context.persist(persist_dir)

    return index

# Build and persist the index for the REPORT directory; as the last expression
# of the cell, the returned index object is what gets displayed below.
construct_index(REPORT)

<llama_index.indices.vector_store.base.VectorStoreIndex at 0xffff5dbdf5b0>

In [143]:
import regex as re
import json
from tqdm import tqdm

# Point the openai module at the key held in the environment and create the
# client object used for the chat-completion calls.
openai.api_key = os.environ["OPENAI_API_KEY"]
client = openai.OpenAI()

# Reload the persisted index from disk and wrap it in a query engine.
# NOTE: despite its name, `index` is the query engine, not the index itself.
service_context = ServiceContext.from_defaults(chunk_size=512)
storage_context = StorageContext.from_defaults(persist_dir="index.json")
index = load_index_from_storage(
    storage_context,
    service_context=service_context,
).as_query_engine()

def expand_on_it(thought, client, index):
    """Expand on ``thought`` via the index, then return its key points as a list.

    Two steps: ``index.query`` produces free text about the thought, then a
    chat completion is asked to enumerate the key points of that text. The
    numbered lines of the reply ("1. ...", "2. ...") are parsed into a list.

    Parameters
    ----------
    thought : str
        Topic interpolated into the query ``"Expand on {thought}"``.
    client
        Object exposing ``chat.completions.create(...)``; the notebook passes
        an ``openai.OpenAI()`` instance.
    index
        Object exposing ``query(str)`` whose result has a ``.response``
        attribute; the notebook passes a query engine.

    Returns
    -------
    list of str
        One stripped entry per numbered line of the reply, in order. Empty
        when the reply has no numbered lines or no text at all.
    """
    expansion = index.query(f"Expand on {thought}").response
    messages = [{
        "role": "user",
        "content": f"""
        Enumerate the key points identified in the following. 

        {expansion}
        """
    }]
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0.6,
        max_tokens=1000,
    )
    # Guard against a reply without text so the parsing below cannot raise.
    agent_response = completion.choices[0].message.content or ""

    # An item is "<digits>." at the start of a line. Anchoring (MULTILINE) and
    # the negative lookahead keep decimals such as "2.0" in ordinary sentences
    # from being mistaken for list markers.
    item_pattern = r"^[ \t]*[0-9]+\.(?![0-9])[ \t]*([^\n]*)"
    candidates = (
        item.strip()
        for item in re.findall(item_pattern, agent_response, flags=re.MULTILINE)
    )
    # Items are reused as dict keys and inside follow-up prompts, so drop
    # blank entries and surrounding whitespace.
    return [item for item in candidates if item]

In [144]:
# Seed topics to explore, keyed by an arbitrary integer id.
useful_thoughts = {
    0: "uncertainties, risks, and unknowns in stock assessments and modeling",
    1: "instrumentation that would improve the quality of the stock assessment and modeling",
}

# First level: key points for the chosen seed topic.
the_thought = useful_thoughts[1]
results = expand_on_it(the_thought, client, index)

# Second level: expand every first-level point again (point -> list of sub-points).
second_level = {
    thought: expand_on_it(thought, client, index)
    for thought in tqdm(results)
}

# Flatten the two-level outline into the plain-text block handed to the
# summarizer: one "On <point>:" header followed by its " - <sub-point>" lines.
content = "".join(
    f"On {thought}: \n" + "".join(f" - {sub_thought}\n" for sub_thought in sub_thoughts)
    for thought, sub_thoughts in second_level.items()
)

messages = [{
    "role": "user",
    "content": f"""
    In 500 words or less summarize the following. Try to be as consise as possible without losing any of the specifics.

    {content}
    """
}]
response = client.chat.completions.create(
    model="gpt-4",
    messages=messages,
    temperature=0.6,
    max_tokens=1000,
)
# Fall back to an empty string so the write below cannot fail on a reply
# that carries no text.
report = response.choices[0].message.content or ""
# Write as UTF-8 explicitly: the summary may contain non-ASCII characters and
# the platform's default encoding is not guaranteed to handle them.
with open(f'{REPORT}.txt', 'w', encoding='utf-8') as fh:
    fh.write(report)

100%|██████████| 7/7 [01:07<00:00,  9.58s/it]
