In [7]:
import os
import openai
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

# Find the nearest .env file and load its variables into the process environment.
load_dotenv(find_dotenv())

# Hand the key to the openai module; this is None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain

In [20]:
import pyperclip

In [24]:
# Multi-turn transcript (turns labelled "Mitchell:" and "GPT4:") kept as one
# plain string. The request below sends it verbatim as the content of a single
# "user" message, so the speaker labels are ordinary text, not separate chat roles.
input_content = """
Mitchell: 
Conditional on AGI being developed by 2070, what is the probability that humanity will suffer an existential catastrophe due to loss of control over an AGI system?

GPT4: 
The risks associated with AGI development will depend on factors such as safety precautions, governance, international cooperation, and broader awareness of possible risks. The best approach is to work on developing AI safety research, responsible AI policies and global cooperation to ensure that AGI is built in a manner that is beneficial to humanity and mitigates the risks of existential catastrophe.

Mitchell: 
What are the inputs into technical AGI safety alignment? And what are the most plausible approaches to solving AGI alignment?

GPT4: 
Inputs into technical AGI safety alignment generally involve understanding and defining what kind of behavior we want the AGI to exhibit, ensuring that its actions are aligned with human values, and developing methods to prevent misaligned behavior. Some inputs into AGI alignment include:

1. Value specification: Defining appropriate objective functions, goals, or reward signals for AGI that encompass human values and alignment with long-term human interests.
2. Robustness: Ensuring the AGI system remains stable, correct, and aligned under various conditions, including uncertainty, limited information, or adversarial attacks.
3. Interpretability: Building AGI systems that are transparent, explainable, and understandable by humans, helping us discern if and when the system is misaligned.
4. Scalability: Developing AGI safety solutions that can scale with the increasing complexity and capabilities of AI systems.
5. Continuous improvement: Ensuring that AGI systems can learn and adapt over time while maintaining alignment, avoiding value drift or undesirable extrapolation of initial objectives.

Some plausible approaches to solving AGI alignment include:

1. Inverse reinforcement learning (IRL): Learning a reward function from observed human behavior and then training the AGI system to optimize for that reward signal. This approach seeks to infer human values through observation and may be combined with other techniques to refine the reward function further.
2. Cooperative inverse reinforcement learning (CIRL): Casting the alignment problem as a cooperative game between the AGI and human, wherein both are trying to maximize a shared reward function. This approach emphasizes the importance of the AGI system learning to cooperate with humans and other AGI systems.
3. Debate and amplification: Using AI systems to simulate debates, in which different agents argue for possible actions or policies, with human judges selecting the most convincing arguments. This could help surface potential risks and avoid deceptive behavior from AGI systems.
4. Iterated distillation and amplification (IDA): Training AGI systems to perform tasks by iteratively breaking down complex tasks into simpler sub-tasks, and then using a combination of AI assistance and human feedback to improve the AI system's understanding of human values and its responses to alignment questions.
5. Value learning: Developing AGI systems that can infer and learn human values from different sources, including human feedback, natural language text, and other cultural artifacts. This approach aims to implicitly embed human preferences into AGI systems.

Collaborative efforts among AI researchers, ethicists, policymakers, and other stakeholders will be necessary to develop and test these approaches' effectiveness in ensuring AGI aligns with human values and interests.

Mitchell: 
Of those 5 plausible approaches, please rank them from most promising to least promising where 1 is the most promising. Also, give your reasoning why that type of research shows promise and possibly where to go next.

GPT4:
It is difficult to definitively rank these approaches, as each has its strengths and limitations, and the most promising long-term AGI alignment strategy may involve a combination of these techniques. However, I can provide an analysis of their current promise, noting that the effectiveness of these approaches may evolve over time as further research is undertaken. Please keep in mind that this is a subjective assessment.

1. Iterated distillation and amplification (IDA): IDA shows promise as it aims to harness the combined strengths of AI assistance and human feedback. By breaking down complex tasks into simpler sub-tasks and refining the AI's understanding of human values, IDA may enable AGI systems to understand and learn from nuanced feedback. Continued research in this area might involve refining the methods used to break down tasks and improving methods of providing human feedback to AI models.

2. Inverse reinforcement learning (IRL) and Cooperative inverse reinforcement learning (CIRL): Both IRL and CIRL are promising because they focus on learning a reward function from observed human behavior or direct cooperation with humans. Both of these approaches emphasize inferring human values and adapting AGI systems to optimize for those values. Future research can investigate how to obtain more accurate reward functions or develop more efficient learning algorithms to ensure AGI alignment.

3. Value learning: This approach is promising because it aims to leverage various sources of human value information, including text, cultural artifacts, and human feedback. These diverse sources may help create a more robust understanding of human preferences and values. Future directions for value learning might involve improving the methods used to extract value information from diverse sources, and further investigating how to ensure AGI systems adequately model this information.

4. Debate and amplification: This approach can potentially help identify deceptive AGI behavior and uncover hidden risks through argumentation. However, there may be some concerns regarding the scalability and limitations of relying on human judges for debate outcomes. Nonetheless, investigating the methods used to simulate debates and enhance human judges' decision-making could be fruitful avenues for future research.

It's important to note that this subjective ranking should not discourage research in any of these areas. The field of AGI safety is complex and rapidly evolving. To ensure the development of safe and beneficial AGI, a combination of approaches may be necessary, and substantial progress in one area may change the perceived promise of these techniques over time. Further research, development, and collaboration among different stakeholders will be critical to determining the most effective AGI safety alignment methods.

Mitchell: 
Please give me all the title references for Iterated distillation and amplification (IDA) papers. Also give a list of all the researchers who've contributed to IDA.

GPT4:
Iterated distillation and amplification (IDA) is a relatively new concept, and the number of papers directly addressing IDA is limited. However, the primary source of the concept can be found in the following paper:

1. Christiano, P. (2018). "Supervising strong learners by amplifying weak experts." AI Alignment Blog. Retrieved from https://ai-alignment.com/research-ideas-964639a853ea

Although the author, Paul Christiano, is credited with the introduction of iterated distillation and amplification, many researchers have contributed to related topics such as AI alignment, amplification, and others that fall under the broader umbrella of AGI safety. Some of these researchers include:

- Paul Christiano: Research Scientist at OpenAI
- Stuart Russell: Professor of Computer Science at UC Berkeley
- Ilya Sutskever: Co-founder and Chief Scientist of OpenAI
- Dario Amodei: Director of Research at OpenAI
- Jan Leike: Senior Research Scientist at DeepMind
- Chris Olah: Research Scientist at OpenAI
- Victoria Krakovna: Research Scientist at DeepMind
- Owain Evans: Research Scientist at OpenAI
- Tom Dietterich: Professor of Computer Science at Oregon State University

Keep in mind that research in AI alignment, including IDA, is highly interdisciplinary, and the list of researchers above is by no means exhaustive. The field is continuously growing, and there are many researchers working on different aspects of AI safety and AGI alignment who may, directly or indirectly, contribute to the development of IDA and related approaches in the future.

Mitchell: What about Buck Shlegeris?

GPT4: Buck Shlegeris is a Research Scientist at OpenAI focused on long-term safety in AI systems. He has been involved in researching various aspects of AI safety and has made valuable contributions to the field. Although he may not have specifically worked on iterated distillation and amplification (IDA) papers, his research and insights are still relevant to AGI alignment and safety. His work indirectly plays a role in shaping the broader landscape of AI safety research, which includes IDA as one of the approaches.
"""

# Send the whole transcript to the "gpt-4" chat model as one user message.
completion = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": input_content}],
)

# Extract the first choice's reply text once, then reuse it:
# copy it to the system clipboard and echo it as the cell output.
string_to_copy = completion.choices[0].message.content
pyperclip.copy(string_to_copy)
print(string_to_copy)


Buck Shlegeris is a Research Scientist at OpenAI focused on long-term safety in AI systems. He has been involved in researching various aspects of AI safety and has made valuable contributions to the field. Although he may not have specifically worked on iterated distillation and amplification (IDA) papers, his research and insights are still relevant to AGI alignment and safety. His work indirectly plays a role in shaping the broader landscape of AI safety research, which includes IDA as one of the approaches.


## LangChain Example

In [14]:
# Prompt with a single slot, {math_func}, naming the S-curve to implement.
prompt = PromptTemplate(
    template="write a python function that outputs an S-curve {math_func} along with documentation on how to adjust the various parameters",
    input_variables=["math_func"],
)

# Completion-style LLM wrapper, configured with temperature 0.9.
llm = OpenAI(temperature=0.9)

# Chain that formats the prompt and passes it to the LLM.
chain = LLMChain(prompt=prompt, llm=llm)

In [15]:
# Fill the prompt's only variable by keyword; the result is shown as the cell output.
chain.run(math_func="sigmoid")

'\n\ndef sigmoid_scurve(slope, center_x, center_y):\n    """\n    Returns a sigmoid S-curve\n\n    Parameters\n    ----------\n    slope : float\n        The steepness of the curve\n    center_x : float\n        The x-coordinate of the center point of the curve\n    center_y : float\n        The y-coordinate of the center point of the curve\n\n    Returns\n    -------\n    sigmoid : function \n        A function of the form y = 1 / (1 + exp(-slope * (x - center_x))) + center_y\n    """\n    \n    def sigmoid(x):\n        return 1 / (1 + np.exp(-slope * (x - center_x))) + center_y\n    \n    return sigmoid'

In [8]:
# Chat-model wrapper targeting "gpt-4", configured with temperature 0.
chat = ChatOpenAI(
    model_name="gpt-4",
    temperature=0,
)

In [10]:
# System message template: its two slots select the language pair.
template = "You are a helpful assistant that translates {input_language} to {output_language}."
system_message_prompt = SystemMessagePromptTemplate.from_template(template)

# Human message template: a single {text} slot and nothing else.
human_template = "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Combine the two message templates, system first, into one chat prompt.
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

# Fill in every slot, convert the prompt to a message list, and call the chat
# model. As the cell's last expression, the returned message is displayed.
chat(
    chat_prompt.format_prompt(
        input_language="English",
        output_language="Spanish",
        text="Artificial Intelligence is the future.",
    ).to_messages()
)
     


AIMessage(content='La inteligencia artificial es el futuro.', additional_kwargs={})