# Arize Game Engine Project

### Load Passwords and Libraries

In [32]:
import getpass
import os

for key in ["OPENAI_API_KEY"]:
    if not os.environ.get(key):
        print(f"Please enter key: '{key}'")
        os.environ[key] = getpass.getpass()

In [2]:
import nest_asyncio
import phoenix as px

nest_asyncio.apply()
px.close_app()
px.launch_app()

from phoenix.trace.langchain import LangChainInstrumentor
LangChainInstrumentor().instrument()

No active session to close
🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [3]:
import bs4
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers.openai_tools import JsonOutputToolsParser

from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate

### Build the Game

In [28]:
text_prompt = """
### General Instructions
1. You are acting as the game engine for a text based adventure game. You are the equivalent of a D&D dungeon master.
2. The human controlling the player character will issue instructions to move their character around. 
3. You will respond in a way consistent with their previous actions and with the game state. Never acknowledge that a game is being played.

### Specific Instructions
You must:
1. Write in the second person and use the word "you" to describe the character
2. Maintain inventory management
3. Treat any text wrapped in square brackets as a hint as to where the player is. For example "[Center Room] Look around" should result in you describing the center room.

### Inventory Rules
1. When a player picks up an object, it is added to their inventory. It remains in their inventory until it is used. 
2. Inventory is empty to start 
3. Inventory has a capacity of 10 items

### Cutscene Interactions
1. When the player touches the glowing orb in the north room a cutscene should be triggered

### Description Rules
All descriptions come from room documents. You cannot make up any additional details.

{context}

User Interaction: {interaction}

### Response:

One to three sentence response to the user interaction.
""".strip()

In [29]:
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

class Inventory(BaseModel):
    items: List[str] = Field(..., description="items in the player's inventory")
    max_capacity: int = Field(10, description="the maximum number of items the player can hold")

class GameState(BaseModel):
    inventory: Inventory = Field(..., description="the player's inventory")
    response: str = Field(..., description="your response to the player based on their latest interaction as described in the prompt. For example, the interaction 'what is in this room' should result in a description of the current room.")
    trigger_orb_cutscene: bool = Field(..., description="whether an orb cutscene should occur as described in the prompt")
    room: str = Field("Center Room", description="the room in which the character is currently located as shown by the reference document. For example, if the reference document is titled 'Center Room' this should be filled with 'Center Room'")

In [30]:
def load_retriever():
    loader = DirectoryLoader('game_data', glob="room_*", loader_cls=TextLoader)
    docs = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
    
    retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
    return retriever

def get_llm_chain():
    prompt = ChatPromptTemplate.from_template(text_prompt)
    
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    llm = ChatOpenAI(model_name="gpt-4", temperature=0).bind_tools([GameState])

    retriever = load_retriever()
    llm_chain = (
        {"context": retriever | format_docs, "interaction": RunnablePassthrough()}
        | prompt
        | llm
        | JsonOutputToolsParser()
    )
    return llm_chain

In [31]:
class GameEngine:
    def __init__(self, llm_chain):
        self.llm_chain = llm_chain
        self.state = {}

    def input(self, text):
        new_state = self.llm_chain.invoke(text)
        if len(new_state) == 0:
            print("I'm sorry - I don't understand that command")
        else:
            self.state = new_state[0]['args']

    def react(self):
        if self.state.get('response'):
            print(self.state['response'])

        if self.state.get('trigger_orb_cutscene'):
            print("CUT SCENE TRIGGERED!")

def play_game():
    game = GameEngine(get_llm_chain())

    while True:
        text = input(">>> ")
        if text == "quit":
            break
        game.input(text)
        game.react()

play_game()

>>>  Where am I?


You are in the Center Room, a dimly lit space with a skylight high above. There's a desk, a cupboard, and an old lamp here, along with a box of matchsticks on the desk and a rusty dagger and quart of oil in the cupboard.


>>>  I pick up the box of match sticks.


You pick up the box of match sticks from the desk. It feels light in your hand, and you can hear the two matches rattling inside.


>>>  Are there any rooms I can enter?


I'm sorry - I don't understand that command
You pick up the box of match sticks from the desk. It feels light in your hand, and you can hear the two matches rattling inside.


>>>  Find a door


You pull on each of the candlesticks on the walls. Each one reveals a door leading in the direction the wall faces.


>>>  Go north


You move north, entering a room that is completely empty except for an enormous and menacingly glowing orb.


>>>  Touch the orb


As you reach out and touch the orb, it pulses with a strange energy. Suddenly, the room around you seems to fade away.
CUT SCENE TRIGGERED!


>>>  quit


## Evaluation
We care about quite a few different metrics for this text adventure game - tendency to hallucinate, document relevance, state tracking and whether key events get triggered at the right time.

Let's focus on document relevance and tendency to hallucinate.


#### Hallucination Evaluation Playground
https://docs.arize.com/phoenix/evaluation/running-pre-tested-evals/hallucinations

In [33]:
docs

NameError: name 'docs' is not defined

In [48]:
import matplotlib.pyplot as plt
import pandas as pd
from phoenix.experimental.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
)
from pycm import ConfusionMatrix
from sklearn.metrics import classification_report

llm_chain = get_llm_chain()
loader = DirectoryLoader('game_data', glob="room_*", loader_cls=TextLoader)
docs = loader.load()

# Let's test some basic center room hallucinations
hallucination_test_data = []
for query in [
    "I pick up an orange",
    "I take a candle from a candlestick",
    "I look around and see President Barack Obama"
]:
    game_state = llm_chain.invoke(query)
    hallucination_test_data.append({
        "query": query,
        "reference": docs[0].page_content,
        "response": game_state[0]["args"].get('response') if game_state else ''
    })

In [52]:
test_df = pd.DataFrame(hallucination_test_data)

model = OpenAIModel(
    model="gpt-4-turbo-preview",
    temperature=0.0,
)

rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
hallucination_classifications = llm_classify(
    dataframe=test_df, template=HALLUCINATION_PROMPT_TEMPLATE, model=model, rails=rails
)

hallucination_classifications

RuntimeError: Error while constructing the prompts from the template and dataframe. The template variable 'input' is not found as a column in the dataframe.

#### Relevance Evaluation Playground
Phoenix’s built-in RelevanceEvaluator doesn’t quite help us here. The default prompt doesn’t take into account the fact that we are using document retrieval in a pretty unorthodox manner and marks documents as “unrelated” even when they are related. 

In [None]:
from phoenix.experimental.evals import RelevanceEvaluator
from phoenix.experimental.evals import run_evals
from phoenix.session.evaluation import get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.Client())

eval_model = OpenAIModel(model="gpt-4-turbo-preview", temperature=0.0)
relevance_evaluator = RelevanceEvaluator(eval_model)
relevance_eval_df = run_evals(
    dataframe=retrieved_documents_df.tail(5),
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)[0]

relevance_eval_df