In [2]:
import pandas as pd

In [11]:
# Read the character descriptions from the CSV file (path is relative to this
# notebook) and preview up to the first 20 rows.
df2 = pd.read_csv(filepath_or_buffer='../data/character_descriptions.csv')
df2.head(n=20)

Unnamed: 0,Name,Description,Medium,Setting
0,Emily,"A young woman in her early 20s, Emily is an as...",Play,England
1,Jack,"A middle-aged man in his 40s, Jack is a succes...",Play,England
2,Alice,"A woman in her late 30s, Alice is a warm and n...",Play,England
3,Tom,"A man in his 50s, Tom is a retired soldier and...",Play,England
4,Sarah,"A woman in her mid-20s, Sarah is a free-spirit...",Play,England
5,George,"A man in his early 30s, George is a charming a...",Play,England
6,Rachel,"A woman in her late 20s, Rachel is a shy and i...",Play,England
7,John,"A man in his 60s, John is a retired professor ...",Play,England
8,Maria,"A middle-aged Latina woman in her 40s, Maria i...",Movie,Texas
9,Caleb,"A young African American man in his early 20s,...",Movie,Texas


# STEP 1: Prepare dataset

In [4]:
# Build a one-column dataframe in which every character row of df2 is
# flattened into a single labelled text block (the unit that gets embedded
# and later pasted into prompts).
df_combined = pd.DataFrame({
    'text': df2.apply(
        lambda row: (
            f"NAME: {row['Name']}\n"
            f"DESCRIPTION: {row['Description']}\n"
            f"MEDIUM: {row['Medium']}\n"
            f"SETTING: {row['Setting']}"
        ),
        axis=1,
    )
})

# Show one complete text block so the exact format is visible.
print(df_combined.iloc[0]['text'])

# Display the first few rows. This has to be the last expression of the cell:
# a bare expression followed by another statement is evaluated and discarded.
df_combined.head()

NAME: Emily
DESCRIPTION: A young woman in her early 20s, Emily is an aspiring actress and Alice's daughter. She has a bubbly personality and a quick wit, but struggles with self-doubt and insecurity. She's also in a relationship with George.
MEDIUM: Play
SETTING: England


# STEP 2: Create embeddings and store them

In [5]:
import os

import numpy as np
import openai

openai.api_base = "https://openai.vocareum.com/v1"

# Never hardcode credentials in a notebook: they end up in version control and
# in every shared copy. Read the key from the environment instead.
# NOTE(review): any key that was previously committed in this cell should be
# treated as compromised and rotated.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
openai.api_key = _api_key

EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
# Number of texts sent to the embeddings endpoint per request.
BATCH_SIZE = 100

# Embed the texts batch by batch, keeping the embeddings in row order so they
# can be attached to df_combined as a column afterwards.
embeddings = []
for i in range(0, len(df_combined), BATCH_SIZE):
    response = openai.Embedding.create(
        engine=EMBEDDING_MODEL_NAME,
        input=df_combined.iloc[i : i + BATCH_SIZE]["text"].tolist(),
    )
    embeddings.extend([d["embedding"] for d in response["data"]])

# One embedding per row; later cells read this column for similarity search.
df_combined["embeddings"] = embeddings
df_combined.head()

Unnamed: 0,text,embeddings
0,NAME: Emily\nDESCRIPTION: A young woman in her...,"[-0.01711198128759861, -0.005657887551933527, ..."
1,NAME: Jack\nDESCRIPTION: A middle-aged man in ...,"[0.00529055530205369, -0.017990553751587868, -..."
2,NAME: Alice\nDESCRIPTION: A woman in her late ...,"[0.005038198549300432, -0.0011249859817326069,..."
3,"NAME: Tom\nDESCRIPTION: A man in his 50s, Tom ...","[0.013849714770913124, -0.013562570326030254, ..."
4,NAME: Sarah\nDESCRIPTION: A woman in her mid-2...,"[-0.020137982442975044, -0.020373666658997536,..."


# STEP 3: Similarity‑search helper

In [6]:
from openai.embeddings_utils import get_embedding, distances_from_embeddings

def get_rows_sorted_by_relevance(question: str, df) -> pd.DataFrame:
    """
    Rank the rows of df by how closely they match 'question'.

    The question is embedded with EMBEDDING_MODEL_NAME and compared against
    the vectors in df["embeddings"] using cosine distance.

    Returns a copy of df with an added "distance" column, sorted ascending
    (smallest distance, i.e. most relevant, first). df itself is not modified.
    """
    question_embedding = get_embedding(question, engine=EMBEDDING_MODEL_NAME)
    cosine_distances = distances_from_embeddings(
        question_embedding,
        df["embeddings"].values,
        distance_metric="cosine",
    )

    # Work on a copy so the caller's dataframe never gains a "distance" column.
    ranked = df.copy()
    ranked["distance"] = cosine_distances
    return ranked.sort_values("distance")

In [7]:
# Sanity check: show the three rows closest to a sample question.
get_rows_sorted_by_relevance("Who is Emily?", df_combined)[["text", "distance"]].head(3)

Unnamed: 0,text,distance
0,NAME: Emily\nDESCRIPTION: A young woman in her...,0.12204
2,NAME: Alice\nDESCRIPTION: A woman in her late ...,0.162558
5,NAME: George\nDESCRIPTION: A man in his early ...,0.181913


# STEP 4: Prompt builder

In [8]:

import tiktoken
# Tokenizer used to estimate how many tokens each piece of the prompt costs.
tokenizer = tiktoken.get_encoding("cl100k_base")

def create_prompt(question: str,
                  df,
                  max_token_count: int = 1800) -> str:
    """
    Build a prompt that contains as much relevant context as will fit
    inside 'max_token_count' tokens.

    Rows of df are taken in relevance order (see
    get_rows_sorted_by_relevance) and appended as context until the next row
    would push the estimated prompt size over the budget. The estimate
    covers the template, the question, every context row and the separators
    placed between rows.

    Parameters
    ----------
    question : the user's question; also used to rank the rows.
    df : dataframe with a "text" column and an "embeddings" column.
    max_token_count : token budget for the whole prompt.

    Returns
    -------
    The formatted prompt string. If not even the most relevant row fits, the
    context section is empty.
    """
    template = (
        "Answer the question based on the context below, and if the "
        "question cannot be answered from that context, say \"I don't know\".\n\n"
        "Context:\n\n{context}\n\n"
        "---\n\n"
        "Question: {question}\n"
        "Answer:"
    )
    separator = "\n\n###\n\n"
    separator_tokens = len(tokenizer.encode(separator))

    # Fixed cost: the template with the question filled in and no context yet.
    used_tokens = len(tokenizer.encode(template.format(context="", question=question)))
    context_blocks = []
    for row_text in get_rows_sorted_by_relevance(question, df)["text"]:
        tokens_needed = len(tokenizer.encode(row_text))
        # Every block after the first is preceded by a separator in the final
        # join, so its cost must count against the budget as well.
        if context_blocks:
            tokens_needed += separator_tokens
        if used_tokens + tokens_needed > max_token_count:
            # Rows are ordered by relevance; stop at the first one that does
            # not fit instead of skipping ahead to less relevant rows.
            break
        context_blocks.append(row_text)
        used_tokens += tokens_needed

    context_str = separator.join(context_blocks)
    return template.format(context=context_str, question=question)

# STEP 5: Ask questions 

In [9]:

COMPLETION_MODEL_NAME = "gpt-3.5-turbo-instruct"

def answer_question(question: str,
                    df,
                    max_prompt_tokens: int = 1800,
                    max_answer_tokens: int = 150) -> str:
    """
    Answer 'question' using context retrieved from df.

    A prompt is built with create_prompt (limited to 'max_prompt_tokens') and
    sent to COMPLETION_MODEL_NAME, which may generate up to
    'max_answer_tokens' tokens.

    Returns the stripped text of the first completion choice. Best-effort:
    if the completion request or the parsing of its response raises, the
    error is printed and an empty string is returned.
    """
    prompt = create_prompt(question, df, max_prompt_tokens)
    try:
        completion = openai.Completion.create(
            model=COMPLETION_MODEL_NAME,
            prompt=prompt,
            max_tokens=max_answer_tokens,
        )
        first_choice = completion["choices"][0]
        answer = first_choice["text"].strip()
    except Exception as err:
        print(f"OpenAI error: {err}")
        answer = ""
    return answer

In [17]:
# Sample questions to run through the retrieval-augmented pipeline.
questions = [
    "Who is Emily?",
    "What is Jack's occupation?",
    "What is Max medium and setting?",
]

# Print each question followed by its answer and a blank line.
for question in questions:
    print(f"Q: {question}")
    print(f"A: {answer_question(question, df_combined)}", end="\n\n")

Q: Who is Emily?
A: Name: Emily
Description: A young woman in her early 20s, Emily is an aspiring actress and Alice's daughter. She has a bubbly personality and a quick wit, but struggles with self-doubt and insecurity. She's also in a relationship with George.
Medium: Play
Setting: England

Q: What is Jack's occupation?
A: Successful businessman.

Q: What is Max medium and setting?
A: Limited Series in Australia



# Compare to an un‑contextualized answer


In [19]:
def compare_answers(question: str, df, max_tokens: int = 50):
    """
    Print the model's answer to 'question' without and with retrieved context.

    The baseline sends only the bare question to COMPLETION_MODEL_NAME; the
    contextual answer goes through answer_question, which adds context from
    df. Both completions are limited to 'max_tokens' generated tokens so the
    two answers are produced under the same budget.

    Returns None; the two answers are printed.
    """
    baseline = openai.Completion.create(
        model=COMPLETION_MODEL_NAME,
        prompt=f"Question: {question}\nAnswer:\n",
        max_tokens=max_tokens
    )["choices"][0]["text"].strip()

    # Forward the same completion limit; otherwise the contextual answer
    # would silently fall back to answer_question's own default.
    custom = answer_question(question, df, max_answer_tokens=max_tokens)

    print("Without context:", baseline)
    print("With context:   ", custom)

# Example usage
for question in questions:
    compare_answers(question, df_combined)

Without context: I'm sorry, I am an AI and I don't know who Emily is. Can you provide more context or information?
With context:    Emily is an aspiring actress and Alice's daughter in the play set in England.
Without context: Jack's occupation is not specified.
With context:    Jack is a successful businessman.
Without context: Max medium and setting is a term that refers to the maximum or highest level or intensity of a specific feature or setting in a particular application or device. It is commonly used in video games or computer graphics, where players or users can adjust the graphics quality
With context:    Max's medium is Limited Series and the setting is Australia.
