In [None]:
import hashlib
import json
import logging
import os
import pandas as pd
import sys
import tempfile
import textwrap
from datetime import timedelta
from functools import reduce
from pathlib import Path
from typing import Any, Dict, List, Tuple
from tqdm import tqdm
import urllib
import zipfile

from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_random_exponential

from llama_index import (
    VectorStoreIndex, 
    StorageContext, 
    load_index_from_storage, 
    ServiceContext, 
    LLMPredictor,
    SimpleWebPageReader,
)
from llama_index.callbacks import CallbackManager, OpenInferenceCallbackHandler
from llama_index.callbacks.open_inference_callback import as_dataframe, QueryData
from llama_index.callbacks.schema import CBEventType, EventPayload
from llama_index.embeddings.base import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.simple import SimpleGraphStore
from llama_index.indices.query.schema import QueryBundle
from llama_index.node_parser import SimpleNodeParser
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response.schema import Response

import openai

import phoenix as px


load_dotenv()
pd.set_option("display.max_colwidth", 1000)


# Now you can access your API key through an environment variable.
openai_api_key = os.getenv('OPENAI_API_KEY')

# You can continue with the rest of your code here
assert openai_api_key != "copy paste your api key here", "❌ Please set your OpenAI API key"
openai.api_key = openai_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key
print('working dir:', os.getcwd()) 


from dataclasses import asdict

from arize.api import Client



# ## Create a storage context from our tweets documents

callback_handler = OpenInferenceCallbackHandler()
callback_manager = CallbackManager([callback_handler])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)
storage_context = StorageContext.from_defaults(persist_dir='tweets_by_authenticexit/')
index = load_index_from_storage(storage_context=storage_context, service_context=service_context)

query_engine = index.as_query_engine(similarity_top_k=3)

# r = index.as_retriever(query='what did he say about hotub? ', num_results=4) 
# retrieved = r.retrieve('hottub')

def create_openinference_record(query_data, node_data_list):
    record = {
        ":feature.text:prompt": query_data.query_text,
        ":feature.[float].embedding:prompt": query_data.query_embedding,
        ":feature.[str].retrieved_document_ids:prompt": query_data.node_ids,
        ":prediction.text:response": query_data.response_text,
        ":timestamp.iso_8601:": query_data.timestamp,
        ":feature.[float].retrieved_document_scores:prompt": query_data.scores,
    }
    for i, node_id in enumerate(query_data.node_ids):
        node_data = next((nd for nd in node_data_list if nd.id == node_id), None)
        #we rename node to document for openinference
        if node_data:
            # record[f":feature.float:node_{i}_id"] = node_data.id
            record[f":feature.float:document_{i}_score"] = query_data.scores[i] 
            record[f":feature.text:document_{i}_text"] = node_data.node_text

    return record



def generate_dataframe(callback_handler):
    query_data_list = callback_handler.flush_query_data_buffer()
    node_data_list = callback_handler.flush_node_data_buffer()
    records = [create_openinference_record(query_data, node_data_list) for query_data in query_data_list]
    df = pd.DataFrame(records)
    return df

# r_nodex = query_engine.retrieve(QueryBundle(query_str="What did he say about hotub?"))

def execute_query(query, query_engine, callback_handler, callback_manager):
    query_bundle = QueryBundle(query_str=query)
    callback_handler.start_trace(trace_id="query")
    with callback_manager.event(CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}) as retrieve_event:
        nodes = query_engine.retrieve(query_bundle)
        callback_handler.on_event_end(event_type=CBEventType.RETRIEVE, payload={EventPayload.NODES: nodes})
        callback_handler.on_event_end(event_type=CBEventType.LLM, payload={EventPayload.RESPONSE: 'test response from llm yhall'})
        callback_handler.end_trace(trace_id="query")

#sample test queries
queries = [
    "What did he say about hotub?",
    "What does he think about primary residences?",
    "what does he say about interest rates?",
    "Can he throw down and show down?"
]
for query in tqdm(queries):
    execute_query(query, query_engine, callback_handler, callback_manager)
df = generate_dataframe(callback_handler)
df.head()

#Easiest way to update the dataframe after each query?
#e.g.
# df = df.append(generate_dataframe(callback_handler))
# to csv
# df.to_csv('openinference.csv')

#OR:
# do we log our dataframes to arize? then pull from arize when we want to update them with evals?

#OR:
# do we log our dataframes to arize? then pull from arize info a df and turn that into retrerivals_data to perform eval & store/log the eval df 






# =========== non manual - calls the callback_manager and callback_handler lifecycle methods
# nodes = query_engine.query("What did he say about hotub?")
# nodes2 = query_engine.query("What does he think about primary residences?")






# CREATE DF FROM DATABASE/STORAGE ============ STORAGE SECTION
storage_context = StorageContext.from_defaults(
    persist_dir='tweets_by_authenticexit'
 # pass default graph store to prevent unauthorized request to GCS
)
def storage_context_to_dataframe(storage_context: StorageContext) -> pd.DataFrame:
    """Converts the storage context to a pandas dataframe. """
    document_ids = []
    document_texts = []
    document_embeddings = []
    docstore = storage_context.docstore
    vector_store = storage_context.vector_store
    for node_id, node in docstore.docs.items():
        document_ids.append(node.hash)  # use node hash as the document ID
        document_texts.append(node.text)
        document_embeddings.append(np.array(vector_store.get(node_id)))
    return pd.DataFrame(
        {
            "document_id": document_ids,
            "text": document_texts,
            "text_vector": document_embeddings,
        }
    )

database_df = storage_context_to_dataframe(storage_context)
database_df = database_df.drop_duplicates(subset=["text"])


NUM_RETRIEVED_DOCUMENTS = 3 #TODO ENSURE CORRECT / PARAMETERIZE
EVALUATION_MODEL_NAME = "gpt-4"
BINARY_TO_STRING_MAP = {0: "irrelevant", 1: "relevant"}

#NORMALIZATION OF DF (new) AND DATABASE_DF (old)
def get_centroid_and_normalize(df, column_name):
    centroid = df[column_name].apply(np.mean).mean()
    df[column_name] = df[column_name].apply(lambda x: np.array(x) - centroid)
    return df, centroid

database_df, _ = get_centroid_and_normalize(database_df, "text_vector")
df, _ = get_centroid_and_normalize(df, ":feature.[float].embedding:prompt")



def calculate_precision(df: pd.DataFrame) -> pd.DataFrame:
    num_relevant_docs_array = np.zeros(len(df))

    # Calculating number of relevant documents for each retrieved document and storing it in precision column.
    for retrieved_context_index in range(NUM_RETRIEVED_DOCUMENTS):
        num_relevant_docs_array += df[f":tag.str:openai_relevance_{retrieved_context_index}"].apply(lambda x: 1 if x == "relevant" else 0).values
        df[f":tag.float:openai_precision_at_{retrieved_context_index + 1}"] = num_relevant_docs_array.tolist() / (retrieved_context_index + 1)

    return df


def drop_unnecessary_columns(df: pd.DataFrame, columns_to_drop: List[str]) -> pd.DataFrame:
    df = df.drop(columns=columns_to_drop)
    df = df.loc[:, ~df.columns.duplicated()]
    return df

def generate_query_context_prompts(query: str, context: str) -> str:
    query_context_prompt_template = """# Query: {query}

    # Reference: {reference}

    # Binary: """
    return query_context_prompt_template.format(query=query, reference=context)








# LLM EVAL THE RETRIVALS ====================

# -- set up our retrerivals data
num_retrieved_documents = 3  # or any other number you want
document_id_to_text = dict(zip(database_df["document_id"], database_df["text"]))
query_texts = df[":feature.text:prompt"].to_list()

retrievals_data = {}
for i, query in enumerate(query_texts):
    doc_ids = df[":feature.[str].retrieved_document_ids:prompt"].tolist()[i]
    retrievals_data[query] = {index: document_id_to_text[doc_id] for index, doc_id in enumerate(doc_ids[:num_retrieved_documents])}


# -- EVAL PROMPT 
EVALUATION_SYSTEM_MESSAGE = "You will be given a query and a reference text. You must determine whether the reference text contains an answer to the input query. Your response must be binary (0 or 1) and should not contain any text or characters aside from 0 or 1. 0 means that the reference text does not contain an answer to the query. 1 means the reference text contains an answer to the query."
QUERY_CONTEXT_PROMPT_TEMPLATE = """# Query: {query}

# Reference: {reference}

# Binary: """
num_retrieved_documents = 3

# -- EVALUATION FUNCTIONS
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def evaluate_query_and_retrieved_context(query: str, context: str, model_name: str) -> str:
    prompt = QUERY_CONTEXT_PROMPT_TEMPLATE.format(
        query=query,
        reference=context,
    )
    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": EVALUATION_SYSTEM_MESSAGE},
            {"role": "user", "content": prompt},
        ],
        model=model_name,
    )
    return response["choices"][0]["message"]["content"]


def evaluate_retrievals(
    retrievals_data: Dict[str, str],
    model_name: str,
) -> List[str]:
    responses = []
    for query, retrieved_context in tqdm(retrievals_data.items()):
        response = evaluate_query_and_retrieved_context(query, retrieved_context, model_name)
        responses.append(response)
    return responses


def process_binary_responses(
    binary_responses: List[str], binary_to_string_map: Dict[int, str]
) -> List[str]:
    """
    Parse binary responses and convert to the desired format
    converts them to the desired format. The binary_to_string_map parameter
    should be a dictionary mapping binary values (0 or 1) to the desired
    string values (e.g. "irrelevant" or "relevant").
    """
    processed_responses = []
    for binary_response in binary_responses:
        try:
            binary_value = int(binary_response.strip())
            processed_response = binary_to_string_map[binary_value]
        except (ValueError, KeyError):
            processed_response = None
        processed_responses.append(processed_response)
    return processed_responses




working dir: /Users/b/Code/LangDeployable


100%|██████████| 4/4 [00:00<00:00,  6.53it/s]


In [20]:
# RUN EVALUATION ON RETRIEVALS
for retrieved_document_index in range(NUM_RETRIEVED_DOCUMENTS):
    #ONLY EVAL THE NEW RETRIEVALS (DF)
    raw_responses = evaluate_retrievals(retrievals_data, EVALUATION_MODEL_NAME)
    processed_responses = process_binary_responses(raw_responses, {0: "irrelevant", 1: "relevant"})
    # df[f"retrieved_document_text_{retrieved_document_index}"] = list(retrievals_data.values())
    df[f":tag.str:openai_relevance_{retrieved_document_index}"] = processed_responses
    

100%|██████████| 4/4 [00:01<00:00,  2.34it/s]
100%|██████████| 4/4 [00:02<00:00,  1.90it/s]
100%|██████████| 4/4 [00:01<00:00,  2.56it/s]


In [21]:
df.head()

Unnamed: 0,:feature.text:prompt,:feature.[float].embedding:prompt,:feature.[str].retrieved_document_ids:prompt,:prediction.text:response,:timestamp.iso_8601:,:feature.[float].retrieved_document_scores:prompt,:feature.float:document_0_score,:feature.text:document_0_text,:feature.float:document_1_score,:feature.text:document_1_text,:feature.float:document_2_score,:feature.text:document_2_text,:tag.str:openai_relevance_0,:tag.str:openai_relevance_1,:tag.str:openai_relevance_2
0,What did he say about hotub?,"[0.0057588105078441405, 0.004654824453185251, 0.015580248272756983, -0.022466883809526512, -0.015425606445630144, 0.01166917495314305, -0.016462276504595825, -0.0042713073093789316, 0.0006243180282883657, -0.0009416782308715085, 0.02986475191895192, 0.007791760562698295, -0.0018108990520584087, -0.008478571475704263, 0.006216561141203096, 0.013567492550949503, 0.0443512026209039, 0.009797783753613879, 0.03988140560571377, -0.026236592606862136, -0.0114135591324764, 0.0026521667128083606, -0.0038606783386665087, 0.005529935191164663, -0.02837724685784156, 0.026714352144638947, 0.016414969413320473, -0.0055031942215162015, -0.011400095002015184, 0.009037111139635493, 0.04004296399537747, -0.012160768547316144, -0.01832020357366378, 0.004163416054810336, -0.026317371801693985, 0.0001496860163872441, -0.008909394779463361, 0.00685943076271837, 0.017761293052474907, 0.001123457313527291, 0.018515235929529122, 0.0065834342490536475, 0.01553985867534106, -0.011373168603737901, -0.02078397...","[a4ee888a293ed3e89cb6274ca00e8079fbc867bbcf77f8ae5b08f1fd136902cc, 97489a387f09f7101ecb15405b9c6e2462cee8422852a815c7dd3f54d8699c5b, a58cd14144fccf374d634f9c26ba898c2c07ef4444a0e26f16d53fde9b4c1477]",test response from llm yhall,2023-07-31T20:04:47.455661,"[0.7411799029638464, 0.7357889912332278, 0.7326789108192006]",0.74118,@TomDNaughton https://t.co/mWGhae0qTj\nLLMs get bored. 😆\nHow…human https://t.co/uLYo4RugEI\nRT @cory: Spot on \n\nFrom,0.735789,@stkirsch Yes. Environmental and/or vaxx related\n@davejavupride @HackForumsNet @stkirsch This right here is the Dunning Kruger effect👆😆\n@Spicytaco34 @ChrisBlec,0.732679,it\n@lylepratt AI sent it back from the future\nRT @LeadingReport: BREAKING: A new comprehensive study presented in front of the Pennsylvania Senate finds zero Amish children diagnosed wi…\n@ToroTheDog,irrelevant,irrelevant,irrelevant
1,What does he think about primary residences?,"[0.014673287234048297, 0.014970807543734957, 0.013050449265877654, -0.012644475140829633, -0.012292860822041582, 0.019054948537389687, -0.02085332833345706, 0.01296930778656428, -0.016593380646069595, -0.00801938846822555, 0.011130091919342925, 0.027439608423749855, -0.017228991539438316, 0.008763454108278204, 0.0005012469005425771, 0.01806772239391034, 0.035810743643681454, -0.02630335725959117, 0.02254404798034852, -0.022611400858719894, -0.02634392753358657, -0.003066354013268898, 0.0098723929363889, -0.007038924657960484, -0.0036749181349175672, 0.01781077235821908, 0.010893427951971938, -0.0007428228168162206, -0.011914199033041547, -0.0017832981434064608, 0.03827204666916554, -0.009053948150237153, -0.0027823574386882286, 0.005980964469502618, -0.01541682221110637, 0.005957298165897777, 0.01178598888103192, 0.008878405391017844, 0.013733393697778632, -0.0014029457711028557, 0.02111054137233918, 0.02396403178457444, 0.011441135964135577, -0.003008878604729722, 0.02028560005013...","[372e3f847f40afcb4443ed4c3f220aac9a36dd2660b5d5a34a0e6f6784d45828, 4a0a7b3f3349e90326686ca2891652ffad6c437e266dd0d182671ea266ab3a75, 0a3ae6926b16a9aa5d7f0bf34e01947cd4ad108ff059dafda64ef478a3ae9908]",test response from llm yhall,2023-07-31T20:04:47.626935,"[0.7941285882224618, 0.7555219766113798, 0.736823447252139]",0.794129,than you can flip your primary residence 😵\n@Route2FI @AndrewYoung_SX 🔥🔥\n@Eugene482 @cburniske Working on it…@langwallet\n@CryptoEcon_Li,0.755522,Yes. It’s coming. @langwallet https://t.co/CzepzCO6Ws\n@CryptoEcon_Li @Tablesalt13 They can change the interest rate faster than you can flip your primary residence,0.736823,end game\n@ABwoodelf @REWoman Name me a single person that pays $500 out of pocket to have a mortgage reviewed from a big 4 bank. You’re better off asking ChatGPT for $20 per month.\n@SCBuergel,irrelevant,irrelevant,irrelevant
2,what does he say about interest rates?,"[-0.026893225313385078, -0.009957734310288976, 0.03359355144087498, -0.02322532497461612, 0.002601131283457329, 0.011720500140230109, -0.017058414221962043, -0.0124903328728276, -0.04083930961902912, -0.030171495677192756, 0.03982763849083607, 0.03413097299520199, -0.0004982785798954467, -0.00274621141258354, 0.011572709492186953, 0.007343874963710954, 0.039800767971913266, -0.005000022520531962, 0.026298054679076126, -0.017837676034529754, -0.01287324521805818, 0.007400975747744491, -0.012315670774539064, -0.01500277854617412, -0.02162649557109649, 0.007061728461424758, 0.015469013898412634, -0.005114224554260323, 0.013285739532451083, -0.0037437996838587027, 0.023691562189499787, -0.009635281563957284, -0.02139809243496234, 0.00658812625218456, -0.016776268185337135, -0.012483615243096899, -0.014505662770231317, 0.009328975527326514, 0.0015443425031964977, -0.007700565085967133, 0.02089697047952836, 0.00916774868849938, 0.011035287937859942, 0.00011429779581998789, -0.01680313870...","[4a0a7b3f3349e90326686ca2891652ffad6c437e266dd0d182671ea266ab3a75, 235c80f8274fcd20d49d750f920c57dcf5746819b944fcda735eb6f2a1425496, 75153ce138e36e43919eaa7577b5db3eada47055d47419de26ebb7d5e7548fce]",test response from llm yhall,2023-07-31T20:04:47.791630,"[0.8106718512771199, 0.8069574715665491, 0.7855045832756103]",0.810672,Yes. It’s coming. @langwallet https://t.co/CzepzCO6Ws\n@CryptoEcon_Li @Tablesalt13 They can change the interest rate faster than you can flip your primary residence,0.806957,just said rates would remain low for a long time just before this soo maybe it’s not worth listening to them. \n\nThey ultimately are reactionary to the market.\n@JohnDon57763151 @dobettabbetta,0.785505,@bankofcanada You think rates remain elevated for longer?\n@zeroxLucky Paid ad cause it’s so sh*t\nWatch these guys closely next few days.,relevant,relevant,relevant
3,Can he throw down and show down?,"[-0.024314691127498695, -0.024151353911717483, 0.03767236806337063, -0.008164571413059781, -0.016501654685695717, 0.010088535210828234, -0.03229106851037319, -0.00702120019238169, -0.028234821559150764, -0.0069599485036330915, 0.013409756309132029, 0.015090785174409796, 0.008257779943535973, 0.006178613471577814, -0.002563413280283163, 0.00953046064708655, 0.02491833381239598, -0.00232180794816191, 0.02186934389058774, -0.0398046512168246, -0.00017373315173859394, -0.024001625181396553, 0.01078953053687041, -0.028779283912141868, -0.022708526627381393, 0.028334836660663536, 0.012028183041850974, -0.004200203624774764, 0.001278025225049754, -0.006047973689843962, 0.04178306199495022, 0.013716017081181456, 0.005538869834284236, -0.026669491828640052, -0.002733557765592883, -0.02637003809328849, 0.009666576235334326, 0.009789080544154097, -0.010464926065166543, -0.008879178949852537, 0.022794929518143586, -0.013132792712410043, -0.008287075721879552, -0.00926030222441728, 0.0087477957...","[18dd82eef1ac43cb388684bccfbc954739366bb211d225c2844278d3827552fc, 5fada1eef7a76134d8f382faa12973a2a0073645e04af3d0ae09b3eb37d2f868, 8b8786ba1d4643f8ae3db11c6be1824223e672ccdfd422960d2e2d8ab3a4a4ae]",test response from llm yhall,2023-07-31T20:04:47.928430,"[0.7324493584251837, 0.7289638041931592, 0.7287599313239213]",0.732449,layer and then move people over to use it https://t.co/nOy27rjvvd\n@rationalaussie Bingo!\nThis is pretty insane https://t.co/OdPppw5fe3\n@androolloyd Both,0.728964,@World_At_War_6 @ianellisjones @Jkylebass @HudsonInstitute Global\n@ApeFramework any docs for silverback?\n@CalebMaddix Beta\n@billions89 @geoeconomic10,0.72876,@dobettabbetta @ManyBeenRinsed @VinceGaetano @ronmortgageguy They are usually wrong\n@KONG99946700 There are levels to madness...I'm here for the end game\n@ABwoodelf @REWoman,irrelevant,irrelevant,irrelevant


In [22]:
def prepare_for_phoenix(database_df: pd.DataFrame):
    database_schema = px.Schema(
        prediction_id_column_name="document_id",
        prompt_column_names=px.EmbeddingColumnNames(
            vector_column_name="text_vector",
            raw_data_column_name="text",
        )
    )
    database_ds = px.Dataset(
        dataframe=database_df,
        schema=database_schema,
        name="database",
    )
    return database_ds

# CREATE DATASETS FROM DATAFRAMES - PRIMARY (queries) AND CORPUS (database)
database_ds = prepare_for_phoenix(database_df)
query_ds = px.Dataset.from_open_inference(df)

session = px.launch_app(primary=query_ds, corpus=database_ds)

🌍 To view the Phoenix app in your browser, visit http://localhost:60894/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix
