In [None]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/1

In [None]:
import math
import os
import re
from datetime import datetime
from getpass import getpass

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
from scipy.spatial.distance import cosine
from tqdm import tqdm

In [None]:
# Never store the key in the notebook: take it from the OPENAI_API_KEY
# environment variable, or prompt for it (input is hidden) when it is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Memory (base object)

In [None]:
class Memory:
    """A single entry of the memory stream.

    Constructing a Memory performs two OpenAI API calls: one chat completion to
    rate the description's importance and one embedding request.

    Attributes:
        description: Natural-language text of the memory.
        creation_timestamp: Timestamp supplied by the caller at creation.
        most_recent_access_timestamp: Timestamp of the last access; rewritten by
            MemoryStream.retrieve_memories for the memories it returns.
        importance: Model rating divided by 10, or None when no usable rating
            could be obtained.
        embedding_vector: 1-D numpy array holding the description's embedding.
        memory_type: Caller-supplied label; this notebook passes 'observation'
            and 'reflection'.
    """

    def __init__(self, description, creation_timestamp, most_recent_access_timestamp, memory_type):
        self.description = description
        self.creation_timestamp = creation_timestamp
        self.most_recent_access_timestamp = most_recent_access_timestamp
        self.importance = self.generate_importance_score(description)
        self.embedding_vector = self.generate_embedding_vector(description)
        self.memory_type = memory_type

    def generate_embedding_vector(self, input_text):
        """Embed `input_text` with text-embedding-ada-002 and return a 1-D numpy array."""
        model_engine = "text-embedding-ada-002"
        embeddings = get_embedding(
          input_text,
          engine=model_engine
        )
        return np.array(embeddings).reshape(1, -1)[0]

    @staticmethod
    def _parse_importance_score(reply):
        """Turn the model's textual rating into a score in [0, 1].

        Accepts a reply containing exactly one number ("7", "Rating: 7", "7.5")
        or an "N/10" / "N out of 10" form. Anything else is rejected so the
        caller can retry instead of storing a garbage score.

        Raises:
            ValueError: if no unambiguous rating between 0 and 10 is present.
        """
        fraction = re.search(r'(\d+(?:\.\d+)?)\s*(?:/|out of)\s*10\b', reply)
        if fraction is not None:
            score = float(fraction.group(1))
        else:
            numbers = re.findall(r'\d+(?:\.\d+)?', reply)
            if len(numbers) != 1:
                raise ValueError(f"Expected exactly one rating, got: {reply!r}")
            score = float(numbers[0])
        # The prompt asks for 1-10; 0 is tolerated, anything above 10 is not a rating.
        if not 0 <= score <= 10:
            raise ValueError(f"Rating out of range: {score}")
        return score / 10.0

    def generate_importance_score(self, input_text, max_attempts=10):
        """Ask gpt-3.5-turbo how poignant `input_text` is.

        Args:
            input_text: The memory description to rate.
            max_attempts: How many completions to request before giving up.

        Returns:
            The rating divided by 10 (so 0.0-1.0), or None if every attempt
            produced an unparseable reply.
        """
        system_intel = '''
        You are an AI who rates events in terms of their perceived importance on a scale of 1 to 10.
        '''
        context = f"""
        On the scale of 1 to 10, where 1 is purely mundane
        (e.g., brushing teeth, making bed) and 10 is
        extremely poignant (e.g., a break up, college
        acceptance), rate the likely poignancy of the
        following piece of memory.
        Memory: {input_text}
        Rating: <fill in with a number between 1 and 10>
        Please return only the number rating as an integer (e.g. 5)
        Do not explain why you gave this rating.
        """
        last_reply = None  # kept for the failure message, even if no attempt was made
        for _ in range(max_attempts):
            responses = openai.ChatCompletion.create(
              model="gpt-3.5-turbo",
              messages=[
                  {"role": "system", "content": system_intel},
                  {"role": "user", "content": context}
              ],
              temperature=0.0
            )
            last_reply = responses.choices[0]['message']["content"]
            try:
                return self._parse_importance_score(last_reply)
            except ValueError:
                continue
        print('failed to generate importance score:', last_reply)
        return None

class MemoryStream:
    """Ordered collection of memories with weighted retrieval.

    Retrieval ranks memories by
    ``alpha_recency * recency + alpha_importance * importance + alpha_relevance * relevance``
    where recency and relevance are min-max normalised across the stream.

    Args:
        memories: List that holds the memory objects (stored by reference).
        alpha_recency: Weight of the recency component.
        alpha_importance: Weight of the importance component.
        alpha_relevance: Weight of the relevance component.
        decay_factor: Base of the recency decay; raised to the number of seconds
            since a memory was last accessed.
        context_window_size: Maximum number of memories returned per retrieval.
    """

    def __init__(self, memories, alpha_recency, alpha_importance, alpha_relevance, decay_factor, context_window_size):
        self.memories = memories
        self.alpha_recency = alpha_recency #these are weightings, not init values
        self.alpha_importance = alpha_importance
        self.alpha_relevance = alpha_relevance
        self.decay_factor = decay_factor
        self.context_window_size = context_window_size

    def add_memory(self, memory):
        """Append `memory` to the stream; its timestamps are left as the caller set them."""
        self.memories.append(memory)

    @staticmethod
    def _min_max_scale(values):
        """Scale a Series to [0, 1]; a constant or empty Series maps to all zeros.

        A column with no spread carries no ranking information, and dividing by
        its zero range would otherwise turn every score into NaN.
        """
        lowest = values.min()
        span = values.max() - lowest
        if not span > 0:  # also true when span is NaN (empty column)
            return pd.Series(0.0, index=values.index)
        return (values - lowest) / span

    def retrieve_memories(self, query_memory, filter_statement=None):
        """Return the top-scoring memories for `query_memory` as a DataFrame.

        Args:
            query_memory: Object exposing `embedding_vector`; relevance is the
                cosine similarity between it and each stored memory.
            filter_statement: Optional `DataFrame.query` expression applied after
                scoring (columns: description, importance, recency, relevance,
                pointer, score).

        Returns:
            DataFrame of at most `context_window_size` rows sorted by descending
            score. The `pointer` column holds the Memory objects themselves, and
            the index is each memory's position in the stream.

        Side effects:
            `most_recent_access_timestamp` is refreshed on every returned memory.
        """
        now = datetime.now()
        recency_scores = [
            np.power(self.decay_factor, (now - memory.most_recent_access_timestamp).total_seconds())
            for memory in self.memories
        ]
        relevance_scores = [
            1 - cosine(query_memory.embedding_vector, memory.embedding_vector)
            for memory in self.memories
        ]

        memory_df = pd.DataFrame({
            'description': [memory.description for memory in self.memories],
            'importance': [memory.importance for memory in self.memories],
            'recency': recency_scores,
            'relevance': relevance_scores,
            'pointer': [memory for memory in self.memories]
        })

        memory_df['recency'] = self._min_max_scale(memory_df['recency'])
        memory_df['relevance'] = self._min_max_scale(memory_df['relevance'])
        # Importance can be None when scoring failed; count it as 0 instead of
        # letting it poison the total score. The displayed column is left as-is.
        importance = pd.to_numeric(memory_df['importance'], errors='coerce').fillna(0.0)
        memory_df['score'] = (self.alpha_recency * memory_df['recency'] +
                              self.alpha_importance * importance +
                              self.alpha_relevance * memory_df['relevance'])

        # Allows you to filter on specified relevant memories and only update access time for those.
        if filter_statement is not None:
            memory_df = memory_df.query(filter_statement)

        memory_df = memory_df.sort_values(by='score', ascending=False).head(self.context_window_size)

        # Update latest access time of memories caught in the context window.
        # Go through the stored object references: looking memories up by
        # description touches the wrong entry when two memories share a text.
        for memory_obj in memory_df['pointer']:
            memory_obj.most_recent_access_timestamp = datetime.now()

        return memory_df

    def get_memory_df(self):
        """Return every memory's attributes as a DataFrame, one row per memory in stream order."""
        return pd.DataFrame({
            'description': [memory.description for memory in self.memories],
            'importance': [memory.importance for memory in self.memories],
            'creation_timestamp': [memory.creation_timestamp for memory in self.memories],
            'most_recent_access_timestamp': [memory.most_recent_access_timestamp for memory in self.memories],
            'memory_type': [memory.memory_type for memory in self.memories],
            'embedding_vector': [memory.embedding_vector for memory in self.memories]
        })

In [None]:
# Random, GPT-generated list of data-quality observations used to seed the memory stream.
# Several descriptions appear more than once (for example "Data in bitcoin_transactions
# table is not properly indexed"), so descriptions are not unique identifiers.
memory_list = [
    "8% of values in the bitcoin_transaction_data are missing",
    "Ingestion of bitcoin_price_data is slow",
    "Inconsistent address formatting in bitcoin_addresses",
    "user_dims table contains duplicates for some users",
    "wallet_event_logs table is missing data for some events",
    "bitcoin_transactions table has missing information for some transactions",
    "Data in ethereum_erc20_tokens_dim table is not consistently formatted",
    "user_dims table is missing data for a large number of users",
    "wallet_event_logs table has incorrect data for some events",
    "Inconsistent date formatting in bitcoin_price_data",
    "Data in bitcoin_transactions table is not properly indexed",
    "Data in ethereum_erc20_tokens_dim table is missing information for some tokens",
    "user_dims table has incorrect information for some users",
    "wallet_event_logs table is not properly indexed",
    "bitcoin_price_data has a large number of missing values",
    "Data in bitcoin_transactions table is missing information for a large number of transactions",
    "ethereum_erc20_tokens_dim table has incorrect information for some tokens",
    "user_dims table has outdated information for some users",
    "wallet_event_logs table has outdated information for some events",
    "Incorrect data types in bitcoin_price_data",
    "Data in bitcoin_transactions table is not properly formatted",
    "ethereum_erc20_tokens_dim table is missing data for a large number of tokens",
    "user_dims table has inconsistent information for some users",
    "wallet_event_logs table has inconsistent information for some events",
    "Inconsistent data types in bitcoin_transactions",
    "Data in ethereum_erc20_tokens_dim table is not properly formatted",
    "user_dims table has missing information for some users",
    "wallet_event_logs table has missing information for some events",
    "Incorrect data types in ethereum_erc20_tokens_dim",
    "Data in bitcoin_transactions table is not properly indexed",
    "ethereum_erc20_tokens_dim table has incorrect information for some tokens",
    "user_dims table has incorrect information for some users",
    "wallet_event_logs table has incorrect information for some events",
    "Inconsistent data formats in bitcoin_price_data",
    "Data in bitcoin_transactions table is missing information for a large number of transactions",
    "ethereum_erc20_tokens_dim table is missing data for a large number of tokens",
    "user_dims table has outdated information for some users",
    "wallet_event_logs table has outdated information for some events",
    "Incorrect data formats in bitcoin_transactions",
    "Data in ethereum_erc20_tokens_dim table is not properly indexed",
    "user_dims table has inconsistent information for some users",
    "wallet_event_logs table has inconsistent information for some events",
    "user_dims table has inconsistent data types for age and income columns",
    "wallet_event_logs table is missing data for several events",
    "bitcoin_price_data has a significant amount of missing data for some days",
    "Some bitcoin_transactions are duplicated in the data",
    "ethereum_erc20_tokens_dim table is missing data for several tokens",
    "user_dims table has a large number of missing values for occupation",
    "wallet_event_logs table has inaccurate data for event timestamps",
    "bitcoin_price_data has a large number of zero values for some days",
    "Some bitcoin_transactions have negative values in the amount column",
    "ethereum_erc20_tokens_dim table has inconsistent data types for token symbol and name columns",
    "user_dims table has a large number of missing values for country",
    "wallet_event_logs table has missing data for some user IDs",
    "bitcoin_price_data has a large number of outliers in the daily price",
    "Some bitcoin_transactions have invalid addresses in the address column",
    "ethereum_erc20_tokens_dim table has inaccurate data for token prices",
    "user_dims table has a large number of duplicate user IDs",
    "wallet_event_logs table has missing data for some event types",
    "bitcoin_price_data has inconsistent data for weekends and holidays",
    "Some bitcoin_transactions have inconsistent data types for the transaction ID",
    "ethereum_erc20_tokens_dim table has missing data for some token descriptions",
    "user_dims table has inconsistent data for user email addresses",
    "wallet_event_logs table has missing data for some wallet IDs",
    "bitcoin_price_data has a large number of missing data for some months",
    "Some bitcoin_transactions have invalid values for the transaction fee",
    "ethereum_erc20_tokens_dim table has inconsistent data for token supply"
]

In [None]:
# Start from an empty stream. The three score components are weighted equally,
# recency is 0.99 raised to the seconds since last access, and each retrieval
# returns at most 5 memories.
memory_stream = MemoryStream(
    memories=[],
    alpha_recency=1,
    alpha_importance=1,
    alpha_relevance=1,
    decay_factor=0.99,
    context_window_size=5,
)

# Every Memory construction calls the OpenAI API, hence the progress bar.
for memory_desc in tqdm(memory_list):
    observation = Memory(memory_desc, datetime.now(), datetime.now(), 'observation')
    memory_stream.add_memory(observation)

100%|██████████| 67/67 [01:07<00:00,  1.00s/it]


In [None]:
# Retrieve memories from the memory stream based on the query memory
query_memory_description = "Our query requires the use of bitcoin_price_data"
query_memory = Memory(query_memory_description, datetime.now(), datetime.now(), "observation")
# The stored importance is the model's 1-10 rating divided by 10, so it is on a 0-1 scale.
print(f"Importance score for new memory (0-1 scale): {query_memory.importance}")
retrieved_mems = memory_stream.retrieve_memories(query_memory)
retrieved_mems

Importance score for new memory (1-10): 0.2


Unnamed: 0,description,importance,recency,relevance,pointer,score
64,bitcoin_price_data has a large number of missi...,0.2,0.96073,0.912454,<__main__.Memory object at 0x7f75ea3e3340>,2.073183
59,bitcoin_price_data has inconsistent data for w...,0.2,0.863146,0.845449,<__main__.Memory object at 0x7f75ea3e38b0>,1.908595
54,bitcoin_price_data has a large number of outli...,0.2,0.771329,0.929703,<__main__.Memory object at 0x7f75ea3e3be0>,1.901032
65,Some bitcoin_transactions have invalid values ...,0.2,0.979578,0.694479,<__main__.Memory object at 0x7f75ea3e3490>,1.874057
44,bitcoin_price_data has a significant amount of...,0.2,0.616422,0.936387,<__main__.Memory object at 0x7f75ea4b9190>,1.752809


In [None]:
# Snapshot every memory's attributes (timestamps, type, embedding) as a DataFrame.
memory_archive = memory_stream.get_memory_df()
memory_archive

Unnamed: 0,description,importance,creation_timestamp,most_recent_access_timestamp,memory_type,embedding_vector
0,8% of values in the bitcoin_transaction_data a...,0.2,2023-04-18 14:29:45.639229,2023-04-18 14:29:45.639233,O,"[-0.009523766115307808, -0.01182722207158804, ..."
1,Ingestion of bitcoin_price_data is slow,0.2,2023-04-18 14:29:46.489823,2023-04-18 14:29:46.489836,O,"[-0.016384730115532875, 0.010251141153275967, ..."
2,Inconsistent address formatting in bitcoin_add...,0.2,2023-04-18 14:29:47.595264,2023-04-18 14:29:47.595268,O,"[-0.012446166016161442, 0.011733734980225563, ..."
3,user_dims table contains duplicates for some u...,0.2,2023-04-18 14:29:48.464096,2023-04-18 14:29:48.464099,O,"[-0.005781781394034624, -0.00407521752640605, ..."
4,wallet_event_logs table is missing data for so...,0.2,2023-04-18 14:29:49.443213,2023-04-18 14:29:49.443216,O,"[-0.006027030758559704, -0.009225854650139809,..."
...,...,...,...,...,...,...
62,user_dims table has inconsistent data for user...,0.2,2023-04-18 14:30:48.027821,2023-04-18 14:30:48.027823,O,"[-0.005743246991187334, -0.007196654565632343,..."
63,wallet_event_logs table has missing data for s...,0.2,2023-04-18 14:30:49.041145,2023-04-18 14:30:49.041147,O,"[0.0018187625100836158, -0.009912618435919285,..."
64,bitcoin_price_data has a large number of missi...,0.2,2023-04-18 14:30:50.092275,2023-04-18 14:30:55.231230,O,"[-0.021600276231765747, -0.017947591841220856,..."
65,Some bitcoin_transactions have invalid values ...,0.2,2023-04-18 14:30:51.018669,2023-04-18 14:30:55.231250,O,"[0.009053058922290802, -0.012646075338125229, ..."


# Reflection (special kind of Memory)

Reflections are produced in two steps. First call `generate_insight(memory_stream, context_window, threshold)`: the `threshold` argument is the minimum sum of importance scores that must be reached before any insight is generated, and the `recent_memories` variable holds the `context_window` most recent memories in the memory stream. The questions (`question_responses`) and insights (`insight_responses`) are generated by calling the OpenAI API on prompts built from the recent memories and from the memories retrieved for each question. Then call `generate_reflection(memory_stream, insights_list)`: each insight becomes an instance of the `Reflection` class, with the description set to the insight text, creation and access timestamps set to the current time, and `pointers` set to the indices of the memories cited as evidence. Finally, each reflection is added to the memory stream using the `memory_stream.add_memory` method.

In the context of the `Reflection` class, the `pointers` parameter is a list of integer indices identifying the memories that were cited as evidence for the insight generated by the language model. The pointers refer to the memories in the memory stream that served as the basis for the reflection.

In [None]:
class Reflection(Memory):
    """A Memory whose description is a generated insight.

    Besides the regular Memory attributes it keeps `pointers`, the evidence
    references supplied by the caller.
    """

    def __init__(self, description, creation_timestamp, most_recent_access_timestamp, pointers):
        # Memory.__init__ handles scoring and embedding; the type label is fixed here.
        super().__init__(
            description,
            creation_timestamp,
            most_recent_access_timestamp,
            "reflection",
        )
        self.pointers = pointers

def parse_insights(insight_responses):
    """Extract ``(insight, cited_indices)`` pairs from a numbered list.

    Each line is expected to look like ``"1. some insight [3, 7]"``. Lines that
    do not match are ignored.

    Args:
        insight_responses: Raw text returned by the language model.

    Returns:
        A list of ``(insight_text, [int, ...])`` tuples in order of appearance.
        The index list is empty when the brackets contain no number.
    """
    pattern = re.compile(r"(\d+\.) (.*) \[(.*)\]")
    result = []
    for _, insight, citations in pattern.findall(insight_responses):
        # Pull out every integer individually. Stripping non-digits from each
        # comma-separated chunk crashed on empty chunks and glued together
        # citations such as "3 and 4" into 34.
        index = [int(number) for number in re.findall(r'\d+', citations)]
        result.append((insight, index))
    return result

def condense_insights(insight_list):
    """Ask gpt-3.5-turbo to compress `insight_list` into at most five numbered points.

    The text is interpolated into the user prompt as-is and the model's reply
    is returned unmodified. Performs one chat-completion API call.
    """
    system_prompt = '''
    You are an AI who is an expert at condensing information into concise numbered, bullet points. 
    Please condense the following insights into a maximum of 5 bullet points, each starting with a number followed by a period.
    Include all the appropriate index numbers referencing the information in square brackets, listed at the end of each insight.
    Do not include multiple, unrelated insights in one bullet point. Ensure each insight explicitly mentions the subject by name. 
    '''
    user_prompt = f"""
    Please condense the insights generated in this list:

    {insight_list}

    """

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        temperature=0.0,
    )
    return completion.choices[0]['message']["content"]


def _sum_importance(memories):
    """Add up importance scores, counting missing (None/NaN) scores as zero."""
    total = 0.0
    for memory in memories:
        score = memory.importance
        if score is None or score != score:  # `x != x` is True only for NaN
            continue
        total += score
    return total


def _format_statements(memories_df):
    """Render retrieved memories as 'index. description' lines, one per memory."""
    return '\n'.join(f"{idx}. {text}" for idx, text in memories_df['description'].items())


def generate_insight(memory_stream, context_window, threshold, max_attempts=5):
    """Derive questions and cited insights from the most recent memories.

    Steps: take the last `context_window` memories, ask the model for up to
    three high-level questions about them, retrieve the memories relevant to
    each question (normalised relevance > 0.5), then ask the model for insights
    that cite those memories by their index in the stream.

    Args:
        memory_stream: Stream exposing `memories`, `context_window_size` and
            `retrieve_memories`.
        context_window: Number of most recent memories to reflect on.
        threshold: Minimum summed importance of those memories required to
            proceed.
        max_attempts: Attempts per question to obtain insights.

    Returns:
        None when there are no recent memories or their summed importance is
        below `threshold`; otherwise ``(questions_fmt, insights_list)`` where
        `insights_list[i]` is the list of ``(insight, indices)`` tuples for
        `questions_fmt[i]`.

    Raises:
        Exception: when insights for a question could not be produced within
            `max_attempts`; the last underlying error is chained as the cause.

    Side effects:
        Sets `memory_stream.context_window_size` to `context_window`, and
        retrieval refreshes the access timestamps of the memories it returns.
    """
    recent_memories = memory_stream.memories[-context_window:]
    memory_stream.context_window_size = context_window

    if len(recent_memories) == 0:
        return None

    # Memories whose importance scoring failed carry None; they must not crash the sum.
    if _sum_importance(recent_memories) < threshold:
        return None

    recent_memory_prompt = '\n'.join([f"{i+1}. {memory.description}" for i, memory in enumerate(recent_memories)])

    questions_intel = '''
    You are an intelligent AI system designed to reflect on past experiences and generate meaningful insights. 
    Your goal is to help the user gain a deeper understanding of their experiences and identify patterns or themes that may not have been immediately obvious. 
    When generating questions, focus on high-level, abstract questions that will prompt the subject to think more deeply about their experiences. 
    Avoid simple, surface-level questions that can easily be answered by the memories themselves. 
    '''
    question_context = f"""
    Given only the information above, what are 3 most salient high-level questions we can answer about the subjects in the statements? 
    Be as specific as possible and refer to subjects explicitly by their name.\n\nStatements about subjects\n{recent_memory_prompt}"""

    question_responses = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": questions_intel},
          {"role": "user", "content": question_context}
      ]
    )
    questions = question_responses.choices[0]['message']["content"]
    # Skip blank separator lines so an empty string is never treated as a question.
    questions_fmt = [line for line in questions.split('\n') if line.strip()][:3]

    # Retrieve memories from the memory stream relevant to the question
    relevant_memories = []
    for question in questions_fmt:
        question_memory = Memory(question, datetime.now(), datetime.now(), "reflection")
        relevant_memories.append(memory_stream.retrieve_memories(question_memory, filter_statement='relevance > 0.5'))

    insight_intel = '''
    You are an intelligent AI system designed to reflect on past experiences and generate meaningful insights.
    Your goal is to help yourself gain a deeper understanding of your experiences and identify patterns or themes that may not have been immediately obvious.
    When generating insights, focus on high-level, abstract insights that will prompt you to think more deeply about your experiences.
    Cite the index of the statement as evidence, do not explicitly mention the statement or evidence. The desired format is: "
    1. insight [`index number(s) of evidence`]
    2. insight [`index number(s) of evidence`]
    3. insight [`index number(s) of evidence`]
    etc"

    REQUIREMENTS:
    - You MUST ensure to use square brackets enclosing the index numbers at the end of each insight.
    - You MUST only include one insight per line.
    - You MUST be as concise and specific as possible
    '''

    print('Generating reflections ...')
    insights_list = []
    for question, relevant_memories_for_question in tqdm(list(zip(questions_fmt, relevant_memories))):
        # Spell the statements out explicitly: interpolating the pandas Series
        # directly sends its repr, which truncates long descriptions and adds
        # a "Name: ..., dtype: ..." footer.
        statements = _format_statements(relevant_memories_for_question)

        insight_context = f"What high-level insights can you infer from the above statements? Cite the index of the statement as evidence, do not explicitly mention the statement or evidence. Be as specific and concise as possible; mention explicitly subject names for each insight. (example format: - insight [`index of evidence`] \n\nStatements about subjects\n{statements}\n\n{question}"

        last_error = None
        for _ in range(max_attempts):
            try:
                insight_responses = openai.ChatCompletion.create(
                  model="gpt-3.5-turbo",
                  messages=[
                      {"role": "system", "content": insight_intel},
                      {"role": "user", "content": insight_context}
                  ]
                )
                insights = insight_responses.choices[0]['message']["content"]
                condensed_insight = condense_insights(insights)
                insights_fmt = parse_insights(condensed_insight)
                break
            except Exception as e:
                last_error = e
        else:
            # No attempt succeeded (or none was allowed): surface the real cause.
            raise Exception("Max attempts reached. Failed to retrieve insights.") from last_error

        insights_list.append(insights_fmt)
    return questions_fmt, insights_list

def generate_reflection(memory_stream, insights_list):
    """Store every insight in `insights_list` as a Reflection in `memory_stream`.

    `insights_list` holds one list per question; each entry provides the
    insight text at position 0 and its evidence pointers at position 1.
    Returns the same `memory_stream` object that was passed in.
    """
    all_entries = (entry for per_question in insights_list for entry in per_question)
    for entry in all_entries:
        memory_stream.add_memory(
            Reflection(entry[0], datetime.now(), datetime.now(), pointers=entry[1])
        )
    return memory_stream


In [None]:
# Generate insight list.
# generate_insight returns None (not a tuple) when the stream is empty or when the
# summed importance of the last `context_window` memories is below `threshold`;
# in that case the tuple unpacking on the next line raises TypeError.
questions_fmt, insights_list = generate_insight(memory_stream, context_window=15, threshold=3)

Generating reflections ...


100%|██████████| 3/3 [00:41<00:00, 13.99s/it]


In [None]:
# The (at most three) questions the model posed about the recent memories.
questions_fmt

['1. How does the large number of missing values for country in the user_dims table impact data analysis and decision-making for marketing campaigns and user segmentation? ',
 '2. How can the missing data for some user IDs in the wallet_event_logs table affect the analysis of user behavior and the accuracy of predictions based on that data?',
 '3. What impact does the large number of outliers in the daily price of bitcoin in the bitcoin_price_data table have on the accuracy of historical trend analysis and predictions for the future?']

In [None]:
# One list per question; each entry is (insight text, cited memory indices).
insights_list

[[('Missing data in user_dims table for country may impact accuracy of data analysis and decision-making for marketing campaigns and user segmentation.',
   [7, 52, 47]),
  ('Inconsistent information in user_dims table may lead to incorrect analysis and decision-making for user segmentation and marketing campaigns.',
   [22, 40]),
  ('Duplicated and inconsistent data in user_dims table may impact accuracy of data-driven decisions for user segmentation and marketing campaigns.',
   [3, 57, 62, 42]),
  ('Large number of missing values in bitcoin_price_data table may affect accuracy of analysis and decision-making for cryptocurrency investments.',
   [14])],
 [('The user_dims table has data issues for many users',
   [26, 7, 22, 12, 17, 62, 47, 57, 52]),
  ('Missing data for specific user IDs in the wallet_event_logs table can affect behavior analysis and predictions',
   [53, 63, 43, 27, 58, 4])],
 [('The bitcoin_price_data table has missing and inconsistent data, which may affect the ac

In [None]:
# Add the reflections to the memory stream.
# NOTE: some importance scores on reflections come back as NaN.
updated_memory_stream = generate_reflection(memory_stream, insights_list)

In [None]:
# generate_reflection returns the stream it was given, so `memory_stream`
# already contains the reflections added above.
updated_memory_archive = memory_stream.get_memory_df()

In [None]:
# Show the last 20 rows of the archive.
updated_memory_archive.tail(20)

Unnamed: 0,description,importance,creation_timestamp,most_recent_access_timestamp,memory_type,embedding_vector
56,ethereum_erc20_tokens_dim table has inaccurate...,0.2,2023-04-18 14:30:42.489418,2023-04-18 14:30:42.489421,O,"[-0.0005994586390443146, -7.806596113368869e-0..."
57,user_dims table has a large number of duplicat...,0.2,2023-04-18 14:30:43.366718,2023-04-18 14:33:09.214406,O,"[-0.007843722589313984, -0.003966702148318291,..."
58,wallet_event_logs table has missing data for s...,0.2,2023-04-18 14:30:44.128639,2023-04-18 14:33:09.214436,O,"[-0.005605914164334536, -0.01496855728328228, ..."
59,bitcoin_price_data has inconsistent data for w...,0.2,2023-04-18 14:30:45.153043,2023-04-18 14:33:10.011618,O,"[-0.02271866798400879, 0.0019283471629023552, ..."
60,Some bitcoin_transactions have inconsistent da...,0.2,2023-04-18 14:30:46.287458,2023-04-18 14:30:46.287461,O,"[-0.00015052920207381248, -0.00335211539641022..."
61,ethereum_erc20_tokens_dim table has missing da...,0.2,2023-04-18 14:30:47.279668,2023-04-18 14:30:47.279671,O,"[0.00046313315397128463, -0.000905761844478547..."
62,user_dims table has inconsistent data for user...,0.2,2023-04-18 14:30:48.027821,2023-04-18 14:33:09.214396,O,"[-0.005743246991187334, -0.007196654565632343,..."
63,wallet_event_logs table has missing data for s...,0.2,2023-04-18 14:30:49.041145,2023-04-18 14:33:09.214421,O,"[0.0018187625100836158, -0.009912618435919285,..."
64,bitcoin_price_data has a large number of missi...,0.2,2023-04-18 14:30:50.092275,2023-04-18 14:33:10.011613,O,"[-0.021600276231765747, -0.017947591841220856,..."
65,Some bitcoin_transactions have invalid values ...,0.2,2023-04-18 14:30:51.018669,2023-04-18 14:30:55.231250,O,"[0.009053058922290802, -0.012646075338125229, ..."
