In [1]:
from openai import OpenAI
import os
import json
import random

from chatgpt_parser import load_conversations, extract_linear_conversation

# Path handed to load_conversations() further down in the notebook.
FILE_PATH = "data/conversations.json"
# Backend selector taken from the environment ("openai" when unset).
# Surrounding whitespace and letter case are normalised so that values such as
# "Local" or " openai " select the intended backend instead of raising below.
LLM_MODE = os.getenv("LLM_MODE", "openai").strip().lower()

if LLM_MODE == "local":
    # Ollama client with OpenAI-compatible API
    client = OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama"  # required but unused
    )
    MODEL_NAME = "llama3.1:8b"
    print(f"Using local LLaMA model: {MODEL_NAME}")
elif LLM_MODE == "openai":
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    MODEL_NAME = "gpt-4o-mini"
    print(f"Using OpenAI model: {MODEL_NAME}")
else:
    # Fail early: every later cell needs both `client` and `MODEL_NAME`.
    raise ValueError(f"Invalid LLM_MODE: {LLM_MODE}. Use 'local' or 'openai'")

# Seed the module-level RNG so random-based steps repeat on a fresh run.
random.seed(42)

Using local LLaMA model: llama3.1:8b


In [19]:
# with open(FILE_PATH, "r") as f:
#     conversations = json.load(f)

# print(f"Loaded {len(conversations)} conversations")
# conversations[0].keys()

In [2]:
# Load the conversations stored at FILE_PATH and report how many were read.
raw_data = load_conversations(FILE_PATH)
num_conversations = len(raw_data)
print(num_conversations)

3418


In [58]:
# for i in extract_linear_conversation(raw_data[202]):
#     print(f"[{i['role']}] {i['text']}\n")

In [None]:
# Sample with a dedicated generator instead of the module-level one: the global
# RNG advances on every call, so re-running this cell used to pick a different
# set of conversations. Seed 42 matches the global seed set in the setup cell.
# The size is capped so that inputs with fewer than 50 conversations do not
# raise ValueError.
convs_selected = random.Random(42).sample(raw_data, min(50, len(raw_data)))
# convs_selected[0]

In [39]:
def flatten_turn(turn: dict, agent_mode: str = 'summarised_conversation') -> str:
    """Render one turn as a labelled text block for use inside a prompt.

    Keys are examined in the dict's iteration order; the first key that is
    either ``'user'`` or equal to ``agent_mode`` decides the result.

    Args:
        turn: Mapping from a key such as ``'user'`` to the text stored under it.
        agent_mode: Key whose value is rendered under the ``[agent]`` label.

    Returns:
        ``"[user]\\n<text>\\n"`` or ``"[agent]\\n<text>\\n"`` for the first
        matching key, or ``''`` when no key matches.
    """
    # 'user' is inserted last so it keeps the "[user]" label even when
    # agent_mode is itself 'user'.
    labels = {agent_mode: 'agent', 'user': 'user'}
    rendered = (
        f"[{labels[key]}]\n{text}\n"
        for key, text in turn.items()
        if key in labels
    )
    return next(rendered, '')

In [None]:
from tqdm import tqdm

# Prompt prefix for the per-message summarisation call. The conversation so far
# and the assistant message to summarise are appended to it inside the loop.
base_prompt = """
You are an expert in summarizing content. You will be given a conversation between an agent and a human and your task is to summarise the last message content in a way that no content
is lost but we avoid all the details that are not relevant to understand the conversation the user is having. give brief simple bullet points (no nested, no formatting) that cover the topic of what was talked.
The content should be the minimum for someone reading the agent's summary to understand the users's response.
Keep the language of the conversation.

[OUTPUT EXAMPLE]
- Explica al usuario qué evitar, cómo aliviar el dolor, qué alimentos consumir suaves.
- Mejora típica en 48-72 horas, sugiere consultar con un médico si persisten síntomas graves.
[END OF EXAMPLE]

This is the turn to summarise:
"""

# One record per sampled conversation: metadata plus a list of turns.
# User turns are stored verbatim; assistant turns keep the full text and gain a
# model-written summary under the "summarised_conversation" key.
summarised_conversation = []
for conv in tqdm(convs_selected, desc="Processing conversations"):
    new_conversation_record = {
        "id": conv['id'],
        "title": conv['title'],
        "create_time": conv['create_time'],
        "update_time": conv['update_time'],
        "turns": []
    }
    for message in extract_linear_conversation(conv):
        role = message['role']
        if role == 'user':
            turn = {role: message['text']}
        elif role == 'assistant':
            # Context = earlier turns (assistant ones in summarised form),
            # followed by the full assistant message being summarised.
            history = "\n".join(
                flatten_turn(previous) for previous in new_conversation_record['turns']
            )
            prompt = base_prompt + history + f"{message['role']}: {message['text']}"
            response = client.chat.completions.create(
                model=MODEL_NAME,  # Use dynamic model name
                messages=[{"role": "user", "content": prompt}]
            )
            turn = {role: message['text'], "summarised_conversation": response.choices[0].message.content}
        else:
            # Should the parser ever yield another role, skip it: falling
            # through would re-append the previous `turn` (or raise NameError
            # when it is the very first message).
            continue
        new_conversation_record['turns'].append(turn)
    summarised_conversation.append(new_conversation_record)

In [56]:
# for conv in summarised_conversation:
#     for turn in conv['turns']:
#         print(flatten_turn(turn))
#     print("--------")

In [9]:
DEBUG_FILE_NAME = "summarised_conversation_debug.json"

def save_conversation_history(conversation_data: list, filename: str):
    """Write ``conversation_data`` to ``filename`` as indented JSON.

    Args:
        conversation_data: JSON-serialisable data to store.
        filename: Destination path; an existing file is overwritten.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when the data
            cannot be serialised. This happens before the destination is
            opened, so an existing file is left untouched.

    I/O errors while writing are reported with ``print`` and not re-raised.
    """
    # Serialise up front: opening with 'w' truncates the file immediately, so
    # dumping straight into the handle would leave a partial, invalid JSON file
    # behind if serialisation failed half-way.
    # Use indent=4 for a human-readable file format
    payload = json.dumps(conversation_data, indent=4)
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"Conversation history successfully saved to **{filename}**.")
    except IOError as e:
        print(f"Error saving file **{filename}**: {e}")

def load_conversation_history(filename: str) -> list:
    """Read a JSON file and return its decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``[]`` when the path does not exist, the
        file cannot be read, or its content is not valid JSON. Each outcome is
        reported with ``print``.
    """
    if os.path.exists(filename):
        try:
            with open(filename, 'r', encoding='utf-8') as handle:
                history = json.load(handle)
                print(f"Conversation history successfully loaded from **{filename}**.")
                return history
        except json.JSONDecodeError as err:
            print(f"Error decoding JSON from **{filename}**: {err}. Returning an empty list.")
        except IOError as err:
            print(f"Error reading file **{filename}**: {err}. Returning an empty list.")
    else:
        print(f"File **{filename}** not found. Starting with an empty conversation history.")

    # Every failure path ends here with an empty history.
    return []

In [None]:
# Preview only the first record: the full list holds every turn of all sampled
# conversations and would flood the cell output with raw conversation text.
summarised_conversation[:1]

In [10]:
# Persist the summarised records so later cells can reload them from disk.
save_conversation_history(
    conversation_data=summarised_conversation,
    filename=DEBUG_FILE_NAME,
)

Conversation history successfully saved to **summarised_conversation_debug.json**.


In [12]:
# Reload the records written by the save cell. DEBUG_FILE_NAME is reused (it
# holds the same path the literal did) so save and load cannot drift apart.
samples_convs = load_conversation_history(DEBUG_FILE_NAME)
print(len(samples_convs))

Conversation history successfully loaded from **summarised_conversation_debug.json**.
50


In [55]:
# [
#     print(f"[{r}] {t}\n")
#     for conv in samples_convs
#     for turns in conv['turns']
#     for r, t in turns.items()
# ];

In [53]:
from tqdm import tqdm
# Prompt prefix for the memory-extraction call; the flattened conversation is
# appended to it below.
# NOTE(review): this rebinds `base_prompt`, which the summarisation cell also
# defines — re-run that cell's own assignment before re-running its loop.
base_prompt = """
[System Role] You are a dedicated Memory Manager. Your sole purpose is to extract actionable, long-term user data from conversations to build a personalized user profile.

[Extraction Criteria] Extract details ONLY if they fall into these categories:
- Explicit Preferences: Likes, dislikes, dietary restrictions, favorite items/media.
- Biographical current or historic facts: Name, location, job, age, family members, pets, facts about previous life.
- Recurring Routines: Daily habits, schedules, frequent activities.
- Future Intent: Specific upcoming plans, goals, or milestones.

[Exclusion Criteria]
- Ignore temporary states (e.g., "I am hungry now").
- Ignore general conversation topics or opinions unless they indicate a strong preference.
- Ignore summaries of the chat.

[Format Constraints]
- Output strictly a bulleted list.
- Do not include introductory or concluding text.
- The facts that you store should contain all the relevant context that make that piece of data wholesome.
- If no relevant data is found, output "None".

[Input Conversation]
"""

samples_convs_with_memories = []
for conv in tqdm(samples_convs, desc="Processing conversations"):
    # flatten_turn's default renders assistant turns from their summaries.
    prompt = base_prompt + "\n".join(flatten_turn(turn) for turn in conv['turns'])
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}]
    )
    # Build a new record instead of writing into `conv`, so the records loaded
    # into `samples_convs` stay exactly as they were read from disk.
    samples_convs_with_memories.append(
        {**conv, 'memories': response.choices[0].message.content}
    )

Processing conversations: 100%|██████████| 50/50 [24:51<00:00, 29.84s/it]


In [54]:
# Persist the records together with their extracted memories.
save_conversation_history(
    conversation_data=samples_convs_with_memories,
    filename='summarised_conversation_debug_memories.json',
)

Conversation history successfully saved to **summarised_conversation_debug_memories.json**.


In [13]:
from datetime import datetime

class Memory:
    """Holds a memory payload plus two views of the conversations behind it.

    Attributes:
        creation: ISO-8601 timestamp taken from ``datetime.now()`` at construction.
        memory: The ``memory`` argument, stored as given.
        full_conversation: Conversations with the complete assistant text.
        summarised_conversation: Conversations with assistant text replaced by
            its stored summary.
    """

    def __init__(self, memory: dict, conversations: list):
        self.creation = datetime.now().isoformat()
        self.memory = memory
        parsed = self.parse_conversations(conversations)
        self.full_conversation, self.summarised_conversation = parsed

    def parse_conversations(self, conversations: list) -> tuple[list, list]:
        """Build the full and the summarised view of ``conversations``.

        Args:
            conversations: Records with ``'id'`` and ``'title'`` keys and
                optional ``'create_time'``, ``'update_time'`` and ``'turns'``.

        Returns:
            ``(full, summarised)``: two lists of records in input order, each
            carrying the conversation metadata and a ``'turns'`` list of
            ``{'role', 'content'}`` dicts.
        """
        full_convs, summarised_convs = [], []

        for conv in conversations:
            full_turns, summarised_turns = [], []

            for turn in conv.get('turns', []):
                if 'user' in turn:
                    # User text is identical in both views; each view gets its
                    # own dict object.
                    for view in (full_turns, summarised_turns):
                        view.append({'role': 'user', 'content': turn['user']})
                    continue

                if 'assistant' not in turn:
                    # Turns without a user or assistant key are ignored.
                    continue

                full_turns.append({'role': 'assistant', 'content': turn['assistant']})
                # An assistant turn lacking a summary appears only in the full view.
                if 'summarised_conversation' in turn:
                    summarised_turns.append({
                        'role': 'assistant',
                        'content': turn['summarised_conversation'],
                    })

            # Metadata shared by both views; 'id' and 'title' are mandatory.
            metadata = {
                'id': conv['id'],
                'title': conv['title'],
                'create_time': conv.get('create_time'),
                'update_time': conv.get('update_time'),
            }
            full_convs.append({**metadata, 'turns': full_turns})
            summarised_convs.append({**metadata, 'turns': summarised_turns})

        return full_convs, summarised_convs