In [110]:
import json
import os
import pandas as pd
import re
from urllib.parse import urlparse

In [111]:
logs_dir = "../../../logs"
log_file_paths = []
for root, dirs, files in os.walk(logs_dir):
    if "archive" in root:
        continue

    for file in files:
        if file == "log.jsonl":
            log_file_path = os.path.join(root, file)  # Construct file path
            log_file_paths.append(log_file_path)
print(log_file_paths)

['../../../logs/vision_magnetic_easy_01_20241215230819/log.jsonl', '../../../logs/vision_magnetic_advanced_02_20241217204927/log.jsonl', '../../../logs/vision_magnetic_advanced_01_20241217203222/log.jsonl', '../../../logs/vision_magnetic_easy_00_20241215221928/log.jsonl', '../../../logs/vision_magnetic_advanced_00_20241215233606/log.jsonl']


In [112]:
# TODO: Replanning: ../../../logs/vision_magnetic_easy_01_20241215230819/log.jsonl

In [214]:
log_file_path = log_file_paths[4]
print(log_file_path)

IndexError: list index out of range

In [195]:
# take log_file_path, remove "log.jsonl" and add "trajectory.csv"
csv_file_path  = log_file_path.replace("log.jsonl", "trajectory.csv")

In [196]:

# Define CSV column names
csv_columns = ["log_entry", "type", "source", "action", "url", "website_type", "browsing_stage", "action_goal", "orchestrator", "is_in_loop", "is_progress_being_made", "description"]


In [197]:
# Read and process JSONL log file
log_entries = []
errors = []  # Store unexpected "Orchestrator" sources

with open(log_file_path, "r", encoding="utf-8") as jsonl_file:
    for line in jsonl_file:
        try:
            entry = json.loads(line.strip())

            # Ignore "LLMCallEvent" entries
            if entry.get("type") == "LLMCallEvent":
                continue

            # Extract relevant fields
            source = entry.get("source", "")
            message = entry.get("message", "")
            timestamp = entry.get("timestamp", "")

            # Default: orchestrator_value, is_in_loop, is_progress_being_made, description are empty
            orchestrator_value = ""
            is_in_loop_value = ""
            is_progress_being_made_value = ""
            description_value = ""

            # Assign orchestrator values based on the clarified conditions
            if source.startswith("Orchestrator"):
                if source in [
                    "Orchestrator (-> WebSurfer)", 
                    "Orchestrator (-> Coder)", 
                    "Orchestrator (-> Executor)"
                ]:
                    orchestrator_value = "outer"

                elif source == "Orchestrator (termination condition)":
                    orchestrator_value = "outer"
                    description_value = "limit reached"

                elif source == "Orchestrator (thought)":
                    # Check predefined conditions
                    if message.startswith("Next speaker") or message.startswith("Initial plan"):
                        orchestrator_value = "outer"
                    elif message.startswith("Updated Ledger:"):
                        orchestrator_value = "inner"

                        # Extract the JSON part after "Updated Ledger:"
                        try:
                            json_part = message[len("Updated Ledger:"):].strip()
                            if json_part:
                                ledger_json = json.loads(json_part)  # Convert to dict
                                is_in_loop_value = ledger_json.get("is_in_loop", {}).get("answer", None)
                                is_progress_being_made_value = ledger_json.get("is_progress_being_made", {}).get("answer", None)
                            else:
                                errors.append(f"Empty JSON after 'Updated Ledger:' in message: {message}")
                        except json.JSONDecodeError as e:
                            errors.append(f"Failed to parse Updated Ledger JSON: {message}\nError: {str(e)}")

                    elif message == "Stalled.... Replanning...":
                        orchestrator_value = "outer"
                        description_value = "plan stopped"

                    elif message.lower().startswith("new plan"):
                        orchestrator_value = "outer"
                        description_value = "plan stopped"

                    else:
                        errors.append(f"Unexpected Orchestrator source: {source}, message: {message}")

                else:
                    errors.append(f"Unexpected Orchestrator source: {source}, message: {message}")

            # Append processed log entry
            log_entries.append({
                "timestamp": timestamp,
                "log_entry": json.dumps(entry),
                "type": entry.get("type", ""),
                "source": source,
                "action": entry.get("action", ""),
                "url": entry.get("url", ""),
                "website_type": "",  # Placeholder
                "browsing_stage": "",  # Placeholder
                "action_goal": "",  # Placeholder
                "orchestrator": orchestrator_value,  # Only set for specific Orchestrator cases
                "is_in_loop": is_in_loop_value,  # Extracted JSON value for "inner" orchestrator
                "is_progress_being_made": is_progress_being_made_value,  # Extracted JSON value for "inner" orchestrator
                "description": description_value  # Now properly classified
            })
        except json.JSONDecodeError as e:
            errors.append(f"Skipping invalid JSON line due to decoding error: {str(e)}")
            continue  # Skip invalid JSON lines

# Print errors if any unexpected "Orchestrator" sources are found
if errors:
    print("\nErrors Detected in Orchestrator Sources")
    for error in errors:
        print(error)


In [198]:
# json to pandas dataframe
df = pd.DataFrame(log_entries)

In [199]:
df.head(5)

Unnamed: 0,timestamp,log_entry,type,source,action,url,website_type,browsing_stage,action_goal,orchestrator,is_in_loop,is_progress_being_made,description
0,2024-12-15T23:36:06.004036,"{""timestamp"": ""2024-12-15T23:36:06.004036"", ""t...",Initialization,,,,,,,,,,
1,2024-12-15T23:36:07.961759,"{""timestamp"": ""2024-12-15T23:36:07.961759"", ""t...",WebSurferEvent,WebSurfer,,https://www.google.com/,,,,,,,
2,2024-12-15T23:36:15.285707,"{""timestamp"": ""2024-12-15T23:36:15.285707"", ""s...",OrchestrationEvent,UserProxy,,,,,,,,,
3,2024-12-15T23:36:21.405811,"{""timestamp"": ""2024-12-15T23:36:21.405811"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,outer,,,
4,2024-12-15T23:36:24.577431,"{""timestamp"": ""2024-12-15T23:36:24.577431"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,inner,False,True,


In [156]:
df.tail(10)

Unnamed: 0,timestamp,log_entry,type,source,action,url,website_type,browsing_stage,action_goal,orchestrator,is_in_loop,is_progress_being_made,description
167,2024-12-15T22:44:19.516648,"{""timestamp"": ""2024-12-15T22:44:19.516648"", ""t...",WebSurferEvent,WebSurfer,,https://www.rightmove.co.uk/property-to-rent/L...,,,,,,,
168,2024-12-15T22:44:31.606510,"{""timestamp"": ""2024-12-15T22:44:31.606510"", ""s...",OrchestrationEvent,WebSurfer,,,,,,,,,
169,2024-12-15T22:44:44.561734,"{""timestamp"": ""2024-12-15T22:44:44.561734"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,inner,False,True,
170,2024-12-15T22:44:44.561849,"{""timestamp"": ""2024-12-15T22:44:44.561849"", ""s...",OrchestrationEvent,Orchestrator (-> WebSurfer),,,,,,outer,,,
171,2024-12-15T22:44:44.561968,"{""timestamp"": ""2024-12-15T22:44:44.561968"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,outer,,,
172,2024-12-15T22:44:45.004394,"{""timestamp"": ""2024-12-15T22:44:45.004394"", ""t...",WebSurferEvent,WebSurfer,,https://www.rightmove.co.uk/property-to-rent/L...,,,,,,,
173,2024-12-15T22:45:04.122779,"{""timestamp"": ""2024-12-15T22:45:04.122779"", ""t...",WebSurferEvent,WebSurfer,web_search,https://www.rightmove.co.uk/property-to-rent/L...,,,,,,,
174,2024-12-15T22:45:07.910967,"{""timestamp"": ""2024-12-15T22:45:07.910967"", ""t...",WebSurferEvent,WebSurfer,,https://www.google.com/search?q=studio+apartme...,,,,,,,
175,2024-12-15T22:45:15.407980,"{""timestamp"": ""2024-12-15T22:45:15.407980"", ""s...",OrchestrationEvent,WebSurfer,,,,,,,,,
176,2024-12-15T22:45:15.408181,"{""timestamp"": ""2024-12-15T22:45:15.408181"", ""s...",OrchestrationEvent,Orchestrator (termination condition),,,,,,outer,,,limit reached


In [200]:
df.action.value_counts()

action
                   47
click               6
web_search          1
answer_question     1
history_back        1
summarize_page      1
Name: count, dtype: int64

In [201]:
from urllib.parse import urlparse
import re

# Known domains
SEARCH_ENGINES = {"google.com"}
REAL_ESTATE_SITES = {
    "rightmove.co.uk", "zoopla.co.uk", "onthemarket.com",
    "zillow.com", "realtor.com", "apartments.com", "craigslist.org",
    "rentcafe.com", "apartmentguide.com", "losangeles.craigslist.org", "austin.craigslist.org"
}

# Function to extract base domain
def extract_base_domain(url):
    parsed_url = urlparse(url)
    base_domain = parsed_url.netloc.split(":")[0].lower().replace("www.", "")  # Remove subdomains like "www."
    return base_domain

# Rightmove browsing stage classifier
def classify_rightmove_browsing_stage(url):
    path = urlparse(url).path.lower()
    if "/property-to-rent/" in path:
        return "overview"
    elif "/properties/" in path or re.search(r"#prop\d+$", url):
        return "detail"
    else:
        return "other"

# Apartments.com browsing stage classifier
def classify_apartments_browsing_stage(url):
    path = urlparse(url).path.strip("/").split("/")
    if len(path) == 1:  # Example: "apartments.com/"
        return "start"
    elif len(path) == 2:  # Example: "apartments.com/austin-tx/"
        return "overview"
    elif len(path) > 2:  # Example: "apartments.com/residences-at-the-domain-austin-tx/0039d11/"
        return "detail"
    else:
        return "other"

# RentCafe browsing stage classifier
def classify_rentcafe_browsing_stage(url):
    path = urlparse(url).path.lower()
    if "/apartments-for-rent/" in path or "/1-bedroom-apartments-for-rent/" in path:
        return "overview"
    return "other"

# ApartmentGuide browsing stage classifier
def classify_apartmentguide_browsing_stage(url):
    path = urlparse(url).path.lower()
    if "/apartments/" in path:
        return "overview"
    return "other"

# Craigslist browsing stage classifier
def classify_craigslist_browsing_stage(url):
    path = urlparse(url).path.lower()
    if "/search/" in path:
        return "overview"
    return "other"

# Zoopla browsing stage classifier
def classify_zoopla_browsing_stage(url):
    path = urlparse(url).path.lower()
    # if base domain is zoopla.co.uk, return "overview"
    if extract_base_domain(url)  == "zoopla.co.uk":
        return "overview"
    elif "/details/" in path: 
            return "detail"
    elif "/to-rent/" in path: # Overview pages contain "to-rent" but not "dtails"
        return "overview"
    return "other"

# Google browsing stage classifier
def classify_google_browsing_stage(url):
    path = urlparse(url).path.lower()
    if path in ["", "/"]:
        return "start"
    elif "/search?" in url:
        return "overview"
    else:
        return "other"

# Function to classify URLs and determine browsing stage
def classify_url_and_stage(url):
    if not url:  # Handle empty URLs
        return None, None, None

    base_domain = extract_base_domain(url)

    # Classify website type
    if base_domain in SEARCH_ENGINES:
        site_type = "search_engine"
    elif base_domain in REAL_ESTATE_SITES:
        site_type = "real_estate"
    else:
        site_type = "other"

    # Determine browsing stage using the correct classifier
    if base_domain in SEARCH_ENGINES:
        browsing_stage = classify_google_browsing_stage(url)
    elif base_domain == "rightmove.co.uk":
        browsing_stage = classify_rightmove_browsing_stage(url)
    elif base_domain == "apartments.com":
        browsing_stage = classify_apartments_browsing_stage(url)
    elif base_domain == "rentcafe.com":
        browsing_stage = classify_rentcafe_browsing_stage(url)
    elif base_domain == "apartmentguide.com":
        browsing_stage = classify_apartmentguide_browsing_stage(url)
    elif base_domain == "craigslist.org" or base_domain == "losangeles.craigslist.org" or base_domain == "austin.craigslist.org":
        browsing_stage = classify_craigslist_browsing_stage(url)
    elif base_domain == "zoopla.co.uk":
        browsing_stage = classify_zoopla_browsing_stage(url)
    else:
        browsing_stage = "other"  # Needs manual checking

    return base_domain, site_type, browsing_stage

# Apply classification to the DataFrame
df["url_base_domain"], df["website_type"], df["browsing_stage"] = zip(
    *df["url"].map(lambda x: classify_url_and_stage(x) if pd.notna(x) else (None, None, None))
)

# Drop url_base_domain since we don't want it in the final DataFrame
df.drop(columns=["url_base_domain"], inplace=True)

# Debugging: Print unique browsing stages to verify
print("\nUnique browsing stages identified:")
print(df["browsing_stage"].value_counts())


Unique browsing stages identified:
browsing_stage
overview    16
detail      11
start        3
Name: count, dtype: int64


In [202]:
# # Apply classification to the DataFrame
# df["website_type"] = df["url"].map(lambda x: classify_url(x)[1] if pd.notna(x) else "")


In [203]:
# Check for URLs that could not be classified automatically
# show 'url' and 'website_type' where 'website_type' is 'other
df[df["website_type"] == "other"][["url"]]

Unnamed: 0,url


In [204]:
other_websites = df[df["website_type"] == "other"]["url"].unique()
for website in other_websites:
    print(website)

In [205]:
# throw error if there are any 'other' website types
assert not df[df["website_type"] == "other"].any().any(), "Unrecognized website types found"

In [206]:
df.browsing_stage.value_counts()

browsing_stage
overview    16
detail      11
start        3
Name: count, dtype: int64

In [207]:
#  create a list with all values in 'url' where 'browsing_stage' is 'other'
other_urls = df[df["browsing_stage"] == "other"]["url"].unique()
for url in other_urls:
    print(url)

In [208]:
# throw error if there are any 'other' browsing stages
assert not df[df["browsing_stage"] == "other"].any().any(), "Unrecognized browsing state found"

In [209]:
# Mapping of actions to action_goal
ACTION_GOAL_MAPPING = {
    "web_search": "search",
    "answer_question": "info_external",
    "summarize_page": "info_extraction",
    "history_back": "navigation",
    "click": "navigation",
    "visit_url" : "navigation",
}

# Function to classify action_goal based on action
def classify_action_goal(action):
    if not action:
        return None
    return ACTION_GOAL_MAPPING.get(action, "")  # Return mapped value or None if not found

# Apply classification to the DataFrame
df["action_goal"] = df["action"].map(lambda x: classify_action_goal(x) if pd.notna(x) else "")


In [210]:
df.head()

Unnamed: 0,timestamp,log_entry,type,source,action,url,website_type,browsing_stage,action_goal,orchestrator,is_in_loop,is_progress_being_made,description
0,2024-12-15T23:36:06.004036,"{""timestamp"": ""2024-12-15T23:36:06.004036"", ""t...",Initialization,,,,,,,,,,
1,2024-12-15T23:36:07.961759,"{""timestamp"": ""2024-12-15T23:36:07.961759"", ""t...",WebSurferEvent,WebSurfer,,https://www.google.com/,search_engine,start,,,,,
2,2024-12-15T23:36:15.285707,"{""timestamp"": ""2024-12-15T23:36:15.285707"", ""s...",OrchestrationEvent,UserProxy,,,,,,,,,
3,2024-12-15T23:36:21.405811,"{""timestamp"": ""2024-12-15T23:36:21.405811"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,outer,,,
4,2024-12-15T23:36:24.577431,"{""timestamp"": ""2024-12-15T23:36:24.577431"", ""s...",OrchestrationEvent,Orchestrator (thought),,,,,,inner,False,True,


In [211]:
# show rows that are not None and not "" in 'action' column
df[df["action"].notna() & (df["action"] != "")]

Unnamed: 0,timestamp,log_entry,type,source,action,url,website_type,browsing_stage,action_goal,orchestrator,is_in_loop,is_progress_being_made,description
8,2024-12-15T23:36:31.029321,"{""timestamp"": ""2024-12-15T23:36:31.029321"", ""t...",WebSurferEvent,WebSurfer,web_search,https://www.google.com/,search_engine,start,search,,,,
15,2024-12-15T23:36:52.725738,"{""timestamp"": ""2024-12-15T23:36:52.725738"", ""t...",WebSurferEvent,WebSurfer,click,https://www.google.com/search?q=pet-friendly+s...,search_engine,overview,navigation,,,,
22,2024-12-15T23:37:18.848134,"{""timestamp"": ""2024-12-15T23:37:18.848134"", ""t...",WebSurferEvent,WebSurfer,click,https://www.google.com/search?q=pet-friendly+s...,search_engine,overview,navigation,,,,
29,2024-12-15T23:38:05.342129,"{""timestamp"": ""2024-12-15T23:38:05.342129"", ""t...",WebSurferEvent,WebSurfer,click,https://www.rightmove.co.uk/property-to-rent/S...,real_estate,overview,navigation,,,,
36,2024-12-15T23:39:10.568760,"{""timestamp"": ""2024-12-15T23:39:10.568760"", ""t...",WebSurferEvent,WebSurfer,click,https://www.rightmove.co.uk/property-to-rent/S...,real_estate,overview,navigation,,,,
43,2024-12-15T23:40:27.681108,"{""timestamp"": ""2024-12-15T23:40:27.681108"", ""t...",WebSurferEvent,WebSurfer,answer_question,https://www.rightmove.co.uk/properties/1559377...,real_estate,detail,info_external,,,,
49,2024-12-15T23:41:51.591204,"{""timestamp"": ""2024-12-15T23:41:51.591204"", ""t...",WebSurferEvent,WebSurfer,history_back,https://www.rightmove.co.uk/properties/1559377...,real_estate,detail,navigation,,,,
56,2024-12-15T23:43:52.689752,"{""timestamp"": ""2024-12-15T23:43:52.689752"", ""t...",WebSurferEvent,WebSurfer,click,https://www.rightmove.co.uk/property-to-rent/S...,real_estate,overview,navigation,,,,
63,2024-12-15T23:45:50.282249,"{""timestamp"": ""2024-12-15T23:45:50.282249"", ""t...",WebSurferEvent,WebSurfer,summarize_page,https://www.rightmove.co.uk/properties/1558936...,real_estate,detail,info_extraction,,,,
74,2024-12-15T23:50:01.129466,"{""timestamp"": ""2024-12-15T23:50:01.129466"", ""t...",WebSurferEvent,WebSurfer,click,https://www.rightmove.co.uk/properties/1558936...,real_estate,detail,navigation,,,,


In [212]:
# # remove columns 'timestamp' and 'log_entry' 
# df.drop(columns=["timestamp", "log_entry"], inplace=True)

In [213]:
# save df to csv
df.to_csv(csv_file_path, index=False)
print(f"CSV file saved to: {csv_file_path}")

CSV file saved to: ../../../logs/vision_magnetic_advanced_00_20241215233606/trajectory.csv


In [23]:
##