In [None]:
%pip install sqlite_vec

from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()  # Load variables from a local .env file into the environment; later cells read OPENROUTER_API_KEY via os.getenv.

In [65]:

from openai import OpenAI
import sqlite3
import os

def get_db_path() -> str:
    """Return the relative path of the SQLite database file opened by this notebook."""
    db_file = './covas_tools.db'
    return db_file

def get_connection() -> sqlite3.Connection:
    """Open a SQLite connection to the file named by ``get_db_path()``.

    Both declared-type and column-name based type detection are enabled on
    the connection.
    """
    type_detection = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
    return sqlite3.connect(get_db_path(), detect_types=type_detection)

# Shared connection; the export cell below queries reply_logger_v1 and screen_logger_v1 through it.
conn = get_connection()


In [None]:
import json
import sys
from datetime import datetime
# Export every logged reply as one JSON test case under ./covas_tools/.
# Each file holds the request's tools and messages, the reference response and
# the first screenshot logged at or after the reply.
import os  # local to this cell so it does not depend on another cell's imports


def _message_text(message):
    """Return the message content when it is a plain string, otherwise ''."""
    content = message.get('content')
    return content if isinstance(content, str) else ''


# open() does not create missing directories, so make sure the target exists.
os.makedirs('./covas_tools', exist_ok=True)

entries = conn.execute(
    'select data, created_at from reply_logger_v1 ORDER BY created_at ASC'
).fetchall()


for index, (entry, created_at) in enumerate(entries):
    entry = json.loads(entry)

    # fetchone() returns None when no screenshot was logged at or after this
    # reply; keep the row as-is instead of unpacking it blindly.
    screenshot_row = conn.execute(
        'select data, created_at from screen_logger_v1 WHERE created_at >= ? ORDER BY created_at ASC LIMIT 1',
        (datetime.fromisoformat(created_at.replace(' ','T')+'+00:00'),),
    ).fetchone()
    if screenshot_row is None:
        print(f'Index: {index} has no matching screenshot, exporting without one')
        screenshot = None
    else:
        screenshot = json.loads(screenshot_row[0])['screenshot']

    request = entry['request']
    response = entry['response']
    tools = request['tools']
    messages = request['messages']

    # Everything that arrived after the last assistant turn (newest first).
    new_messages = []
    for message in reversed(messages):
        if message['role'] == 'assistant':
            break
        new_messages.append(message)

    reply = response['completion']['choices'][0]['message']
    tool_calls = reply.get('tool_calls') or []

    # Categories: 'tool' (reply calls a tool), 'toolres' (reply follows a tool
    # result), a manually typed label for free-form user input, or 'event' for
    # parenthesised messages. Anything else stays 'unknown'.
    category = 'unknown'
    if tool_calls:
        category = 'tool'
    elif any(m['role'] == 'tool' for m in new_messages):
        category = 'toolres'
    elif user := [
        _message_text(m) for m in new_messages
        if m['role'] == 'user' and _message_text(m) and not _message_text(m).startswith('(')
    ]:
        # Interactive: the prompt shows the user text, the typed answer is the category.
        category = input(' – '.join(user))
    elif any(_message_text(m).startswith('(') for m in new_messages):
        category = 'event'

    reference_text = reply['content']
    reference_action = [a['function']['name'] + ': ' + a['function']['arguments'] for a in tool_calls]

    print(f'Index: {index}')
    print('CMDR:', messages[-1]['content'])
    print('COVAS:', reference_text)
    if reference_action:
        print('Action:', reference_action)
    # flush so the summary is visible before the next input() prompt
    sys.stdout.flush()

    with open(f'./covas_tools/{index}-{category}.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps({
            'category': category,
            'tools': tools,
            # None when no screenshot row matched this reply.
            'screenshot': 'data:image/jpg;base64,'+screenshot if screenshot is not None else None,
            'messages': messages,
            'reference_response': response,
        }, indent=4))

In [None]:
from openai import OpenAI

# Smoke test: send one minimal tool-calling request to an OpenAI-compatible
# endpoint and print the tool calls the model returns.
client = OpenAI(api_key='-', base_url='http://localhost:8080/v1')
#client = OpenAI(api_key='-', base_url='http://192.168.0.183:8080/v1')
#client = OpenAI(api_key='-', base_url='http://mimoja-datenschleuder.local:9090/v1/')

completion = client.chat.completions.create(
    model='hermes3-q8',
    max_tokens=150,
    temperature=0,
    messages=[
        {
            "role": "system",
            "content": "You are covas my digital assistant."
        },
        {
            "role": "user",
            "content": "List all files on the C: drive"
        },
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "list_files",
                "description": "List files in directory",
                "parameters": {
                    "type": "object",
                    "properties": {"directory": {"type": "string"}},
                },
            },
        },
    ],
)

# Some servers answer with an empty `choices` list on failure; show the raw
# completion in that case instead of raising IndexError.
if completion.choices:
    print(completion.choices[0].message.tool_calls)
else:
    print(completion)

In [None]:
import os
import sys
import json
from openai import OpenAI

# Models to compare, keyed by model name. Each value carries the client that
# reaches the model's endpoint and the model id sent with the request.
# Candidates are (model name, api key, base url); uncomment a row to enable it.
test_models = {
    model_name: {
        "client": OpenAI(api_key=api_key, base_url=endpoint),
        "model": model_name,
    }
    for model_name, api_key, endpoint in [
        #("openai/gpt-4o-mini", os.getenv('OPENROUTER_API_KEY'), 'https://openrouter.ai/api/v1'),
        #("meta-llama/llama-3.2-1b-instruct", os.getenv('OPENROUTER_API_KEY'), 'https://openrouter.ai/api/v1'),
        #("meta-llama/llama-3.2-3b-instruct", os.getenv('OPENROUTER_API_KEY'), 'https://openrouter.ai/api/v1'),
        #("meta-llama/llama-3.1-8b-instruct", os.getenv('OPENROUTER_API_KEY'), 'https://openrouter.ai/api/v1'),
        #("meta-llama/llama-3.1-70b-instruct", os.getenv('OPENROUTER_API_KEY'), 'https://openrouter.ai/api/v1'),

        #("lmstudio-community/Llama-3.2-1B-Instruct-GGUF", '-', 'http://192.168.0.183:8080/'),
        #("lmstudio-community/Llama-3.2-3B-Instruct-GGUF", '-', 'http://192.168.0.183:8080/'),
        #("lmstudio-community/Llama-3.1-8B-Instruct-GGUF", '-', 'http://192.168.0.183:8080/'),

        #("lucaelin/llama-3.2-1b-instruct-cn-v2.1-1e-gguf", '-', 'http://192.168.0.183:8080/'),
        ("lucaelin/llama-3.2-3b-instruct-cn-v2.1-1e-gguf", '-', 'http://192.168.0.183:8080/'),
        #("lucaelin/llama-3.1-8b-instruct-cn-v2-gguf", '-', 'https://46ghrp3j5os506-8080.proxy.runpod.net/'),
    ]
}

# For every exported test case (./<folder>/*.json) query each model in
# `test_models` and store the answers next to the reference in
# ./<folder>/compare/<file>. Answers already present in an existing compare
# file are kept, so only missing models are queried.
for folder in os.listdir('./'):
    if not os.path.isdir(folder):
        continue
    compare_dir = f'./{folder}/compare'
    for file in os.listdir(f'./{folder}'):
        if not file.endswith('.json'):
            continue
        with open(f'./{folder}/'+file, 'r', encoding="utf-8") as f:
            data = json.load(f)

        # Every sub-directory of the working directory is scanned, so skip JSON
        # files that are not exported test cases.
        if not isinstance(data, dict) or not {'messages', 'tools', 'reference_response'} <= data.keys():
            continue

        messages = data['messages']
        tools = data['tools']

        reference_message = data["reference_response"]['completion']['choices'][0]['message']
        reference_text = reference_message['content']
        reference_action = [
            {"name": a['function']['name'], "arguments": a['function']['arguments']}
            for a in (reference_message.get('tool_calls') or [])
        ]

        print('CMDR:', messages[-1]['content'])
        print('COVAS:', reference_text)
        if reference_action:
            print(reference_action)
        sys.stdout.flush()

        eval_results = {
            **data,
            "reference": {
                "text": reference_text,
                "actions": reference_action,
            },
            "responses": {}
        }

        # keep previous responses
        compare_path = f'{compare_dir}/'+file
        if os.path.exists(compare_path):
            with open(compare_path, 'r', encoding="utf-8") as f:
                orig_compare = json.load(f)
            for model, response in orig_compare.get('responses', {}).items():
                if model != 'reference':
                    eval_results['responses'][model] = response

        for model, config in test_models.items():
            # skip if already done
            if model in eval_results['responses']:
                continue

            completion = config['client'].chat.completions.create(
                model=config['model'],
                messages=messages,
                tools=tools,
                temperature=0,
                max_tokens=150,
            )
            if not completion.choices:
                # Nothing to record; leaving the model out means it is retried
                # on the next run instead of raising IndexError below.
                print(completion)
                continue

            reply = completion.choices[0].message
            completion_text = reply.content
            completion_action = [
                {"name": a.function.name, "arguments": a.function.arguments}
                for a in (reply.tool_calls or [])
            ]

            print(model, completion_text)
            if completion_action:
                print(completion_action)
            sys.stdout.flush()

            eval_results['responses'][model] = {
                'text': completion_text,
                'actions': completion_action,
            }

        # open() does not create the compare directory on first run.
        os.makedirs(compare_dir, exist_ok=True)
        with open(compare_path, 'w', encoding="utf-8") as f:
            f.write(json.dumps(eval_results, indent=4))
        

In [None]:
import os 
import json
from pydantic import BaseModel
from typing import Literal
from openai import OpenAI
from llm_eval import run_eval

# NOTE(review): `openrouter` is not referenced in this cell — presumably it is
# consumed elsewhere; confirm before removing.
openrouter = OpenAI(api_key=os.getenv('OPENROUTER_API_KEY'), base_url='https://openrouter.ai/api/v1')


# Score every model response in ./<folder>/compare/<file> with run_eval and
# write the result to ./<folder>/evaluate/<file>. An evaluation from a previous
# run is reused when the response's text and actions are unchanged.
for folder in os.listdir('./'):
    if not os.path.isdir(folder):
        continue
    compare_dir = f'./{folder}/compare'
    evaluate_dir = f'./{folder}/evaluate'
    # Not every directory has been through the comparison step yet.
    if not os.path.isdir(compare_dir):
        continue
    for file in os.listdir(compare_dir):
        if not file.endswith('.json'):
            continue
        with open(f'{compare_dir}/'+file, 'r', encoding="utf-8") as f:
            data = json.load(f)

        # read previous evaluation
        prev_responses = {}
        evaluate_path = f'{evaluate_dir}/'+file
        if os.path.exists(evaluate_path):
            with open(evaluate_path, 'r', encoding="utf-8") as f:
                prev_responses = json.load(f).get('responses', {})

        eval_results = {}

        for key, value in data['responses'].items():
            # filter out empty / invalid responses first, so the cache check
            # below can rely on 'text' and 'actions' being present
            if key == 'reference' or not value or 'text' not in value or 'actions' not in value:
                continue

            # skip if already done
            previous = prev_responses.get(key)
            if (
                previous
                and 'evaluation' in previous
                and previous.get('text') == value['text']
                and previous.get('actions') == value['actions']
            ):
                eval_results[key] = previous
                continue

            result = run_eval(tools=data['tools'], messages=data['messages'], reference=data['reference'], actual=value)
            eval_results[key] = {
                **value,
                'evaluation': result,
            }

            print(key, json.dumps(result, indent=4))

        # open() does not create the evaluate directory on first run.
        os.makedirs(evaluate_dir, exist_ok=True)
        with open(evaluate_path, 'w', encoding="utf-8") as f:
            print('writing', file)
            f.write(json.dumps({
                **data,
                "responses": eval_results,
            }, indent=4))
        

In [None]:
import os
import json

def _new_model_stats():
    """Return a zeroed counter table for one model, one sub-dict per criterion."""
    quality_labels = ('poor', 'okay', 'good', 'excellent', 'n/a', 'total')
    return {
        'adherence': dict.fromkeys(quality_labels, 0),
        'events': dict.fromkeys(quality_labels, 0),
        'tools': dict.fromkeys(
            ('correctly_added', 'correctly_omitted', 'incorrectly_added', 'incorrectly_omitted', 'n/a', 'total'), 0
        ),
        'arguments': dict.fromkeys(('correctly_added', 'incorrectly_added', 'n/a', 'total'), 0),
        'hallucination': dict.fromkeys(quality_labels, 0),
    }


def _void_text_scores(evaluation):
    """Mark the three text-quality scores as not applicable (mutates `evaluation`)."""
    evaluation['adherence_score'] = 'n/a'
    evaluation['events_score'] = 'n/a'
    evaluation['hallucination_score'] = 'n/a'


def collect_stats(root='./'):
    """Aggregate evaluation labels per model from <root>/<folder>/evaluate/*.json.

    Args:
        root: Directory whose sub-folders are scanned for an ``evaluate``
            directory. Sub-folders without one are skipped.

    Returns:
        dict mapping each response key (model name) to a table
        ``{criterion: {label: count, ..., 'total': count}}`` for the criteria
        adherence, events, tools, arguments and hallucination.

    Raises:
        KeyError: if an evaluation lacks one of the ``<criterion>_score``
            entries or carries a label that is not part of the table.
    """
    valid_categories = ('open', 'closed', 'tool', 'toolres', 'event')
    collected = {}

    for folder in os.listdir(root):
        evaluate_dir = os.path.join(root, folder, 'evaluate')
        # Skips plain files and folders that were never evaluated.
        if not os.path.isdir(evaluate_dir):
            continue

        for file in os.listdir(evaluate_dir):
            if not file.endswith('.json'):
                continue
            with open(os.path.join(evaluate_dir, file), 'r', encoding="utf-8") as f:
                data = json.load(f)

            # 'error' and any other unknown category are excluded from the stats.
            if data.get('category') not in valid_categories:
                continue

            reference_actions = data['reference']['actions']

            for key, value in data["responses"].items():
                # Guard first: everything below indexes into value['evaluation'].
                if key == 'reference' or not value or 'evaluation' not in value:
                    continue
                evaluation = value['evaluation']
                actions = value['actions']

                # Debug aid: dump fine-tuned 3b responses that received a 'poor' rating.
                if 'lucaelin' in key.lower() and '3b' in key.lower() and 'poor' in evaluation.values():
                    print(key)
                    print(value['text'])
                    print(actions)
                    print(evaluation)

                # tools_score is derived here from reference vs. actual actions;
                # later rules deliberately overwrite earlier ones.
                evaluation['tools_score'] = 'n/a'
                if reference_actions and not actions:
                    _void_text_scores(evaluation)
                    evaluation['tools_score'] = 'incorrectly_omitted'

                if data.get('tools') and not reference_actions and not actions:
                    evaluation['tools_score'] = 'correctly_omitted'

                if not reference_actions and actions:
                    _void_text_scores(evaluation)
                    evaluation['tools_score'] = 'incorrectly_added'

                # NOTE(review): this tests membership of the literal '<tool_call>'
                # in the reference actions (element test for a list, substring
                # test for a str) - confirm that is the intended check.
                # The truthiness guard avoids a TypeError when actions is None.
                if reference_actions and '<tool_call>' in reference_actions:
                    _void_text_scores(evaluation)
                    evaluation['tools_score'] = 'incorrectly_added'
                    evaluation['arguments_score'] = 'incorrectly_added'

                if reference_actions and actions:
                    # Only the first action's name is compared.
                    _void_text_scores(evaluation)
                    if reference_actions[0]['name'] == actions[0]['name']:
                        evaluation['tools_score'] = 'correctly_added'
                    else:
                        evaluation['tools_score'] = 'incorrectly_added'

                if key not in collected:
                    collected[key] = _new_model_stats()

                for criteria, counts in collected[key].items():
                    counts[evaluation[criteria + '_score']] += 1
                    # 'total' counts every response, including 'n/a' ones.
                    counts['total'] += 1

    return collected


stats = collect_stats('./')

stats

In [None]:
import numpy as np

import matplotlib.pyplot as plt

# Models to plot, in display order (dict.fromkeys drops duplicates while
# keeping order). Commented entries are alternatives that can be toggled in.
models = list(dict.fromkeys([
    #"meta-llama/llama-3.2-1b-instruct",
    "lmstudio-community/Llama-3.2-1B-Instruct-GGUF",
    #"lucaelin/llama-3.2-1b-instruct-cn-v2-1e-gguf",
    "lucaelin/llama-3.2-1b-instruct-cn-v2.1-1e-gguf",
    #"meta-llama/llama-3.2-3b-instruct",
    "lmstudio-community/Llama-3.2-3B-Instruct-GGUF",
    #"lucaelin/llama-3.2-3b-instruct-cn-v2-1e-gguf",
    "lucaelin/llama-3.2-3b-instruct-cn-v2.1-1e-gguf",
    #"meta-llama/llama-3.1-8b-instruct",
    "lmstudio-community/Llama-3.1-8B-Instruct-GGUF",
    #"meta-llama/llama-3.1-70b-instruct",
    "openai/gpt-4o-mini",
]))
#] + list(stats.keys())))

# Short tick labels; any model without an explicit entry falls back to its full id.
names={
    **{model: model for model in models},
    "meta-llama/llama-3.2-1b-instruct": "llama-3.2-1b-or",
    "lmstudio-community/Llama-3.2-1B-Instruct-GGUF": "llama-3.2-1b",
    "lucaelin/llama-3.2-1b-instruct-cn-v2-1e-gguf": "covas.0-1b",
    "lucaelin/llama-3.2-1b-instruct-cn-v2.1-1e-gguf": "covas-1b",
    "lucaelin/llama-3.2-1b-instruct-cn-v2-gguf": "covas-1b",
    "meta-llama/llama-3.2-3b-instruct": "llama-3.2-3b-or",
    "lmstudio-community/Llama-3.2-3B-Instruct-GGUF": "llama-3.2-3b",
    "lucaelin/llama-3.2-3b-instruct-cn-v2-1e-gguf": "covas.0-3b",
    "lucaelin/llama-3.2-3b-instruct-cn-v2.1-1e-gguf": "covas-3b",
    "lucaelin/llama-3.2-3b-instruct-cn-v2-gguf": "covas-3b",
    "meta-llama/llama-3.1-8b-instruct": "llama-3.1-8b-or",
    "lmstudio-community/Llama-3.1-8B-Instruct-GGUF": "llama-3.1-8b",
    "meta-llama/llama-3.1-70b-instruct": "llama-3.1-70b-or",
    "openai/gpt-4o-mini": "gpt-4o-mini",
}
criteria = ['adherence', 'events', 'hallucination', 'tools', 'arguments']
# Stacking order of the bar segments; 'n/a' is intentionally not drawn.
labels = ['poor', 'okay', 'good', 'excellent', 'incorrectly_added', 'incorrectly_omitted', 'correctly_added', 'correctly_omitted']
colors = {
    'poor': 'red',
    'okay': 'orange',
    'good': 'yellow',
    'excellent': 'lightgreen',
    'correctly_added': 'lightgreen',
    'correctly_omitted': 'lightgrey',
    'incorrectly_added': 'darkred',
    'incorrectly_omitted': 'red',
    'n/a': 'white',
}

# Surface models that were listed but never evaluated; they are drawn as empty bars.
missing_models = [model for model in models if model not in stats]
if missing_models:
    print('no stats for:', missing_models)

fig, axs = plt.subplots(2, 3, figsize=(15, 10))
# initial spacing between subplots; plt.tight_layout() below recomputes it
fig.subplots_adjust(hspace=1.5, wspace=1.5)
axs = axs.flatten()

# Numeric bar positions so the tick locations can be fixed explicitly before
# the tick labels are replaced.
positions = np.arange(len(models))
tick_labels = [names[model] for model in models]

for i, criterion in enumerate(criteria):
    ax = axs[i]

    # One counter dict per model; empty when the model or criterion is absent.
    per_model = [stats.get(model, {}).get(criterion, {}) for model in models]

    # Only draw the labels that belong to this criterion.
    scores = {
        label: [counts.get(label, 0) for counts in per_model]
        for label in labels
        if any(label in counts for counts in per_model)
    }

    # Stack the segments: each label starts where the previous one ended.
    bottom = np.zeros(len(models))
    for label, heights in scores.items():
        ax.bar(positions, heights, bottom=bottom, label=label, color=colors[label])
        bottom += np.array(heights)

    ax.set_title(criterion)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=30, ha='right')
    # place legend left of plot
    ax.legend(loc='upper right', bbox_to_anchor=(-0.15, 1))

# The grid has more cells than criteria; hide the leftover axes.
for unused_ax in axs[len(criteria):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

| Step 	    | Training Loss 	| Validation Loss   |
|-----------|-------------------|-------------------|
| 4         | 1.837900 	        | 2.483167          |
| 8         | 1.289100 	        | 2.359537          |
| 12        | 0.976800 	        | 2.228802          |
| 16        | 0.959300 	        | 2.177512          |
| 20        | 0.867900 	        | 2.167669          |