# 環境セットアップ

In [0]:
# Install the exact package versions this notebook was written against.
!pip install \
    langchain==0.3.27 \
    langchain-core==0.3.79 \
    langchain-text-splitters==0.3.11 \
    openai==2.3.0 \
    mlflow==3.4.0 \
    pandas==2.2.3 \
    pyyaml==6.0.2

# Restart the Python process via the Databricks utility; presumably so the
# packages installed above are the ones picked up by the imports that follow.
dbutils.library.restartPython()

In [0]:
import os
import json
import yaml
from typing import List, Dict
from openai import OpenAI
import mlflow
import pandas as pd
from datetime import datetime
from langchain.prompts import PromptTemplate

# Module-level OpenAI client used by the summary and evaluation functions in
# this notebook. No arguments are passed, so credentials are presumably taken
# from the environment (e.g. OPENAI_API_KEY) -- confirm for this workspace.
client = OpenAI()
print("✓ 環境セットアップ完了")

# プロンプトとテストデータ読み込み

In [0]:
base_path = "./"

# ============================================================
# Load every YAML configuration file
# ============================================================


def _read_yaml(filename):
    """Parse one UTF-8 YAML file located under ``base_path`` with the safe loader."""
    with open(f"{base_path}/{filename}", "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)


# Prompt templates
prompts_config = _read_yaml("prompts.yaml")

# System prompts
system_prompts_config = _read_yaml("system_prompts.yaml")

# Evaluation settings
evaluation_config = _read_yaml("evaluation.yaml")

# Test data
test_data_config = _read_yaml("test_data.yaml")

# ============================================================
# Convert prompt definitions into PromptTemplate objects
# ============================================================

# Metadata fields copied from each prompt entry; a missing field becomes "".
_metadata_fields = ("name", "description", "version", "created_date", "author")

prompt_versions = {}
prompt_metadata = {}

for version_name, config in prompts_config.items():
    # The "template" key is required; every metadata field is optional.
    prompt_versions[version_name] = PromptTemplate.from_template(config["template"])
    prompt_metadata[version_name] = {
        field: config.get(field, "") for field in _metadata_fields
    }

# ============================================================
# Pull the test cases and their metadata out of the test data file
# ============================================================

test_customers = test_data_config["test_cases"]
test_metadata = test_data_config["metadata"]

# ============================================================
# Convert the evaluation prompt into a PromptTemplate
# ============================================================

eval_config = evaluation_config["summary_evaluation"]
evaluation_prompt_template = PromptTemplate.from_template(eval_config["template"])

# Print a human-readable summary of everything that was loaded above.
banner = "=" * 60
print(banner)
print("✓ すべての設定読み込み完了")
print(banner)

# Prompt versions: key plus display name.
print(f"\n【プロンプト】{len(prompt_versions)}個")
for name in prompt_versions:
    display_name = prompt_metadata[name]['name']
    print(f"  - {name}: {display_name}")

# Test data set: size, name and version.
print(f"\n【テストデータ】{len(test_customers)}件")
print(f"  データセット: {test_metadata['dataset_name']}")
print(f"  バージョン: {test_metadata['version']}")

# System prompts: key plus display name.
print(f"\n【システムプロンプト】{len(system_prompts_config)}個")
for name, config in system_prompts_config.items():
    print(f"  - {name}: {config['name']}")

# Evaluation (LLM judge) settings.
print("\n【評価設定】")
print(f"  評価名: {eval_config['name']}")
print(f"  モデル: {eval_config['model']}")
print(f"  Temperature: {eval_config['temperature']}")

# 推論関数定義

In [0]:
def generate_summary(
    customer: Dict,
    prompt_template: PromptTemplate,
    system_prompt: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0.3,
    max_tokens: int = 300,
) -> str:
    """Generate a summary for one customer record with a chat completion.

    The customer dict is expanded into ``prompt_template`` to build the user
    message, which is sent together with ``system_prompt`` through the
    module-level ``client``.

    Args:
        customer: Mapping whose keys supply the template's input variables
            (it is passed as ``**customer`` to ``prompt_template.format``).
        prompt_template: Template used to render the user message.
        system_prompt: Text sent as the system message.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API. Defaults to the
            previously hard-coded value of 300.

    Returns:
        The content of the first choice, or an empty string when the API
        returns no content.
    """
    # Render the user message with the LangChain PromptTemplate.
    user_prompt = prompt_template.format(**customer)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # message.content is nullable in the Chat Completions API; normalise it to
    # "" so the declared ``str`` return type holds and callers can safely take
    # len() of the summary.
    content = response.choices[0].message.content
    return content if content is not None else ""


print("✓ 推論関数定義完了")

# 評価関数定義

In [0]:
def evaluate_summary(customer: Dict, summary: str) -> Dict:
    """Score a generated summary with the LLM judge configured in the YAML files.

    The evaluation prompt is rendered from the customer's fields and the
    summary, sent to the model named in ``eval_config`` together with the
    ``evaluation_judge`` system prompt, and the JSON reply is converted into
    normalised scores.

    Args:
        customer: Customer record; must contain ``name``, ``age``,
            ``occupation``, ``purchase_history``, ``inquiries`` and ``notes``.
        summary: The generated summary to be judged.

    Returns:
        {
            "score": float,      # sum of the three raw scores divided by 30
            "reasoning": str,    # judge's explanation, "" when absent
            "metrics": {
                "conciseness": float,    # raw score divided by 10
                "clarity": float,
                "actionability": float
            }
        }
        The values fall in 0.0-1.0 only if the judge scores each metric on a
        0-10 scale -- presumably what the evaluation prompt requests; no
        clamping is applied here.

    Raises:
        ValueError: If the judge returns no content, invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or a score that
            cannot be converted to a number.
        KeyError: If one of the three metric keys is missing from the reply.
    """
    metric_names = ("conciseness", "clarity", "actionability")
    max_metric_score = 10.0

    # Render the evaluation prompt.
    eval_prompt = evaluation_prompt_template.format(
        customer_name=customer['name'],
        customer_age=customer['age'],
        customer_occupation=customer['occupation'],
        customer_purchase_history=customer['purchase_history'],
        customer_inquiries=customer['inquiries'],
        customer_notes=customer['notes'],
        generated_summary=summary
    )

    # Ask the judge model for its verdict.
    response = client.chat.completions.create(
        model=eval_config["model"],
        messages=[
            {
                "role": "system",
                "content": system_prompts_config["evaluation_judge"]["content"]
            },
            {"role": "user", "content": eval_prompt}
        ],
        temperature=eval_config["temperature"],
        response_format={"type": eval_config["response_format"]}
    )

    # message.content is nullable; report that explicitly instead of letting
    # json.loads fail with an opaque TypeError.
    raw_content = response.choices[0].message.content
    if raw_content is None:
        raise ValueError("Evaluation model returned no content to parse")
    result = json.loads(raw_content)

    # Coerce to float so scores returned as JSON strings (e.g. "8") still work.
    raw_scores = {name: float(result[name]) for name in metric_names}

    # Overall score: sum of the raw scores over the maximum attainable total.
    total_score = sum(raw_scores.values()) / (max_metric_score * len(metric_names))

    return {
        "score": total_score,
        "reasoning": result.get("reasoning", ""),
        "metrics": {
            name: raw / max_metric_score for name, raw in raw_scores.items()
        }
    }


print("✓ 評価関数定義完了")

# 要約と評価を実施しMLflowに記録

In [0]:
# Select the MLflow experiment that receives one run per prompt version.
mlflow.set_experiment(experiment_id="2762244084694140")

system_prompt = system_prompts_config["default"]["content"]

# Fail fast before any run is created: with no test cases the averages below
# would divide by zero and min()/max() would raise, leaving a half-logged run.
if not test_customers:
    raise ValueError("No test cases loaded; cannot evaluate prompt versions")


def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


for version_name, prompt_template in prompt_versions.items():
    metadata = prompt_metadata[version_name]

    with mlflow.start_run(run_name=f"{version_name}_{metadata['name']}"):

        # ============================================================
        # 1. Log all run parameters in a single batch: prompt metadata,
        #    system prompt name, test data metadata and evaluation settings
        # ============================================================
        mlflow.log_params({
            "プロンプトバージョン": version_name,
            "プロンプト名": metadata['name'],
            "説明": metadata['description'],
            "バージョン": metadata['version'],
            "作成日": metadata['created_date'],
            "作成者": metadata['author'],
            "システムプロンプト": system_prompts_config["default"]["name"],
            "テストデータセット": test_metadata['dataset_name'],
            "テストデータ件数": len(test_customers),
            "テストデータバージョン": test_metadata['version'],
            "評価モデル": eval_config['model'],
            "評価Temperature": eval_config['temperature'],
            "評価プロンプト名": eval_config['name'],
        })

        # ============================================================
        # 2. Log the full prompt texts as artifacts
        # ============================================================
        mlflow.log_text(system_prompt, "システムプロンプト.txt")
        mlflow.log_text(prompt_template.template, f"{version_name}_プロンプト.txt")
        mlflow.log_text(evaluation_prompt_template.template, "評価プロンプト.txt")

        # ============================================================
        # 3. Generate and evaluate a summary for every test customer
        # ============================================================
        version_scores = []
        customer_results = []

        for customer in test_customers:
            # Generate the summary with this prompt version.
            summary = generate_summary(customer, prompt_template, system_prompt)

            # Score it with the LLM judge.
            evaluation = evaluate_summary(customer, summary)

            # Keep the per-customer record.
            result = {
                "顧客名": customer["name"],
                "要約": summary,
                "要約文字数": len(summary),
                "総合スコア": evaluation["score"],
                "簡潔性": evaluation["metrics"]["conciseness"],
                "明瞭性": evaluation["metrics"]["clarity"],
                "実用性": evaluation["metrics"]["actionability"],
                "評価理由": evaluation["reasoning"]
            }

            customer_results.append(result)
            version_scores.append(evaluation["score"])

            # Per-customer score, keyed by the customer's id.
            mlflow.log_metric(f"スコア_{customer['customer_id']}", evaluation["score"])

        # ============================================================
        # 4. Log aggregate metrics for this prompt version
        # ============================================================
        avg_score = _mean(version_scores)
        avg_conciseness = _mean([r["簡潔性"] for r in customer_results])
        avg_clarity = _mean([r["明瞭性"] for r in customer_results])
        avg_actionability = _mean([r["実用性"] for r in customer_results])
        avg_length = _mean([r["要約文字数"] for r in customer_results])

        mlflow.log_metrics({
            "平均スコア": avg_score,
            "平均_簡潔性": avg_conciseness,
            "平均_明瞭性": avg_clarity,
            "平均_実用性": avg_actionability,
            "平均_要約文字数": avg_length,
            "最小スコア": min(version_scores),
            "最大スコア": max(version_scores),
        })
