In [None]:
import os
import logging
import re
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# NOTE(review): BASE_DIR is a GitHub web URL, yet every path derived from it is
# handed to local-filesystem APIs (os.makedirs, os.path.exists, the logging file
# handler). Presumably it should point at a local clone of the repository —
# confirm and adjust before running.
BASE_DIR = 'https://github.com/naoki-kokaze/British_Warship_Career/tree/main/'
ONTOLOGY_PATH = os.path.join(BASE_DIR, 'warship_career_ontology.ttl')
CAREERS_PATH  = os.path.join(BASE_DIR, 'data', 'ship_careers_combined.ttl')

# Output locations
OUTPUT_VIS_DIR = os.path.join(BASE_DIR, 'visualizations')
OUTPUT_CSV_PATH = os.path.join(BASE_DIR, 'data', 'investment_data.csv')
LOG_FILE = os.path.join(BASE_DIR, 'log', 'investment_analysis.log')

# Create the output directories. The log directory has to exist before
# logging.basicConfig() runs: the file handler opens LOG_FILE right away and
# raises FileNotFoundError when its parent directory is missing.
os.makedirs(OUTPUT_VIS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(filename=LOG_FILE, level=logging.INFO, filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')
print(f"Log will be saved to: {LOG_FILE}")
print(f"CSV output will be saved to: {OUTPUT_CSV_PATH}")
print(f"Plot output will be saved to: {OUTPUT_VIS_DIR}")

# --- Namespaces ---
NS_WARSHIP = Namespace("http://www.example.com/ontology/warship#")
NS_CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")

# --- Helper functions ---
def extract_year(date_literal):
    """Return the leading four-digit year of a date-like value as an int.

    The value is converted with ``str()`` and must start with four digits
    (e.g. ``"1860-05-01"``, ``"1860-05"`` or ``"1860"``); anything after the
    first four digits is ignored.

    Args:
        date_literal: Any object whose string form starts with ``YYYY``,
            or ``None`` / a pandas missing value.

    Returns:
        The year as ``int``, or ``None`` when the value is missing or its
        string form does not begin with four digits.
    """
    # Missing values (None, NaN, NaT, ...) carry no year.
    if date_literal is None:
        return None
    if pd.isna(date_literal):
        return None

    leading_digits = re.match(r'^(\d{4})', str(date_literal)) # YYYY...
    if leading_digits is None:
        return None

    try:
        year = int(leading_digits.group(1))
    except (ValueError, TypeError):
        year = None
    return year

def execute_sparql(graph, query, description):
    """Run a SPARQL query and return its rows as a pandas DataFrame.

    Progress is echoed to stdout and to the log. Each result row is turned
    into a dict through its ``asdict()`` method, so the DataFrame columns are
    the keys those dicts contain.

    Args:
        graph: Object exposing ``query(query)``; the result must support
            ``len()`` and iteration over rows that provide ``asdict()``.
        query: SPARQL query text.
        description: Human-readable label used in the progress messages.

    Returns:
        A DataFrame with one row per result (empty when nothing matched),
        or ``None`` if running or converting the query raised an exception.
    """
    start_msg = f"Executing SPARQL query: {description}..."
    print(start_msg)
    logging.info(start_msg)

    try:
        results = graph.query(query)

        done_msg = f"Query finished. Found {len(results)} results."
        print(done_msg)
        logging.info(done_msg)

        records = [row.asdict() for row in results]
        if len(records) == 0:
            logging.warning(f"Query '{description}' returned 0 results.")

        frame = pd.DataFrame(records)
    except Exception as e:
        # Any failure is reported and signalled to the caller with None.
        msg = f"FATAL: SPARQL query '{description}' failed: {e}"
        print(msg)
        logging.error(msg)
        return None

    return frame

# --- Main processing ---

# Selects every event whose general purpose is one of the two investment types
# and that carries both a cost value and at least one time-span date.
# `rdf:` is declared explicitly so the query does not depend on the prefixes
# that happen to be bound on the graph it is run against.
_INVESTMENT_QUERY = """
      PREFIX crm:  <http://www.cidoc-crm.org/cidoc-crm/>
      PREFIX :    <http://www.example.com/ontology/warship#>
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>

      SELECT ?event ?event_label ?investment_type_label ?date ?amount ?is_production
      WHERE {
        ?event crm:P21_had_general_purpose ?investment_type .
        FILTER(?investment_type IN (:CombatCapabilityInvestment, :NonCombatDutyInvestment))

        BIND(REPLACE(STR(?investment_type), "http://www.example.com/ontology/warship#", "") AS ?investment_type_label)

        OPTIONAL { ?event rdfs:label ?event_label . }

        ?event :incurred_cost ?cost_node .
        ?cost_node crm:P90_has_value ?amount .

        ?event crm:P4_has_time-span ?ts .
        OPTIONAL { ?ts crm:P79_beginning_is_qualified_by ?p79_date . }
        OPTIONAL { ?ts crm:P80_end_is_qualified_by ?p80_date . }
        OPTIONAL { ?ts crm:P82_at_some_time_within ?p82_date . }
        BIND(COALESCE(?p79_date, ?p82_date, ?p80_date) AS ?date)

        # --- ここで E12_Production 判定 ---
        OPTIONAL { ?event rdf:type/rdfs:subClassOf* crm:E12_Production . BIND(true AS ?prodHit) }
        BIND(IF(BOUND(?prodHit), "Production", "NonProduction") AS ?is_production)

        FILTER(BOUND(?date) && BOUND(?amount))
      }
      ORDER BY ?event ?date

    """


def _report(message, level=logging.INFO):
    """Print *message* and write the same text to the log at *level*."""
    print(message)
    logging.log(level, message)


def _load_graph(paths):
    """Parse the Turtle files in *paths* into one Graph.

    A missing file is reported and skipped, except for CAREERS_PATH, whose
    absence aborts loading. Returns the Graph, or None when the career data
    is missing or any file fails to parse.
    """
    graph = Graph()
    for path in paths:
        if not os.path.exists(path):
            _report(f"ERROR: File not found, skipping: {path}", logging.ERROR)
            if path == CAREERS_PATH:
                _report("Career data file is missing. Aborting.", logging.ERROR)
                return None
            continue
        try:
            print(f"Loading {os.path.basename(path)}...")
            graph.parse(path, format="turtle")
            logging.info(f"Successfully loaded {path}")
        except Exception as e:
            _report(f"FATAL: Error parsing {path}: {e}", logging.ERROR)
            return None
    return graph


def _prepare_plot_frame(df_investments):
    """Add numeric Year/Amount columns, normalise labels and drop unusable rows.

    *df_investments* is modified in place (new and cleaned columns); the
    returned DataFrame is an independent copy containing only complete rows.
    """
    # Year from the date value, amount as a number (unparseable -> NaN).
    df_investments['Year'] = df_investments['date'].apply(extract_year)
    df_investments['Amount'] = pd.to_numeric(df_investments['amount'], errors='coerce')

    # Convert query values to plain stripped strings; missing values stay missing.
    for col in ('investment_type_label', 'is_production'):
        if col in df_investments.columns:
            df_investments[col] = df_investments[col].apply(
                lambda x: str(x).strip() if pd.notna(x) else x
            )
    df_investments['is_production'] = df_investments['is_production'].fillna('NonProduction')

    # .copy() so the column assignment below targets an independent frame
    # instead of a possible view of df_investments.
    df_plot_data = df_investments.dropna(
        subset=['Year', 'Amount', 'investment_type_label', 'is_production']
    ).copy()
    df_plot_data['Year'] = df_plot_data['Year'].astype(int)
    return df_plot_data


def _render_scatter(df_plot_data, plot_path_svg):
    """Draw the Year/Amount scatter plot and save it as SVG to *plot_path_svg*.

    A failure while saving is reported and logged, not raised. The figure is
    closed in every case.
    """
    fig = plt.figure(figsize=(20, 10))
    try:
        print(df_plot_data['investment_type_label'].unique())

        markers_map = {
            "CombatCapabilityInvestment": "o",
            "NonCombatDutyInvestment": "s"
        }

        type_palette = {
            "CombatCapabilityInvestment": "#e04169",   # red
            "NonCombatDutyInvestment": "#4169e1"       # blue
        }

        # 0) Light background band from 1860 to the last year in the data.
        #    Skipped when all data precede 1860, where the band would be
        #    reversed and would stretch the x-axis up to 1860.
        ax = plt.gca()
        year_max = int(df_plot_data["Year"].max())
        if year_max >= 1860:
            ax.axvspan(1860, year_max, color="k", alpha=0.05, zorder=0)

        # 1) Base layer: every point, colour and marker by investment type.
        ax = sns.scatterplot(
            data=df_plot_data,
            x='Year', y='Amount',
            hue='investment_type_label',
            palette=type_palette,
            style='investment_type_label',
            markers=markers_map,
            style_order=list(markers_map.keys()),
            s=50, alpha=0.65, linewidth=0, legend='brief'
        )

        # 2) Overlay layer: re-draw the NonProduction events with a black
        #    outline (same fill colour), matching the
        #    "Black Outline = Non-Production" note in the title.
        # NOTE(review): the output file name says "prod_outline"; confirm
        # whether Production or NonProduction events were meant to be outlined.
        df_outlined = df_plot_data[df_plot_data['is_production'] == 'NonProduction'].copy()
        if not df_outlined.empty:
            sns.scatterplot(
                data=df_outlined,
                x='Year', y='Amount',
                hue='investment_type_label',
                palette=type_palette,
                style='investment_type_label',
                markers=markers_map,
                style_order=list(markers_map.keys()),
                s=60, alpha=1.0,
                edgecolor='black', linewidth=0.9,   # black edge lifts the point
                legend=False, zorder=10             # draw on top
            )

        ax.set_yscale('linear')  # plain linear amount axis

        plt.title("Naval Investments Over Time\nColor = Investment Type, Black Outline = Non-Production, Shaded = ≥1860", fontsize=18)
        plt.xlabel('Year of Expenditure', fontsize=14)
        plt.ylabel('Amount (£) (Linear Scale)', fontsize=14)

        # Legend outside the axes (investment types only; overlay has legend=False).
        plt.legend(title='Investment Type', bbox_to_anchor=(1.02, 1), loc='upper left')

        plt.grid(True, linestyle='--', alpha=0.6)

        try:
            # Vector output; 'none' keeps text as text instead of outlined paths.
            plt.rcParams['svg.fonttype'] = 'none'
            plt.savefig(plot_path_svg, format='svg', bbox_inches='tight')
            _report(f"Investment scatter plot (SVG) saved to {plot_path_svg}")
        except Exception as e:
            _report(f"Error saving scatter plot: {e}", logging.ERROR)
    finally:
        # Release the figure even when plotting itself raised.
        plt.close(fig)


def analyze_investment_data():
    """Query investment events from the RDF data, export them and plot them.

    Steps: load ONTOLOGY_PATH and CAREERS_PATH into one graph, run the
    investment SPARQL query, derive Year/Amount columns, write the cleaned
    rows to OUTPUT_CSV_PATH and save a scatter plot (SVG) in OUTPUT_VIS_DIR.

    Returns:
        None. The function returns early, after reporting the reason, when
        the career data cannot be loaded, the query yields nothing, or no
        row survives cleaning.
    """
    _report("--- Starting Investment Analysis ---")

    _report("Initializing Graph...")
    g = _load_graph([ONTOLOGY_PATH, CAREERS_PATH])
    if g is None:
        return

    _report(f"Graph loaded successfully. Total triples: {len(g)}")

    df_investments = execute_sparql(g, _INVESTMENT_QUERY, "Investment Events (Type, Date, Amount)")

    if df_investments is None or df_investments.empty:
        print("No investment data found. Check SPARQL query or data files.")
        logging.warning("No investment data found. Aborting visualization.")
        return

    _report("Formatting data for plotting...")
    df_plot_data = _prepare_plot_frame(df_investments)

    # Keep a CSV copy of the cleaned data; a write failure is not fatal.
    try:
        df_plot_data.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8-sig')
        _report(f"Investment data saved to {OUTPUT_CSV_PATH}")
    except Exception as e:
        _report(f"Error saving investment CSV: {e}", logging.ERROR)

    if df_plot_data.empty:
        _report("No valid investment data to plot after cleaning.", logging.WARNING)
        return

    print(f"Plotting {len(df_plot_data)} investment events.")

    # --- Visualisation (scatter plot) ---
    _report("Generating Investment Scatter Plot...")
    plot_path_svg = os.path.join(OUTPUT_VIS_DIR, 'investment_scatterplot_prod_outline.svg')
    _render_scatter(df_plot_data, plot_path_svg)

    print("\n--- Investment Analysis Finished ---")
    logging.info("--- Investment Analysis Finished ---")


# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    analyze_investment_data()