標本データの年代カバー範囲の基礎データ

In [None]:
import os
import re
import pandas as pd
from rdflib import Graph, Namespace
from rdflib.namespace import RDF

# === Settings ===
# NOTE(review): BASE_DIR is a GitHub "tree" page URL, yet it is used both as
# the location of the Turtle inputs and as the CSV output directory. The
# recorded cell output shows a Google Drive path, so this value was presumably
# substituted for publication -- confirm the real location before running.
BASE_DIR = 'https://github.com/naoki-kokaze/British_Warship_Career/tree/main/'
ONTOLOGY_PATH = os.path.join(BASE_DIR, 'warship_career_ontology.ttl')
CAREERS_PATH = os.path.join(BASE_DIR, 'data', 'ship_careers_combined.ttl')
OUTPUT_CSV_PATH = os.path.join(BASE_DIR, 'data', 'career_dates_from_rdf.csv')

# === Namespaces ===
crm = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
sealit = Namespace("http://www.sealitproject.eu/ontology/")
BASE = "http://www.example.com/myData/"

# === Load RDF ===
# Both Turtle files are merged into one graph: ontology first, then the
# ship career data.
g = Graph()
for ttl_path in (ONTOLOGY_PATH, CAREERS_PATH):
    g.parse(ttl_path, format="turtle")
print("Loaded triples:", len(g))

# === Date normalization ===
def normalize(lit):
    """Normalize a date-like value to a ``YYYY-MM-DD`` string.

    The value is converted with ``str()`` and surrounding whitespace is
    stripped. Three shapes are accepted:

    * ``YYYY-MM-DD`` -> returned as is
    * ``YYYY-MM``    -> ``-01`` is appended for the day
    * ``YYYY``       -> ``-01-01`` is appended for month and day

    Only the shape is checked; month and day values are not validated.

    Args:
        lit: Any object whose string form may be a date, or None.

    Returns:
        The normalized date string, or None if ``lit`` is None or its string
        form has none of the accepted shapes.
    """
    if lit is None:
        return None
    text = str(lit).strip()
    # fullmatch (rather than match with "$") so a trailing newline can never
    # slip through and end up embedded in the returned string.
    if not re.fullmatch(r'\d{4}(?:-\d{2}){0,2}', text):
        return None
    # Pad each missing component (month, day) with "01".
    missing_parts = 2 - text.count("-")
    return text + "-01" * missing_parts

def extract_year(date):
    """Return the leading four-digit year of ``date`` as an int.

    Returns None when ``date`` is falsy (None, empty string) or does not
    start with four digits.
    """
    year_match = re.match(r'^(\d{4})', date) if date else None
    if year_match is None:
        return None
    return int(year_match.group(1))

def extract_category_code(ship_uri):
    """Return the category code embedded in a ship URI.

    The last path segment is expected to look like ``<code>_<digits>``; the
    part before the final underscore-plus-digits suffix is returned. If the
    URI does not end that way, the string "Unknown" is returned.
    """
    found = re.search(r'/([^/]+)_(\d+)$', str(ship_uri))
    if found is None:
        return "Unknown"
    return found.group(1)

# === Ship list ===
# Every sealit:Ship whose URI starts with BASE, paired with the value of the
# first property whose URI contains "label" (None when there is no such
# property or its value is falsy).
ships = []
for ship, _, _ in g.triples((None, RDF.type, sealit.Ship)):
    if not str(ship).startswith(BASE):
        continue
    label = next(
        (obj for _subj, pred, obj in g.triples((ship, None, None))
         if "label" in str(pred)),
        None,
    )
    ships.append((ship, str(label) if label else None))

print("Ships found:", len(ships))

# === Per-event date candidates ===
P4_TIME_SPAN = crm["P4_has_time-span"]
P79_BEGIN = crm["P79_beginning_is_qualified_by"]
P80_END = crm["P80_end_is_qualified_by"]
P82_WITHIN = crm["P82_at_some_time_within"]


def _first_date(time_span, predicates):
    """Return the first normalizable date attached to ``time_span``.

    ``predicates`` are tried in the given order. For each predicate only the
    first object yielded by the graph is considered, and the first one that
    ``normalize`` accepts wins. Returns None when none of them normalizes.
    """
    for pred in predicates:
        norm = normalize(next(g.objects(time_span, pred), None))
        if norm:
            return norm
    return None


# Scan the time-span triples once and precompute, for every event, the date
# used for the career start (P79 -> P82 -> P80) and for the career end
# (P80 -> P79 -> P82). This avoids walking the whole graph twice per ship.
event_dates = [
    (
        str(ev),
        _first_date(ts, (P79_BEGIN, P82_WITHIN, P80_END)),
        _first_date(ts, (P80_END, P79_BEGIN, P82_WITHIN)),
    )
    for ev, _, ts in g.triples((None, P4_TIME_SPAN, None))
]

# === One row per ship ===
rows = []
for ship_uri, ship_label in ships:
    ship_id = str(ship_uri).replace(BASE, "")
    # Events belonging to a ship are recognized by their URI prefix.
    prefix = f"{BASE}event_{ship_id}_"
    ship_events = [entry for entry in event_dates if entry[0].startswith(prefix)]

    # Normalized dates are zero-padded YYYY-MM-DD strings, so plain string
    # min/max gives the earliest / latest date.
    start_norm = min((start for _uri, start, _end in ship_events if start), default=None)
    end_norm = max((end for _uri, _start, end in ship_events if end), default=None)

    rows.append({
        "Ship_ID": ship_uri,
        "Ship_Label": ship_label,
        "Category_Code": extract_category_code(ship_uri),
        "Start_Date_Raw": start_norm,
        "End_Date_Raw": end_norm,
        "Start_Year": extract_year(start_norm),
        "End_Year": extract_year(end_norm),
    })

df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV_PATH, index=False, encoding="utf-8")
print("Saved:", OUTPUT_CSV_PATH)
df.head()


Loaded triples: 37835
Ships found: 268
Saved: /content/drive/MyDrive/PhD Thesis/Winfield RDF/api_out_pro/career_dates_from_rdf.csv


Unnamed: 0,Ship_ID,Ship_Label,Category_Code,Start_Date_Raw,End_Date_Raw,Start_Year,End_Year
0,http://www.example.com/myData/ACD_0001,HMS Royal Sovereign,ACD,1848-06-29,1885-05-01,1848,1885
1,http://www.example.com/myData/ACD_0002,HMS Prince Albert,ACD,1862-04-08,1899-03-16,1862,1899
2,http://www.example.com/myData/AIF_0001,HMS Warrior,AIF,1859-05-25,1979-01-01,1859,1979
3,http://www.example.com/myData/AIF_0003,HMS Defence,AIF,1859-12-14,1935-08-01,1859,1935
4,http://www.example.com/myData/AWC_0001,HMS Royal Oak,AWC,1859-04-08,1885-09-30,1859,1885


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Settings ---
BASE_DIR = 'https://github.com/naoki-kokaze/British_Warship_Career/tree/main/'
INPUT_CSV_PATH = os.path.join(BASE_DIR, 'data', 'career_dates_from_rdf.csv')
OUTPUT_DIR = os.path.join(BASE_DIR, 'visualizations')

# Make sure the output directory exists; a failure is reported but does not
# stop the cell.
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
except OSError as err:
    print(f"Error creating output directory {OUTPUT_DIR}: {err}")

# Plot styling
sns.set(style="whitegrid")
print("Seaborn style set.")


def visualize_data(csv_path, output_dir):
    """Load the career-date CSV and save a histogram of start and end years.

    Rows are kept only if ``Start_Year`` and ``End_Year`` are numeric,
    ``Category_Code`` is present and not "Unknown", and the end year is
    strictly later than the start year. The histogram is written to
    ``career_dates_histogram.svg`` inside ``output_dir``.

    Problems (missing file, unreadable CSV, missing columns, no usable rows,
    failure to save) are reported with ``print`` and never raised.

    Args:
        csv_path: Path of the CSV to read. It must contain the columns
            ``Start_Year``, ``End_Year`` and ``Category_Code``.
        output_dir: Directory in which the SVG file is written.

    Returns:
        None.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"ERROR: Input CSV file not found at {csv_path}")
        return
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported, not raised.
        print(f"ERROR: Could not read CSV: {e}")
        return
    else:
        print(f"Successfully loaded {csv_path}")

    required_cols = ['Start_Year', 'End_Year', 'Category_Code']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        print(f"ERROR: CSV is missing required columns: {', '.join(missing)}")
        return

    # Coerce the year columns to numbers; unparsable values become NaN.
    df['Start_Year'] = pd.to_numeric(df['Start_Year'], errors='coerce')
    df['End_Year'] = pd.to_numeric(df['End_Year'], errors='coerce')

    # Drop rows with missing values or an "Unknown" category. The explicit
    # copy lets the Duration column be added without pandas'
    # SettingWithCopyWarning.
    df_cleaned = df.dropna(subset=required_cols)
    df_cleaned = df_cleaned[df_cleaned['Category_Code'] != 'Unknown'].copy()

    # Career duration in years.
    df_cleaned['Duration'] = df_cleaned['End_Year'] - df_cleaned['Start_Year']

    # Zero or negative durations are treated as likely data errors.
    df_cleaned = df_cleaned[df_cleaned['Duration'] > 0]

    print(f"Loaded and cleaned {len(df_cleaned)} records for visualization.")
    if df_cleaned.empty:
        print("No valid data to visualize.")
        return

    # --- Histogram (distribution of start and end years) ---
    print("Generating Histogram...")
    plt.figure(figsize=(16, 8))
    sns.histplot(
        df_cleaned['Start_Year'], color="skyblue",
        label='Start_Event (ex. Production)', kde=True, binwidth=5, alpha=0.6,
    )
    sns.histplot(
        df_cleaned['End_Year'], color="salmon",
        label='End_Event (ex. Destruction/Sale)', kde=True, binwidth=5, alpha=0.6,
    )
    plt.title(
        f'Year distribution of career start and end points for sample warships (N={len(df_cleaned)})',
        fontsize=16,
    )
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Count of Ships', fontsize=12)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)

    hist_path_svg = os.path.join(output_dir, 'career_dates_histogram.svg')
    try:
        # Save as SVG (vector); keep text as text instead of outlined paths.
        plt.rcParams['svg.fonttype'] = 'none'
        plt.savefig(hist_path_svg, format='svg', bbox_inches='tight')
        print(f"Histogram (SVG) saved to {hist_path_svg}")
    except Exception as e:
        print(f"Error saving histogram: {e}")
    plt.close()  # release the figure


# Load the CSV and run the visualization. This call must sit at top level:
# inside the function body it would never be reached from the cell and would
# recurse without end once the function was called.
if __name__ == "__main__":
    visualize_data(INPUT_CSV_PATH, OUTPUT_DIR)