In [None]:
# OGVC
# JTESで感情分類にファインチューニングされた日本語感情認識モデルのSSL
# wav2vec2-XLSR から音響特徴量を抽出してCSV保存
# 実行時のエラーだが、今回は特徴抽出のために分類ヘッドを付けているだけだから、初期化されても問題ない

import os
import re
import csv
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# ===============================
# 設定
# ===============================
ROOT_WAV_DIR = "/autofs/diamond2/share/diamond/corpus/OGVC/Vol2/Acted/wav"
MODEL_NAME = "Bagus/wav2vec2-xlsr-japanese-speech-emotion-recognition"
OUTPUT_CSV = "ogvc_wav2vec2_features_Bagus-jtes.csv"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ===============================
# モデルロード
# ===============================
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

model = AutoModelForAudioClassification.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True
).to(DEVICE)
model.eval()

hidden_dim = model.config.hidden_size  # 768

# ===============================
# ファイル名解析用 regex
# FOY0101ANT0.wav → emotion=ANT, intensity=0
# ===============================
FILENAME_PATTERN = re.compile(r"^[A-Z0-9]+([A-Z]{3})(\d)\.wav$")

# ===============================
# CSV ヘッダ管理
# ===============================
rows = []
header_written = False

# ===============================
# wav 以下を再帰探索
# ===============================
for root, _, files in os.walk(ROOT_WAV_DIR):
    for fname in files:
        if not fname.lower().endswith(".wav"):
            continue

        match = FILENAME_PATTERN.match(fname)
        if match is None:
            print(f"Skip (filename mismatch): {fname}")
            continue

        emotion = match.group(1)
        intensity = int(match.group(2))
        wav_path = os.path.join(root, fname)

        # ===============================
        # 音声ロード & リサンプリング
        # ===============================
        audio, sr = sf.read(wav_path)

        # stereo → mono
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # resample to 16kHz（wav2vec2前提）
        if sr != feature_extractor.sampling_rate:
            audio = librosa.resample(
                audio,
                orig_sr=sr,
                target_sr=feature_extractor.sampling_rate
            )
            sr = feature_extractor.sampling_rate

        # ===============================
        # 特徴量抽出
        # ===============================
        inputs = feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt"
        )
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # ===============================
        # 音響特徴量（最終層 hidden states）
        # hidden_states[-1]: (1, T, D)
        # ===============================
        hidden_states = outputs.hidden_states[-1].squeeze(0)  # (T, D)

        # ===============================
        # 時間方向 mean pooling
        # ===============================
        feature_vector = hidden_states.mean(dim=0).cpu().numpy()  # (D,)

        # ===============================
        # CSV 用 row 作成
        # ===============================
        row = [fname, emotion, intensity] + feature_vector.tolist()
        rows.append(row)

        # ヘッダは最初の1回だけ作る
        if not header_written:
            header = (
                ["filename", "emotion", "intensity"]
                + [f"ssl_{i}" for i in range(len(feature_vector))]
            )
            header_written = True

# ===============================
# CSV 書き込み
# ===============================
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print(f"Saved to {OUTPUT_CSV}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at Bagus/wav2vec2-xlsr-japanese-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved to ogvc_wav2vec2_features_Bagus-jtes.csv


In [2]:
# ======================================================
# OGVC 版：ロジスティック回帰で Emotion / Intensity を分類
# ======================================================

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# ================================
# 設定
# ================================
RESULT_DIR = "results_ogvc_emo_int"
os.makedirs(RESULT_DIR, exist_ok=True)

EMOTION_LABELS = {
    "JOY": 0, "ACC": 1, "FEA": 2, "SUR": 3,
    "SAD": 4, "DIS": 5, "ANG": 6, "ANT": 7,
    "NEU": 8, "OTH": 9
}

# =========================================================
# 混同行列プロット
# =========================================================
def plot_conf_matrix(true, pred, labels, title, save_path, exclude_labels=None):
    cm = confusion_matrix(true, pred, labels=range(len(labels)))

    if exclude_labels is not None:
        exclude_idx = [labels.index(l) for l in exclude_labels]
        cm = np.delete(cm, exclude_idx, axis=0)
        cm = np.delete(cm, exclude_idx, axis=1)
        labels = [l for l in labels if l not in exclude_labels]

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, cmap="Blues", annot_kws={"size": 16})
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

# =========================================================
# Step1: データ読み込み
# =========================================================
def step1_load_data(csv_path):
    df = pd.read_csv(csv_path)
    print("=== Step1: データ読み込み完了 ===")
    print(df.head())
    return df

# =========================================================
# Step2: データ分割 & モデル学習
# =========================================================
def step2_train_model(df):
    SPLIT_DIR = "ogvc_split_data"
    TRAIN_DIR = os.path.join(SPLIT_DIR, "train")
    TEST_DIR = os.path.join(SPLIT_DIR, "test")
    os.makedirs(TRAIN_DIR, exist_ok=True)
    os.makedirs(TEST_DIR, exist_ok=True)

    feature_df = df.drop(columns=["emotion", "intensity"])
    numeric_cols = feature_df.select_dtypes(include=[np.number]).columns
    X = feature_df[numeric_cols]

    y_emo = df["emotion"].map(EMOTION_LABELS)
    y_int = df["intensity"]

    X_train, X_test, emo_train, emo_test, int_train, int_test = train_test_split(
        X, y_emo, y_int, test_size=0.2, random_state=42, stratify=y_emo
    )

    train_df = df.loc[X_train.index]
    test_df = df.loc[X_test.index]
    train_df.to_csv(os.path.join(TRAIN_DIR, "train_data-bagus.csv"), index=False)
    test_df.to_csv(os.path.join(TEST_DIR, "test_data-bagus.csv"), index=False)

    emo_model = LogisticRegression(max_iter=3000).fit(X_train, emo_train)
    int_model = LogisticRegression(max_iter=3000).fit(X_train, int_train)

    print("=== Step2: モデル学習完了 ===")

    return {
        "X_test": X_test,
        "filename_test": df.loc[X_test.index, "filename"] if "filename" in df.columns else None,
        "emo_test": emo_test,
        "int_test": int_test,
        "emo_model": emo_model,
        "int_model": int_model
    }

# =========================================================
# Step3: 評価 & CSV出力
# =========================================================
def step3_evaluate(data):
    X_test = data["X_test"]
    filename_test = data["filename_test"]
    emo_test = data["emo_test"]
    int_test = data["int_test"]
    emo_model = data["emo_model"]
    int_model = data["int_model"]

    emo_pred = emo_model.predict(X_test)
    int_pred = int_model.predict(X_test)

    print("\n=== Step3: 評価 ===")
    print("\nEmotion Classification Report")
    print(classification_report(emo_test, emo_pred))
    print("\nIntensity Classification Report")
    print(classification_report(int_test, int_pred))

    # 予測結果 CSV 保存
    emotion_correct = ["〇" if t == p else "×" for t, p in zip(emo_test, emo_pred)]
    intensity_correct = ["〇" if t == p else "×" for t, p in zip(int_test, int_pred)]
    pred_df = pd.DataFrame({
        "filename": filename_test,
        "emotion_true": emo_test,
        "emotion_pred": emo_pred,
        "intensity_true": int_test,
        "intensity_pred": int_pred,
        "emotion_correct": emotion_correct,
        "intensity_correct": intensity_correct
    })
    pred_path = os.path.join(RESULT_DIR, "prediction_results-bagus.csv")
    pred_df.to_csv(pred_path, index=False)
    print("\n予測結果を保存しました →", pred_path)

    # 混同行列
    emo_cm_path = os.path.join(RESULT_DIR, "confusion_emotion-bagus.png")
    int_cm_path = os.path.join(RESULT_DIR, "confusion_intensity-bagus.png")

    plot_conf_matrix(
        emo_test, emo_pred, list(EMOTION_LABELS.keys()),
        " OGVC Emotion Confusion Matrix", emo_cm_path,
        exclude_labels=["NEU", "OTH"]
    )

    plot_conf_matrix(   # 3を追加
        int_test, int_pred, ["0", "1", "2", "3"],
        "OGVC Intensity Confusion Matrix", int_cm_path
    )

    print("混同行列保存 →", emo_cm_path)
    print("混同行列保存 →", int_cm_path)

# =========================================================
# メイン処理
# =========================================================
if __name__ == "__main__":
    df = step1_load_data("ogvc_wav2vec2_features_Bagus-jtes.csv")
    data = step2_train_model(df)
    step3_evaluate(data)
    print("\n=== 完了：Step1 → Step2 → Step3 ===")


=== Step1: データ読み込み完了 ===
          filename emotion  intensity     ssl_0     ssl_1     ssl_2     ssl_3  \
0  FOY0806ANG2.wav     ANG          2 -0.166178 -0.335452  0.283190  0.100455   
1  FOY0101ANT2.wav     ANT          2 -0.588002  1.086630  0.672229 -0.049684   
2  FOY0101ANT3.wav     ANT          3 -0.590559  0.742118  0.564494  0.044688   
3  FOY0104FEA0.wav     FEA          0 -0.559797 -1.040905 -0.112782 -0.069219   
4  FOY0104FEA3.wav     FEA          3 -0.176736 -1.262802 -0.658257  0.023738   

      ssl_4     ssl_5     ssl_6  ...  ssl_1014  ssl_1015  ssl_1016  ssl_1017  \
0  0.330798 -1.102320  0.444114  ...  0.115072 -0.214865  0.600724 -1.021574   
1  0.180771 -0.479517 -0.921932  ... -0.198160 -0.254211  0.479511  0.340814   
2  0.119269 -0.664458 -0.651955  ... -0.409490 -0.173947  0.384327  0.221036   
3  0.418174 -1.086987  0.788613  ...  0.228670  0.262627  0.718724  0.394348   
4 -0.166624 -0.927520  1.510856  ... -0.192233  0.592060  0.744003 -0.399632   

   ssl_

In [4]:
# 強度を回帰で求める
# 保存先；results_ogvc_emo_int/pred_emo-bunrui_int-kaiki-re2.csv

# ======================================================
# OGVC 完全版
# Emotion : 分類（Logistic Regression）
# Intensity : 回帰（Ridge Regression）
# ======================================================

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    mean_absolute_error,
    mean_squared_error
)
from sklearn.linear_model import LogisticRegression, Ridge

import seaborn as sns
import matplotlib.pyplot as plt

# ================================
# 設定
# ================================
RESULT_DIR = "results_ogvc_emo_int"
os.makedirs(RESULT_DIR, exist_ok=True)

EMOTION_LABELS = {
    "JOY": 0, "ACC": 1, "FEA": 2, "SUR": 3,
    "SAD": 4, "DIS": 5, "ANG": 6, "ANT": 7,
    "NEU": 8, "OTH": 9
}

# =========================================================
# 混同行列プロット（Emotion 用）
# =========================================================
def plot_conf_matrix(true, pred, labels, title, save_path, exclude_labels=None):
    cm = confusion_matrix(true, pred, labels=range(len(labels)))

    if exclude_labels is not None:
        exclude_idx = [labels.index(l) for l in exclude_labels]
        cm = np.delete(cm, exclude_idx, axis=0)
        cm = np.delete(cm, exclude_idx, axis=1)
        labels = [l for l in labels if l not in exclude_labels]

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        xticklabels=labels,
        yticklabels=labels,
        cmap="Blues",
        annot_kws={"size": 16}      # 数値を大きく
    )
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

# =========================================================
# Step1: データ読み込み
# =========================================================
def step1_load_data(csv_path):
    df = pd.read_csv(csv_path)
    print("=== Step1: データ読み込み完了 ===")
    print(df.head())
    return df

# =========================================================
# Step2: データ分割 & モデル学習
# =========================================================
def step2_train_model(df):
    SPLIT_DIR = "ogvc_split_data"
    TRAIN_DIR = os.path.join(SPLIT_DIR, "train")
    TEST_DIR = os.path.join(SPLIT_DIR, "test")
    os.makedirs(TRAIN_DIR, exist_ok=True)
    os.makedirs(TEST_DIR, exist_ok=True)

    # 特徴量
    feature_df = df.drop(columns=["emotion", "intensity"])
    numeric_cols = feature_df.select_dtypes(include=[np.number]).columns
    X = feature_df[numeric_cols]

    # ラベル
    y_emo = df["emotion"].map(EMOTION_LABELS)
    y_int = df["intensity"]  # ← 連続値のまま

    # 分割（Emotion で stratify）
    X_train, X_test, emo_train, emo_test, int_train, int_test = train_test_split(
        X, y_emo, y_int,
        test_size=0.2,
        random_state=42,
        stratify=y_emo
    )

    # 分割データ保存
    train_df = df.loc[X_train.index]
    test_df = df.loc[X_test.index]
    train_df.to_csv(os.path.join(TRAIN_DIR, "train_data-bagus.csv"), index=False)
    test_df.to_csv(os.path.join(TEST_DIR, "test_data-bagus.csv"), index=False)

    # ===== モデル =====
    emo_model = LogisticRegression(max_iter=3000)
    emo_model.fit(X_train, emo_train)

    int_model = Ridge(alpha=1.0)
    int_model.fit(X_train, int_train)

    print("=== Step2: モデル学習完了 ===")

    return {
        "X_test": X_test,
        "filename_test": df.loc[X_test.index, "filename"] if "filename" in df.columns else None,
        "emo_test": emo_test,
        "int_test": int_test,
        "emo_model": emo_model,
        "int_model": int_model
    }

# =========================================================
# Step3: 評価 & CSV出力
# =========================================================
def step3_evaluate(data):
    X_test = data["X_test"]
    filename_test = data["filename_test"]
    emo_test = data["emo_test"]
    int_test = data["int_test"]
    emo_model = data["emo_model"]
    int_model = data["int_model"]

    # 予測
    emo_pred = emo_model.predict(X_test)
    int_pred = int_model.predict(X_test)  # 回帰（連続値）

    # ========================
    # Emotion（分類）
    # ========================
    print("\n=== Emotion Classification ===")
    print(classification_report(emo_test, emo_pred))

    emo_cm_path = os.path.join(RESULT_DIR, "confusion_emotion-bagus.png")
    plot_conf_matrix(
        emo_test,
        emo_pred,
        list(EMOTION_LABELS.keys()),
        "OGVC Emotion Confusion Matrix",
        emo_cm_path,
        exclude_labels=["NEU", "OTH"]
    )
    print("Emotion 混同行列保存 →", emo_cm_path)

    # ========================
    # Intensity（回帰）
    # ========================
    mae = mean_absolute_error(int_test, int_pred)
    mse = mean_squared_error(int_test, int_pred)
    rmse = np.sqrt(mse)

    print("\n=== Intensity Regression ===")
    print(f"MAE  : {mae:.4f}")
    print(f"MSE  : {mse:.4f}")
    print(f"RMSE : {rmse:.4f}")

    # ========================
    # 予測結果 CSV 保存
    # ========================
    pred_df = pd.DataFrame({
        "filename": filename_test,
        "emotion_true": emo_test,
        "emotion_pred": emo_pred,
        "intensity_true": int_test,
        "intensity_pred": int_pred,
        "intensity_error": int_pred - int_test
    })

    pred_path = os.path.join(RESULT_DIR, "pred_emo-bunrui_int-kaiki-bagus.csv")
    pred_df.to_csv(pred_path, index=False)
    print("予測結果 CSV 保存 →", pred_path)

# =========================================================
# メイン処理
# =========================================================
if __name__ == "__main__":
    df = step1_load_data("ogvc_wav2vec2_features_Bagus-jtes.csv")
    data = step2_train_model(df)
    step3_evaluate(data)
    print("\n=== 完了：Emotion=分類 / Intensity=回帰 ===")

=== Step1: データ読み込み完了 ===
          filename emotion  intensity     ssl_0     ssl_1     ssl_2     ssl_3  \
0  FOY0806ANG2.wav     ANG          2 -0.166178 -0.335452  0.283190  0.100455   
1  FOY0101ANT2.wav     ANT          2 -0.588002  1.086630  0.672229 -0.049684   
2  FOY0101ANT3.wav     ANT          3 -0.590559  0.742118  0.564494  0.044688   
3  FOY0104FEA0.wav     FEA          0 -0.559797 -1.040905 -0.112782 -0.069219   
4  FOY0104FEA3.wav     FEA          3 -0.176736 -1.262802 -0.658257  0.023738   

      ssl_4     ssl_5     ssl_6  ...  ssl_1014  ssl_1015  ssl_1016  ssl_1017  \
0  0.330798 -1.102320  0.444114  ...  0.115072 -0.214865  0.600724 -1.021574   
1  0.180771 -0.479517 -0.921932  ... -0.198160 -0.254211  0.479511  0.340814   
2  0.119269 -0.664458 -0.651955  ... -0.409490 -0.173947  0.384327  0.221036   
3  0.418174 -1.086987  0.788613  ...  0.228670  0.262627  0.718724  0.394348   
4 -0.166624 -0.927520  1.510856  ... -0.192233  0.592060  0.744003 -0.399632   

   ssl_

In [7]:
# 予測結果の感情を数値からラベルに変換
import os
import pandas as pd

# ================================
# 設定
# ================================
RESULT_DIR = "results_ogvc_emo_int"
pred_path = os.path.join(RESULT_DIR, "pred_emo-bunrui_int-kaiki-bagus.csv")

out_path = os.path.join(
    RESULT_DIR,
    "pred_results_with_emotion_label-bagus.csv"
)

# ================================
# 感情ラベル対応表
# ================================
EMOTION_ID2LABEL = {
    0: "JOY",
    1: "ACC",
    2: "FEA",
    3: "SUR",
    4: "SAD",
    5: "DIS",
    6: "ANG",
    7: "ANT"
}

# ================================
# CSV読み込み
# ================================
df = pd.read_csv(pred_path)

# ================================
# 感情カテゴリのみラベル化
# ================================
df["emotion_true_label"] = df["emotion_true"].map(EMOTION_ID2LABEL)
df["emotion_pred_label"] = df["emotion_pred"].map(EMOTION_ID2LABEL)

# ※ intensity_true / intensity_pred はそのまま残す

# ================================
# 保存
# ================================
df.to_csv(out_path, index=False)

print(f"Saved emotion-labeled results to: {out_path}")

Saved emotion-labeled results to: results_ogvc_emo_int/pred_results_with_emotion_label-bagus.csv


In [8]:
# 予測結果(pred_results_with_emotion_label.csv)を読み込んで，全体＋感情別 MAE / RMSEを出力
import os
import pandas as pd
import numpy as np

# ================================
# 設定
# ================================
RESULT_DIR = "results_ogvc_emo_int"
csv_path = os.path.join(
    RESULT_DIR,
    "pred_results_with_emotion_label-bagus.csv"
)

out_path = os.path.join(
    RESULT_DIR,
    "eval_emotion_wise_intensity_mae_rmse-bagus.csv"
)

# ================================
# CSV読み込み
# ================================
df = pd.read_csv(csv_path)

# ================================
# 誤差計算（既存列を使用）
# ================================
# intensity_error = pred - true
df["abs_error"] = df["intensity_error"].abs()
df["sq_error"] = df["intensity_error"] ** 2

# ================================
# 1️⃣ モデル全体の MAE / RMSE
# ================================
overall_mae = df["abs_error"].mean()
overall_rmse = np.sqrt(df["sq_error"].mean())

print("=== Overall Intensity Performance ===")
print(f"MAE  : {overall_mae:.4f}")
print(f"RMSE : {overall_rmse:.4f}")

# ================================
# 2️⃣ 感情カテゴリ別 MAE / RMSE
# ================================
emotion_metrics = (
    df.groupby("emotion_true_label")
      .agg(
          MAE=("abs_error", "mean"),
          RMSE=("sq_error", lambda x: np.sqrt(x.mean())),
          Count=("abs_error", "count")
      )
      .reset_index()
)

print("\n=== Emotion-wise Intensity Performance ===")
print(emotion_metrics)

# ================================
# 3️⃣ 保存（論文・図表用）
# ================================
emotion_metrics.to_csv(out_path, index=False)
print(f"\nSaved emotion-wise MAE / RMSE to: {out_path}")

=== Overall Intensity Performance ===
MAE  : 0.6349
RMSE : 0.7999

=== Emotion-wise Intensity Performance ===
  emotion_true_label       MAE      RMSE  Count
0                ACC  0.653925  0.810426     64
1                ANG  0.667345  0.827965     64
2                ANT  0.685597  0.868099     64
3                DIS  0.692214  0.851128     64
4                FEA  0.587891  0.751928     64
5                JOY  0.584314  0.725720     67
6                SAD  0.647716  0.862761     68
7                SUR  0.574203  0.701042     77

Saved emotion-wise MAE / RMSE to: results_ogvc_emo_int/eval_emotion_wise_intensity_mae_rmse-bagus.csv
