<a href="https://colab.research.google.com/github/otome-rin/ensyu-2/blob/main/imitation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install librosa numpy matplotlib scipy fastdtw

Collecting fastdtw
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fastdtw
  Building wheel for fastdtw (setup.py) ... [?25l[?25hdone
  Created wheel for fastdtw: filename=fastdtw-0.3.4-cp310-cp310-linux_x86_64.whl size=512551 sha256=fd3c074af5d4a9a9f4ece243a6a8135e4b53160ad182ccde144d9e9ac9037113
  Stored in directory: /root/.cache/pip/wheels/73/c8/f7/c25448dab74c3acf4848bc25d513c736bb93910277e1528ef4
Successfully built fastdtw
Installing collected packages: fastdtw
Successfully installed fastdtw-0.3.4


In [17]:
import librosa
import numpy as np
from fastdtw import fastdtw
from scipy.signal import correlate
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from librosa.sequence import dtw
def extract_f0_dtw(audio_path):
    """Return the per-frame dominant pitch values of an audio file.

    The file is loaded with ``sr=None`` and passed to librosa's ``piptrack``.
    For every frame, the pitch of the bin with the largest magnitude is taken;
    frames whose selected pitch is not positive are skipped.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A list with one pitch value per retained frame, or the integer ``0``
        when no frame yields a positive pitch (note: a list, not a mean).
    """
    samples, sample_rate = librosa.load(audio_path, sr=None)
    pitches, magnitudes = librosa.core.piptrack(y=samples, sr=sample_rate)

    # Row index of the strongest bin in each frame (column).
    strongest_bin = magnitudes.argmax(axis=0)
    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]

    # Keep only frames where a pitch was actually detected.
    detected = [value for value in frame_pitch if value > 0]
    if not detected:
        return 0
    return detected
# 基本周波数の類似度を計算する関数
def compare_pitch(y1, sr1, y2, sr2):
    """Return the DTW distance between the F0 contours of two signals.

    Each signal is pitch-tracked with ``librosa.pyin`` in the 100-500 Hz range
    using its own sampling rate, NaN entries of the F0 track are dropped, and
    the two remaining contours are aligned with ``fastdtw``.

    Args:
        y1: Samples of the first signal.
        sr1: Sampling rate of ``y1`` in Hz.
        y2: Samples of the second signal.
        sr2: Sampling rate of ``y2`` in Hz.

    Returns:
        The distance reported by ``fastdtw`` (smaller means more similar).
    """
    f0_min_hz = 100
    f0_max_hz = 500

    def voiced_f0(signal, sample_rate):
        # Pass the real sampling rate: pyin otherwise assumes its default of
        # 22050 Hz, which mis-scales F0 for audio loaded at another rate.
        f0, _, _ = librosa.pyin(
            signal, fmin=f0_min_hz, fmax=f0_max_hz, sr=sample_rate
        )
        # Drop NaN entries so DTW only sees frames with a pitch estimate.
        return f0[~np.isnan(f0)]

    distance, _ = fastdtw(voiced_f0(y1, sr1), voiced_f0(y2, sr2))
    return distance

# 抑揚（ピッチとエネルギー変動）の類似度を計算する関数
def compare_intonation(y1, sr1, y2, sr2):
    """Return the DTW distance between the peak-normalized RMS envelopes of two signals.

    Each signal's frame-wise RMS energy (``librosa.feature.rms``) is divided by
    its own maximum so that overall loudness does not dominate, and the two
    envelopes are aligned with ``fastdtw``.

    Args:
        y1: Samples of the first signal.
        sr1: Sampling rate of ``y1``; accepted for a signature parallel to
            ``compare_pitch`` but not used by the RMS computation.
        y2: Samples of the second signal.
        sr2: Sampling rate of ``y2``; not used.

    Returns:
        The distance reported by ``fastdtw`` (smaller means more similar).
    """
    def normalized_rms(signal):
        envelope = librosa.feature.rms(y=signal)[0]
        peak = np.max(envelope) if len(envelope) else 0
        # An all-zero (silent) or empty envelope is left as is: dividing by a
        # zero peak would fill it with NaN and poison the DTW distance.
        return envelope / peak if peak > 0 else envelope

    distance, _ = fastdtw(normalized_rms(y1), normalized_rms(y2))
    return distance
def sub(audio_path1, audio_path2):
    """Load two audio files and return their pitch and intonation distances.

    Both files are read with ``librosa.load`` using its default arguments and
    handed to ``compare_pitch`` and ``compare_intonation``.

    Args:
        audio_path1: Path of the first audio file.
        audio_path2: Path of the second audio file.

    Returns:
        A tuple ``(pitch_distance, intonation_distance)``.
    """
    first_wave, first_rate = librosa.load(audio_path1)
    second_wave, second_rate = librosa.load(audio_path2)
    signals = (first_wave, first_rate, second_wave, second_rate)

    # Pitch (F0) comparison first, then the energy-envelope comparison.
    return compare_pitch(*signals), compare_intonation(*signals)

def extract_mfcc(file_path, n_mfcc=13):
    """Load an audio file and return its MFCC feature matrix.

    Args:
        file_path: Path of the audio file; it is loaded with ``sr=None``.
        n_mfcc: Number of MFCC coefficients to compute per frame.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for the loaded signal.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

# DTW距離を計算する関数
def dtw_distance(mfcc1, mfcc2):
    """Return the total DTW alignment cost between two feature matrices.

    Args:
        mfcc1: Feature matrix of the first recording.
        mfcc2: Feature matrix of the second recording.

    Returns:
        The bottom-right entry of the accumulated cost matrix produced by
        ``librosa.sequence.dtw`` with the Euclidean metric.
    """
    accumulated_cost, _warping_path = dtw(X=mfcc1, Y=mfcc2, metric='euclidean')

    # The last cell holds the cost of aligning both sequences end to end.
    return accumulated_cost[-1, -1]

# 話者の類似性を評価する関数
def evaluate_similarity(file1, file2):
    """Return the MFCC-based DTW distance between two audio files.

    Args:
        file1: Path of the first audio file.
        file2: Path of the second audio file.

    Returns:
        The value of ``dtw_distance`` applied to the MFCCs that
        ``extract_mfcc`` computes for each file (smaller means more similar).
    """
    # Extract features in argument order, then align them.
    features = [extract_mfcc(path) for path in (file1, file2)]
    return dtw_distance(*features)

# コサイン類似度を計算する関数
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance, so parallel vectors give 1,
    orthogonal vectors 0 and opposite vectors -1.
    """
    cosine_distance = cosine(vec1, vec2)
    return 1 - cosine_distance

# 話者の類似性を評価する関数

# 音声データをロードする関数
def load_audio(file_path, sr=22050):
    """Load an audio file and return only its samples.

    Args:
        file_path: Path of the audio file.
        sr: Sampling rate passed to ``librosa.load``.

    Returns:
        The sample array from ``librosa.load``; the returned rate is discarded.
    """
    samples, _rate = librosa.load(file_path, sr=sr)
    return samples



# メイン関数: 2つの音声ファイルを比較
def compare_audio(file1, file2, onset_threshold=100):
    """Print pitch, intonation, MFCC, rhythm and speed comparisons of two audio files.

    All printed pitch/intonation/MFCC values are DTW distances, so smaller
    means more similar even though the labels say "Similarity".

    Args:
        file1: Path of the first audio file.
        file2: Path of the second audio file; its name heads the printed report.
        onset_threshold: A clip is considered to start at the first sample
            strictly greater than this value (index 0 if there is none).
            NOTE(review): librosa.load is documented to return floating-point
            samples, which are not expected to reach 100, so with the default
            the onset is effectively always 0 and the speed score reduces to a
            plain length ratio. The default is kept so earlier results stay
            reproducible; pass a small amplitude to actually skip leading
            silence - confirm the intended value.

    Returns:
        The string ``"False"`` (after printing ``False``) when the F0 DTW path
        is empty, e.g. because a clip has no voiced frames; otherwise ``None``
        after printing the report.
    """
    sample_rate = 22050
    f0_min_hz, f0_max_hz = 100, 500

    audio1 = load_audio(file1, sr=sample_rate)
    audio2 = load_audio(file2, sr=sample_rate)

    # F0 track of each clip with NaN entries (frames without a pitch estimate) removed.
    voiced = []
    for samples in (audio1, audio2):
        f0, _, _ = librosa.pyin(
            samples, fmin=f0_min_hz, fmax=f0_max_hz, sr=sample_rate
        )
        voiced.append(f0[~np.isnan(f0)])

    # A single DTW pass gives both the pitch distance and the warping path used
    # for the rhythm score. This is the same pitch value sub() computes for
    # these files, obtained without loading and pitch-tracking them twice.
    pitch_distance, path = fastdtw(voiced[0], voiced[1])
    if len(path) == 0:
        print("False")
        return "False"

    intonation_distance = compare_intonation(audio1, sample_rate, audio2, sample_rate)
    mfcc_distance = evaluate_similarity(file1, file2)

    # Speaking speed: relative difference of the clip lengths measured from onset.
    length1 = len(audio1) - _first_index_above(audio1, onset_threshold)
    length2 = len(audio2) - _first_index_above(audio2, onset_threshold)
    speed_difference = _speed_difference(length1, length2)

    rhythm = _rhythm_score(path)

    print(file2)
    print(f"Pitch Similarity (F0): {pitch_distance}")
    print(f"Intonation Similarity (Energy): {intonation_distance}")
    print(f"MFCC Score: {mfcc_distance}")
    print(f"リズムの類似度: {rhythm}")
    print(f"速度の類似度: {speed_difference}")
    print("")


def _first_index_above(samples, threshold):
    """Return the index of the first sample strictly greater than threshold, or 0 if none."""
    above = np.asarray(samples) > threshold
    return int(np.argmax(above)) if above.any() else 0


def _speed_difference(length1, length2):
    """Return 1 - shorter/longer for two lengths, or 0 when they are equal."""
    if length1 == length2:
        return 0
    shorter, longer = sorted((length1, length2))
    return 1 - shorter / longer


def _rhythm_score(path):
    """Return how far a non-empty DTW path strays from the straight diagonal.

    The score is (path length - diagonal length) divided by the largest
    possible excess (last_i + last_j - diagonal): 0 for a path exactly as long
    as the diagonal, growing as the alignment detours.
    """
    last_i, last_j = path[-1][0], path[-1][1]
    diagonal = (last_i ** 2 + last_j ** 2) ** 0.5

    walked = 0
    for previous, current in zip(path, path[1:]):
        step_i = current[0] - previous[0]
        step_j = current[1] - previous[1]
        # Euclidean step length: 1 for a horizontal/vertical move, sqrt(2) for a diagonal one.
        walked += (step_i * step_i + step_j * step_j) ** 0.5

    largest_excess = last_i + last_j - diagonal
    if largest_excess == 0:
        # The path ends on an axis (one track has a single frame): no detour is
        # possible, so report 0 instead of dividing by zero.
        return 0.0
    return (walked - diagonal) / largest_excess





In [3]:
# Audio clips used in the comparisons below (paths relative to the working directory).

# Doraemon recordings.
doraemon1 = 'doraemon.mp3'
doraemon2 = 'doraemon02.mp3'
doraemon3 = 'doraemon03.mp3'
doraemon4 = 'doraemon04.mp3'
doraemon5 = 'doraemon05.mp3'

# "rin" recordings labelled similar / different.
similarrin1 = 'similar-rin-01.mp3'
similarrin2 = 'similar-rin-02.mp3'
differrin1 = 'differ-rin-01.mp3'
differrin2 = 'differ-rin-02.mp3'

# "Fumiki" recordings labelled similar / different
# (the variable names really are spelled "fumuki"; kept as is).
similarfumiki1 = 'similarFumiki1.mp3'
similarfumiki2 = 'similarFumiki2.mp3'
differfumuki1 = 'differFumiki1.mp3'
differfumuki2 = 'differFumiki2.mp3'

# Other clips.
effect = 'effect.mp3'
obake = 'obake.mp3'

# Numbered recordings labelled similar / different.
differ01 = 'differ01.mp3'
differ02 = 'differ02.mp3'
similar01 = 'similar01.mp3'
similar02 = 'similar02.mp3'
similar03 = 'similar03.mp3'

# File names read "nitenai dora" / "niteru dora".
nitenaidora = 'nitenaidora.mp3'
niterudora = 'niterudora.mp3'


In [18]:
# Score each candidate clip against the reference recording (reference passed first).
candidate_clips = (
    nitenaidora,
    niterudora,
    obake,
    effect,
    similar01,
    similar02,
    similar03,
    similarrin1,
    similarrin2,
    differrin1,
    differrin2,
)
for candidate in candidate_clips:
    compare_audio(doraemon1, candidate)

# Score the other Doraemon takes, this time with the reference passed second,
# so each report is headed by the reference file name.
for other_take in (doraemon2, doraemon3, doraemon4):
    compare_audio(other_take, doraemon1)

nitenaidora.mp3
Pitch Similarity (F0): 8888.52264399506
Intonation Similarity (Energy): 13.31968575849559
MFCC Score: 94135.92932343057
リズムの類似度: 0.5989121870588535
速度の類似度: 0.3279416935715884

niterudora.mp3
Pitch Similarity (F0): 3910.7835781478775
Intonation Similarity (Energy): 6.523570883106768
MFCC Score: 73951.88458059351
リズムの類似度: 0.5692746782155098
速度の類似度: 0.19471416259695484

False
effect.mp3
Pitch Similarity (F0): 7312.779044745884
Intonation Similarity (Energy): 18.378186202824754
MFCC Score: 76540.0135438501
リズムの類似度: 0.4084142595297199
速度の類似度: 0.5602409638554218

similar01.mp3
Pitch Similarity (F0): 1169.9292852126293
Intonation Similarity (Energy): 6.217707157489957
MFCC Score: 26924.919857399887
リズムの類似度: 0.3547717791816337
速度の類似度: 0.061857876712328785

similar02.mp3
Pitch Similarity (F0): 2216.5718128858925
Intonation Similarity (Energy): 8.27439116757002
MFCC Score: 56538.20869518648
リズムの類似度: 0.4422512255287041
速度の類似度: 0.24053102140341376

similar03.mp3
Pitch Similarity (F