# FDICAによる多チャネル音源分離

In [None]:
%%shell
# Fetch the repository that holds this example (egs/bss-example/fdica) and the
# sources under src/ that are put on the Python path further below.
git clone https://github.com/tky823/audio_source_separation.git
# soundfile is used later (as `sf`) to read the WAV files.
pip install soundfile

In [None]:
# Work from the FDICA example directory of the cloned repository: the relative
# paths used below (./prepare.sh, ./data/..., ../../../src) are resolved from here.
%cd "/content/audio_source_separation/egs/bss-example/fdica"

## データの準備
[CMU ARCTICデータベース](http://www.festvox.org/cmu_arctic/)の音声，および[Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/)のインパルス応答を用いて，多チャネルの混合音をシミュレーションする．

In [None]:
%%shell
# Source the example's data-preparation script in this shell.
# Presumably it creates the WAV files under ./data that are read below — see prepare.sh.
. ./prepare.sh

In [None]:
import sys

# Make the repository's src/ directory importable (the `bss` package is imported
# from it below). The path is relative to the working directory set by %cd above.
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path.
if "../../../src" not in sys.path:
    sys.path.append("../../../src")

In [None]:
import numpy as np
import scipy.signal as ss
import soundfile as sf
import IPython.display as ipd
import matplotlib.pyplot as plt

In [None]:
from bss.fdica import NaturalGradLaplaceFDICA

In [None]:
# Default resolution (dots per inch) for the figures created in this notebook.
plt.rcParams['figure.dpi'] = 200

窓長などについて
- $T_{60}=160$ [ms]の残響のインパルス応答を使用する．
- 各音源の空間特性がランク$1$でモデル化できる（残響が窓長に比べて十分短い）という仮定を満たすように，フーリエ変換の窓長は残響時間より長い$4096$サンプル（サンプリング周波数$16$ kHzで$256$ [ms]）としている．
- シフト長は，窓長の半分の$2048$サンプルとしている．

In [None]:
# STFT analysis settings, in samples: window (FFT) length and frame shift.
# The shift is half of the window length.
fft_size = 4096
hop_size = fft_size // 2

## 2音源分離

In [None]:
# Source images: each speaker's utterance after convolution with an impulse
# response, as observed at microphones 3 and 4. Going by the file names, speaker
# aew is placed at 60 degrees and speaker axb at 300 degrees.
aew_mic3, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic3.wav")
aew_mic4, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav")
axb_mic3, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic3.wav")
axb_mic4, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav")

# The observation at each microphone is the sum of the source images at that microphone.
x_mic3 = aew_mic3 + axb_mic3
x_mic4 = aew_mic4 + axb_mic4

# One row per microphone. The first dimension is the number of channels, which
# equals the number of sources in this setup; T is the length in samples.
x = np.vstack((x_mic3, x_mic4))
n_sources, T = x.shape

### インパルス応答畳み込み後の音

In [None]:
# Speaker aew as observed at microphone 3 (source image before mixing).
ipd.Audio(aew_mic3, rate=sr)

In [None]:
# Speaker axb as observed at microphone 3 (source image before mixing).
ipd.Audio(axb_mic3, rate=sr)

### 混合音

In [None]:
# Mixture at the first channel of x (microphone 3).
ipd.Audio(x[0], rate=sr)

In [None]:
# Mixture at the second channel of x (microphone 4).
ipd.Audio(x[1], rate=sr)

### FDICAの実行

In [None]:
# Short-time Fourier transform of every microphone channel of x.
# scipy's `noverlap` is the number of samples shared by adjacent frames, not the
# frame shift, so it is derived from the hop size. Passing hop_size directly only
# worked because hop_size == fft_size // 2 makes the two values coincide.
_, _, X = ss.stft(x, nperseg=fft_size, noverlap=fft_size - hop_size)

In [None]:
# Fix NumPy's global random seed before building the separator so that runs are
# repeatable (assumes the FDICA implementation draws from np.random — TODO confirm).
np.random.seed(111)
# Natural-gradient FDICA with a Laplace source model, going by the class name.
# is_holonomic=True is passed explicitly; see bss.fdica for what the flag selects.
fdica = NaturalGradLaplaceFDICA(is_holonomic=True)

In [None]:
# Run the separator on the mixture spectrogram X with iteration=200.
# Y is the result that is transformed back to the time domain in the next cell.
Y = fdica(X, iteration=200)

In [None]:
# Inverse STFT back to the time domain. `noverlap` is the overlap between adjacent
# frames (window length minus hop), matching the framing of the forward transform;
# passing hop_size directly was only correct because it is half the window.
_, y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples so the output is no longer than the mixture.
y = y[:, :T]

### 分離音

In [None]:
# First separated signal.
ipd.Audio(y[0], rate=sr)

In [None]:
# Second separated signal.
ipd.Audio(y[1], rate=sr)

In [None]:
# Learning curve: the loss values stored on the FDICA object, plotted against
# their index (presumably one value per iteration).
fig, ax = plt.subplots()
ax.plot(fdica.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()

## 3音源分離

In [None]:
# Source images at microphones 2, 4 and 5, grouped by speaker. Going by the file
# names, aew is placed at 60 degrees, axb at 300 degrees and bdl at 330 degrees.
aew_mic2, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic2.wav")
aew_mic4, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav")
aew_mic5, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic5.wav")

axb_mic2, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic2.wav")
axb_mic4, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav")
axb_mic5, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic5.wav")

bdl_mic2, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic2.wav")
bdl_mic4, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic4.wav")
bdl_mic5, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic5.wav")

# The observation at each microphone is the sum of the three source images there.
x_mic2 = aew_mic2 + axb_mic2 + bdl_mic2
x_mic4 = aew_mic4 + axb_mic4 + bdl_mic4
x_mic5 = aew_mic5 + axb_mic5 + bdl_mic5

# One row per microphone. The first dimension is the number of channels, which
# equals the number of sources in this setup; T is the length in samples.
x = np.vstack((x_mic2, x_mic4, x_mic5))
n_sources, T = x.shape

### インパルス応答畳み込み後の音

In [None]:
# Speaker aew as observed at microphone 2 (source image before mixing).
ipd.Audio(aew_mic2, rate=sr)

In [None]:
# Speaker axb as observed at microphone 2 (source image before mixing).
ipd.Audio(axb_mic2, rate=sr)

In [None]:
# Speaker bdl as observed at microphone 2 (source image before mixing).
ipd.Audio(bdl_mic2, rate=sr)

### 混合音

In [None]:
# Mixture at the first channel of x (microphone 2).
ipd.Audio(x[0], rate=sr)

In [None]:
# Mixture at the second channel of x (microphone 4).
ipd.Audio(x[1], rate=sr)

In [None]:
# Mixture at the third channel of x (microphone 5).
ipd.Audio(x[2], rate=sr)

### FDICAの実行

In [None]:
# Short-time Fourier transform of every microphone channel of x.
# scipy's `noverlap` is the number of samples shared by adjacent frames, not the
# frame shift, so it is derived from the hop size. Passing hop_size directly only
# worked because hop_size == fft_size // 2 makes the two values coincide.
_, _, X = ss.stft(x, nperseg=fft_size, noverlap=fft_size - hop_size)

In [None]:
# Fix NumPy's global random seed before building the separator so that runs are
# repeatable (assumes the FDICA implementation draws from np.random — TODO confirm).
np.random.seed(111)
# NOTE(review): unlike the 2-source example, is_holonomic is not passed here, so
# the class default applies — confirm that this difference is intentional.
fdica = NaturalGradLaplaceFDICA()

In [None]:
# Run the separator on the mixture spectrogram X with iteration=200.
# Y is the result that is transformed back to the time domain in the next cell.
Y = fdica(X, iteration=200)

In [None]:
# Inverse STFT back to the time domain. `noverlap` is the overlap between adjacent
# frames (window length minus hop), matching the framing of the forward transform;
# passing hop_size directly was only correct because it is half the window.
_, y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples so the output is no longer than the mixture.
y = y[:, :T]

### 分離音

In [None]:
# First separated signal.
ipd.Audio(y[0], rate=sr)

In [None]:
# Second separated signal.
ipd.Audio(y[1], rate=sr)

In [None]:
# Third separated signal.
ipd.Audio(y[2], rate=sr)

In [None]:
# Learning curve: the loss values stored on the FDICA object, plotted against
# their index (presumably one value per iteration).
fig, ax = plt.subplots()
ax.plot(fdica.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()