In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import soundfile as sf

from IPython.display import Audio as ipy_audio
from quicktranscribe.wave import read_audio_section

In [None]:
# Audio recordings used in this notebook. These are bare file names, so they are
# resolved relative to the kernel's working directory.
gps_file = "GhodePeSawaar.mp3"  # not referenced by the cells visible in this notebook section
ngtj_file = "NasatesaGhariTuJevha.wav"  # input to the speech_recognition example
bhoop_file = "Omkar Dadarkar - Raag Bhoopali.mp3"  # input to the first librosa example
malkauns_file = "Ajoy Chakrabarty - Malkauns.mp3"  # input to the spectral-features section

### Librosa init example

In [None]:
# Bhoopali excerpt from 45:20 to 45:32 (offsets presumably in seconds — confirm
# against read_audio_section).
start = 45 * 60 + 20
end = 45 * 60 + 32
y_stereo, sr = read_audio_section(bhoop_file, start, end)
# librosa.to_mono averages over the leading (channel) axis; the transpose assumes
# read_audio_section returns (frames, channels) — TODO confirm.
y = librosa.to_mono(y_stereo.T)
# Bare last expression so the notebook displays the shape itself
y.shape

In [None]:
# Inline audio player for the mono excerpt, to check the selected section by ear
ipy_audio(data=y, rate=sr)

In [None]:
# res = librosa.pyin(y, fmin=50, fmax=200)
# plt.plot(res[0])
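# # What is f0, and why 3 arrays? pyin returns a tuple (f0, voiced_flag, voiced_probs);
# # f0 is the per-frame fundamental frequency in Hz (NaN in unvoiced frames), so res[0] is the pitch track.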

In [None]:
# ptrack = librosa.piptrack(y=y, sr=sr)

In [None]:
# Chromagram derived from a constant-Q transform.
# NOTE: despite its name, `cqt` holds chroma features (chroma_cqt output), not a raw CQT.
cqt = librosa.feature.chroma_cqt(y=y, sr=sr)

In [None]:
# Known tonic (Sa), presumably of the Bhoopali excerpt loaded above and presumably in Hz.
# 155.563492 Hz corresponds to D#3/Eb3 in 12-tone equal temperament with A4 = 440 Hz.
sa = 155.563492

In [None]:
# A chroma axis is indexed by pitch class (0-11, with 0 = C), not by frequency, so a
# Hz-based formatter cannot label it. Convert the tonic frequency to its pitch-class
# index and let specshow's Hindustani chroma axis ('chroma_h') do the svara labelling.
sa_index = int(np.round(librosa.hz_to_midi(sa))) % 12

fig, ax = plt.subplots(figsize=(10, 5))
img = librosa.display.specshow(
    cqt,
    sr=sr,
    y_axis='chroma_h',
    x_axis='time',
    Sa=sa_index,
    thaat='kalyan',  # Bhoopali is conventionally assigned to Kalyan thaat
    ax=ax,
)
# The plotted features come from chroma_cqt, so title the figure accordingly
ax.set(title='chroma_cqt')
ax.label_outer()
fig.colorbar(img, ax=ax)

Something looks wrong with this plot!<br>
But there are things we can try, such as the harmonic/percussive separation below.

In [None]:
# Harmonic/percussive source separation: split y into a harmonic and a percussive signal
y_harm, y_perc = librosa.effects.hpss(y)

In [None]:
# Listen to the harmonic component on its own
ipy_audio(data=y_harm, rate=sr)

In [None]:
# Listen to the percussive component on its own
ipy_audio(data=y_perc, rate=sr)

### Librosa spectral features

In [None]:
# Malkauns excerpt between offsets 6 and 16 (presumably seconds).
# Note that this rebinds start, end, y_stereo, sr and y from the Bhoopali section.
start, end = 6, 16
y_stereo, sr = read_audio_section(malkauns_file, start, end)
# Collapse the channels into a single mono signal
y = librosa.to_mono(y_stereo.T)

In [None]:
# Inline audio player for the mono Malkauns excerpt
ipy_audio(data=y, rate=sr)

STFT

In [None]:
# Power spectrogram from a 4096-point STFT. librosa's default hop is n_fft // 4; it is
# spelled out here because the display needs the same value to draw the time axis.
stft_n_fft = 4096
stft_hop = stft_n_fft // 4
S = np.abs(librosa.stft(y, n_fft=stft_n_fft, hop_length=stft_hop))**2
# S is already squared magnitude (power), so convert with power_to_db; amplitude_to_db
# would square it a second time and double the dB range.
S_db = librosa.power_to_db(S, ref=np.max)
# sr and hop_length must match the analysis, otherwise specshow falls back to its own
# defaults and mislabels both the time and the frequency axis.
img = librosa.display.specshow(S_db, sr=sr, hop_length=stft_hop, y_axis='log', x_axis='time')

In [None]:
# Chromagram computed from the STFT power spectrogram, labelled with Hindustani svara
# names (Bhairavi thaat, Sa at chroma index 1).
stft_n_fft = 4096
stft_hop = stft_n_fft // 4  # librosa's default hop for this n_fft, made explicit for the display
S = np.abs(librosa.stft(y, n_fft=stft_n_fft, hop_length=stft_hop))**2
chroma = librosa.feature.chroma_stft(S=S, sr=sr)
# Pass the analysis sr and hop so the time axis matches the audio
img = librosa.display.specshow(
    chroma,
    sr=sr,
    hop_length=stft_hop,
    y_axis='chroma_h',
    x_axis='time',
    thaat='bhairavi',
    Sa=1,
)

CQT

In [None]:
# Constant-Q magnitude spectrogram (librosa defaults), displayed in dB relative to its peak
C = np.abs(librosa.cqt(y, sr=sr))
C_db = librosa.amplitude_to_db(C, ref=np.max)
img = librosa.display.specshow(C_db, sr=sr, x_axis='time', y_axis='cqt_note')

In [None]:
# Chromagram computed from the constant-Q transform, labelled with Hindustani svara
# names (Bhairavi thaat, Sa at chroma index 1).
chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
# sr is passed so the time axis uses the clip's sample rate, not specshow's default
img = librosa.display.specshow(
    chroma_cq,
    sr=sr,
    y_axis='chroma_h',
    x_axis='time',
    thaat='bhairavi',
    Sa=1,
)

VQT

In [None]:
# Variable-Q magnitude spectrogram with 36 bins per octave, starting at C2.
vqt_fmin = librosa.note_to_hz('C2')
vqt_bins_per_octave = 36
V = np.abs(librosa.vqt(y, sr=sr, bins_per_octave=vqt_bins_per_octave, fmin=vqt_fmin))
# The display has to be given the same frequency layout as the transform; without
# fmin/bins_per_octave specshow assumes 12 bins per octave from C1 and mislabels the notes.
img = librosa.display.specshow(
    librosa.amplitude_to_db(V, ref=np.max),
    sr=sr,
    x_axis='time',
    y_axis='cqt_note',
    fmin=vqt_fmin,
    bins_per_octave=vqt_bins_per_octave,
)

In [None]:
# Variable-Q transform shown on a just-intonation (FJS) note axis.
# NOTE: despite its name, `chroma_vq` is the raw complex-valued VQT, not a chromagram.
vqt_fmin = librosa.note_to_hz('C2')
vqt_bins_per_octave = 36
chroma_vq = librosa.vqt(y, sr=sr, bins_per_octave=vqt_bins_per_octave, fmin=vqt_fmin)
# NOTE(review): the transform above uses librosa's default (equal-tempered) intervals
# while the axis below is drawn with intervals='ji5'; for the labels to line up exactly
# the VQT should presumably be computed with matching intervals — confirm before relying
# on the tick labels.
img = librosa.display.specshow(
    np.abs(chroma_vq),  # plot the magnitude explicitly; specshow cannot draw complex values as-is
    sr=sr,
    y_axis='vqt_fjs',
    x_axis='time',
    fmin=vqt_fmin,
    bins_per_octave=vqt_bins_per_octave,
    intervals='ji5',
    thaat='bhairavi',
    Sa=1,
)

Misc

In [None]:
""" Zero crossing:
The  higher this is, the less reliable our computed frequency is?? Why??
"""
# Frame-wise zero-crossing rate of the mono signal; keep the first row of the result
zc = librosa.feature.zero_crossing_rate(y=y)[0]
fig, ax = plt.subplots()
ax.plot(zc)

In [None]:
""" RMS:
Correlates with the energy in the signal over time. The RMS is probably over the f domain, so a sharp f --> a higher RMS??
"""
# Frame-wise RMS energy; keep the first row of the result.
# Because the audio is passed as y=, librosa computes this from frames of time-domain
# samples, not from a spectrogram.
rms = librosa.feature.rms(y=y)[0]
fig, ax = plt.subplots()
ax.plot(rms)

In [None]:
# Replay the excerpt for reference while reading the feature plots
ipy_audio(data=y, rate=sr)

In [None]:
# Mel-frequency cepstral coefficients (DCT type 2), one column per analysis frame
mfcc = librosa.feature.mfcc(y=y, sr=sr, dct_type=2)
fig, ax = plt.subplots(figsize=(10, 5))
# sr is passed so the time axis uses the clip's sample rate, not specshow's default
img = librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=ax)

### speech_recognition example

In [None]:
import speech_recognition as sr

In [None]:
r = sr.Recognizer()

# Reading Audio File and storing in a variable
with sr.AudioFile(ngtj_file) as source:
    audio_text = r.listen(source)

In [None]:
text = r.recognize_google(audio_text, language="mr")