# Speech-based stress detection using a cross-modal transfer learning approach: generating images from audio files

In [45]:
import glob
import os
import time

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Audio

In [46]:
# Sampling rate, in Hz, passed to librosa.load for every recording.
SAMPLE_RATE = 16_000

# Folder that is globbed for *.wav inputs, and folder the spectrogram PNGs are written to.
DIR_AUDIOS, DIR_IMAGES = '../data/processed_audios/', '../data/images/'

In [47]:
# Turn matplotlib's interactive mode off: the figures created below are only
# saved to disk with savefig and then closed, not displayed.
plt.ioff()

## Listing the audio files in the input directory

In [48]:
# Collect every .wav file in DIR_AUDIOS as (path, stem) pairs, where the stem is
# the file name without directory and without the final extension.
# - sorted(): glob returns entries in arbitrary order, so sort to make positional
#   indexing into this list reproducible across machines and runs.
# - os.path.basename/splitext: keeps dots inside the file name (only the last
#   extension is stripped) and honours the platform's path separator.
files_audios = [
    (wav_path, os.path.splitext(os.path.basename(wav_path))[0])
    for wav_path in sorted(glob.glob(DIR_AUDIOS + "*.wav"))
]
print(f'Number audio files: {len(files_audios)}')

Number audio files: 32


In [49]:
# Work on a single recording (the third entry of files_audios) while keeping
# the list-of-(path, stem) shape that the generation loop iterates over.
example_index = 2
files_audios_example = [files_audios[example_index]]
files_audios_example

[('../data/processed_audios/ID_852630991_baseline.wav',
  'ID_852630991_baseline')]

In [50]:
# Mel-spectrogram parameters shared by every chunk rendered below.
n_fft = 256                 # freq resolution (FFT size, in samples)
hop_length = n_fft // 2     # step between consecutive frames: half of n_fft

n_mels = 20  # number of Mel bands to generate

# Frequency range (Hz) covered by the Mel filter bank.
fmin = 50
# Upper bound is half the sampling rate. Use the SAMPLE_RATE constant here:
# the lower-case `sample_rate` is only assigned later, inside the generation
# loop, so referencing it in this cell fails on a fresh kernel.
fmax = SAMPLE_RATE / 2

In [55]:
# Render one mel-spectrogram PNG per full one-second chunk of every recording in
# files_audios_example. Images are written to DIR_IMAGES and named
# "<audio stem>_<chunk index>.png".
start = time.time()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(DIR_IMAGES, exist_ok=True)

for i, (path, filename) in enumerate(files_audios_example):

    print(f'File {i+1} / {len(files_audios_example)}: {path}')

    # Load at SAMPLE_RATE so that one second is always `sample_rate` samples.
    clip, sample_rate = librosa.load(path, sr=SAMPLE_RATE)
    print('\t - Sample Rate   {} Hz'.format(sample_rate))
    print('\t - Clip Length   {:3.2f} seconds'.format(len(clip)/sample_rate))

    # Only whole seconds are rendered; a trailing partial second is dropped.
    n_chunks = len(clip) // sample_rate

    for j in range(n_chunks):

        file_save = DIR_IMAGES+filename+'_'+str(j)+'.png'

        # '\r' keeps the progress report on a single, continuously updated line.
        print('\t - Chunk {} / {}: {}'.format(j+1, n_chunks, file_save), end='\r', flush=True)

        # Samples belonging to the j-th second of the recording.
        clip_chunk = clip[j*sample_rate:(j+1)*sample_rate]

        fig, ax = plt.subplots(figsize=(10, 10))
        try:
            # The audio must be passed as `y=`: recent librosa releases accept
            # it as a keyword argument only. power=1.0 yields a magnitude
            # (not power) spectrogram, hence amplitude_to_db below.
            mel_spec = librosa.feature.melspectrogram(y=clip_chunk, sr=sample_rate,
                                                      n_fft=n_fft, hop_length=hop_length,
                                                      n_mels=n_mels, power=1.0,
                                                      fmin=fmin, fmax=fmax)

            # dB scale relative to the loudest bin of this chunk.
            mel_spec_db = librosa.amplitude_to_db(mel_spec, ref=np.max)
            librosa.display.specshow(mel_spec_db, x_axis='ms', y_axis='mel',
                                     sr=sample_rate, hop_length=hop_length,
                                     fmin=fmin, fmax=fmax, ax=ax)

            # Hide ticks, labels and frame so the saved image shows no axis decorations.
            ax.axis('off')
            fig.savefig(file_save)
        finally:
            # Always release the figure, even if rendering or saving failed;
            # otherwise figures accumulate in memory across chunks.
            plt.close(fig)


print('\n\nExecution time: {:.2f} seconds'.format(time.time() - start))

File 1 / 1: ../data/processed_audios/ID_852630991_baseline.wav
	 - Sample Rate   16000 Hz
	 - Clip Length   112.00 seconds
	 - Chunk 112 / 112: ../data/images/ID_852630991_baseline_111.png
Execution time: 8.95 seconds
