<a href="https://colab.research.google.com/github/lmbernardo7520112/APIs-Serverless-dio-lmb/blob/master/audio_cd_cs_simplified_version_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wget  # If you haven't already installed wget

import os

def download_and_extract_dataset():
    """Downloads and extracts the Mini Speech Commands dataset."""
    print("Baixando o dataset Mini Speech Commands...")
    !wget http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip
    print("Dataset baixado com sucesso!")

    print("Extraindo o dataset Mini Speech Commands...")
    !unzip -q mini_speech_commands.zip
    print("Dataset extraído com sucesso!")

# Execute the download and extraction
download_and_extract_dataset()

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=9142d5513533ccdb60553ee4cb6b2b854c19af4222a3df7d07e464f867fc6810
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Baixando o dataset Mini Speech Commands...
--2024-11-18 21:14:50--  http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.187.207, 64.233.188.207, 64.233.189.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.187.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182082353 (174M) [application/zip]
S

In [25]:
import tensorflow as tf
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import glob
from tqdm import tqdm
from tensorflow.keras import layers, models

# Configurações iniciais
COMMANDS = ['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']
SAMPLE_RATE = 16000  # Taxa de amostragem
MAX_DURATION = 1  # Duração máxima em segundos

# Função para carregar áudio com fallback
def load_audio(file_path, sample_rate, max_duration, fallback_file=None):
    if not os.path.exists(file_path):
        print(f"Arquivo não encontrado: {file_path}")
        if fallback_file and os.path.exists(fallback_file):
            print(f"Usando arquivo substituto: {fallback_file}")
            file_path = fallback_file
        else:
            return None
    try:
        audio, sr = librosa.load(file_path, sr=sample_rate, mono=True, duration=max_duration)
        return audio, sr
    except Exception as e:
        print(f"Erro ao carregar o arquivo {file_path}: {e}")
        return None

# Função para extrair características emocionais
def extract_emotional_features(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")  # Arquivos substitutos
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        features = {
            'pitch_mean': np.mean(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'pitch_std': np.std(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'rms_energy': np.mean(librosa.feature.rms(y=audio)[0]),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(audio)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]),
            'spectral_bandwidth': np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]),
            'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]),
            'mfccs': librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        }
        return features
    except Exception as e:
        print(f"Erro ao extrair características de {file_path}: {e}")
        return None

# Função para extrair espectrograma mel
def extract_mel_spectrogram(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        return mel_spectrogram_db
    except Exception as e:
        print(f"Erro ao extrair espectrograma de {file_path}: {e}")
        return None

# Função para criar o modelo de rede neural
def create_emotional_command_model(input_shape):
    input_spec = layers.Input(shape=input_shape, name='spectogram_input')
    input_emotion = layers.Input(shape=(4,), name='emotional_features')

    x = layers.Conv2D(32, 3, activation='relu')(input_spec)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D()(x)
    x = layers.Conv2D(64, 3, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D()(x)
    x = layers.Flatten()(x)

    combined = layers.Concatenate()([x, input_emotion])
    x = layers.Dense(128, activation='relu')(combined)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(COMMANDS), activation='softmax')(x)

    model = models.Model(inputs=[input_spec, input_emotion], outputs=outputs)
    return model

# Função principal de treinamento
def train_emotional_command_recognition():
    spectrograms, emotional_features, labels = [], [], []
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None

    for command in COMMANDS:
        path = f"mini_speech_commands/{command}"
        if not os.path.exists(path):
            print(f"Diretório não encontrado: {path}")
            continue
        for file in tqdm(glob.glob(os.path.join(path, "*.wav"))):
            features = extract_emotional_features(file)
            if features is None:
                continue
            spectrogram = extract_mel_spectrogram(file)
            if spectrogram is None:
                continue
            spectrograms.append(spectrogram)
            emotional_features.append([features['pitch_mean'], features['pitch_std'], features['rms_energy'], features['zero_crossing_rate']])
            labels.append(COMMANDS.index(command))

    if not spectrograms:
        raise ValueError("Nenhum espectrograma foi extraído.")

    max_rows = max(spec.shape[0] for spec in spectrograms)
    max_cols = max(spec.shape[1] for spec in spectrograms)

    spectrograms = np.array([  # Pad spectrograms to the maximum size
        np.pad(spec, ((0, max_rows - spec.shape[0]), (0, max_cols - spec.shape[1])), mode='constant')
        for spec in spectrograms
    ])[..., np.newaxis]
    emotional_features = np.array(emotional_features)
    labels = np.array(labels)

    indices = np.random.permutation(len(spectrograms))
    training_idx = indices[:int(0.8 * len(indices))]
    test_idx = indices[int(0.8 * len(indices)):]

    # Create and compile the model
    model = create_emotional_command_model(spectrograms[0].shape)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # EarlyStopping and ReduceLROnPlateau callback setup
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # Monitor the loss for early stopping
            patience=5,  # Tuning patience to avoid stopping too early
            restore_best_weights=True,  # Restore the best weights based on val_loss
            mode='min'  # 'min' for monitoring loss, 'max' for accuracy
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',  # Reduce learning rate based on validation loss
            factor=0.5,  # Reduce LR by a factor of 0.5
            patience=3,  # Number of epochs to wait before reducing LR
            mode='min'  # 'min' for loss
        )
    ]

    # Train the model
    model.fit(
        [spectrograms[training_idx], emotional_features[training_idx]],
        labels[training_idx],
        validation_data=(
            [spectrograms[test_idx], emotional_features[test_idx]],
            labels[test_idx]
        ),
        epochs=50,
        batch_size=32,
        callbacks=callbacks
    )

    return model

# Execução
if __name__ == "__main__":
    model = train_emotional_command_recognition()
    test_file = "mini_speech_commands/stop/0a7c2a8d_nohash_0.wav"
    features = extract_emotional_features(test_file)
    print(features)


100%|██████████| 1000/1000 [00:51<00:00, 19.30it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.13it/s]
100%|██████████| 1000/1000 [00:57<00:00, 17.54it/s]
100%|██████████| 1000/1000 [00:55<00:00, 17.96it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.74it/s]
100%|██████████| 1000/1000 [00:52<00:00, 19.15it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.65it/s]
100%|██████████| 1000/1000 [00:52<00:00, 19.03it/s]


Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 234ms/step - accuracy: 0.2181 - loss: 3.6773 - val_accuracy: 0.4400 - val_loss: 1.5441 - learning_rate: 0.0010
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 251ms/step - accuracy: 0.3585 - loss: 1.6718 - val_accuracy: 0.4850 - val_loss: 1.4134 - learning_rate: 0.0010
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 236ms/step - accuracy: 0.4290 - loss: 1.4762 - val_accuracy: 0.4700 - val_loss: 1.3477 - learning_rate: 0.0010
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 234ms/step - accuracy: 0.4661 - loss: 1.3782 - val_accuracy: 0.1669 - val_loss: 3.5415 - learning_rate: 0.0010
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 238ms/step - accuracy: 0.4969 - loss: 1.2721 - val_accuracy: 0.3738 - val_loss: 1.9973 - learning_rate: 0.0010
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [2]:
import tensorflow as tf
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import glob
from tqdm import tqdm
from tensorflow.keras import layers, models
import random  # Added import for random file selection

# Configurações iniciais
COMMANDS = ['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']
SAMPLE_RATE = 16000  # Taxa de amostragem
MAX_DURATION = 1  # Duração máxima em segundos

# Função para carregar áudio com fallback
def load_audio(file_path, sample_rate, max_duration, fallback_file=None):
    if not os.path.exists(file_path):
        print(f"Arquivo não encontrado: {file_path}")
        if fallback_file and os.path.exists(fallback_file):
            print(f"Usando arquivo substituto: {fallback_file}")
            file_path = fallback_file
        else:
            return None
    try:
        audio, sr = librosa.load(file_path, sr=sample_rate, mono=True, duration=max_duration)
        return audio, sr
    except Exception as e:
        print(f"Erro ao carregar o arquivo {file_path}: {e}")
        return None

# Função para extrair características emocionais
def extract_emotional_features(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")  # Arquivos substitutos
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        features = {
            'pitch_mean': np.mean(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'pitch_std': np.std(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'rms_energy': np.mean(librosa.feature.rms(y=audio)[0]),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(audio)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]),
            'spectral_bandwidth': np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]),
            'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]),
            'mfccs': librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        }
        return features
    except Exception as e:
        print(f"Erro ao extrair características de {file_path}: {e}")
        return None

# Função para extrair espectrograma mel
def extract_mel_spectrogram(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        return mel_spectrogram_db
    except Exception as e:
        print(f"Erro ao extrair espectrograma de {file_path}: {e}")
        return None

# Função para criar o modelo de rede neural
def create_emotional_command_model(input_shape):
    input_spec = layers.Input(shape=input_shape, name='spectogram_input')
    input_emotion = layers.Input(shape=(4,), name='emotional_features')

    x = layers.Conv2D(32, 3, activation='relu')(input_spec)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D()(x)
    x = layers.Conv2D(64, 3, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling2D()(x)
    x = layers.Flatten()(x)

    combined = layers.Concatenate()([x, input_emotion])
    x = layers.Dense(128, activation='relu')(combined)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(COMMANDS), activation='softmax')(x)

    model = models.Model(inputs=[input_spec, input_emotion], outputs=outputs)
    return model

# Função principal de treinamento
def train_emotional_command_recognition():
    spectrograms, emotional_features, labels = [], [], []
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None

    for command in COMMANDS:
        path = f"mini_speech_commands/{command}"
        if not os.path.exists(path):
            print(f"Diretório não encontrado: {path}")
            continue
        for file in tqdm(glob.glob(os.path.join(path, "*.wav"))):
            features = extract_emotional_features(file)
            if features is None:
                continue
            spectrogram = extract_mel_spectrogram(file)
            if spectrogram is None:
                continue
            spectrograms.append(spectrogram)
            emotional_features.append([features['pitch_mean'], features['pitch_std'], features['rms_energy'], features['zero_crossing_rate']])
            labels.append(COMMANDS.index(command))

    if not spectrograms:
        raise ValueError("Nenhum espectrograma foi extraído.")

    max_rows = max(spec.shape[0] for spec in spectrograms)
    max_cols = max(spec.shape[1] for spec in spectrograms)

    spectrograms = np.array([  # Pad spectrograms to the maximum size
        np.pad(spec, ((0, max_rows - spec.shape[0]), (0, max_cols - spec.shape[1])), mode='constant')
        for spec in spectrograms
    ])[..., np.newaxis]
    emotional_features = np.array(emotional_features)
    labels = np.array(labels)

    indices = np.random.permutation(len(spectrograms))
    training_idx = indices[:int(0.8 * len(indices))]
    test_idx = indices[int(0.8 * len(indices)):]

    # Create and compile the model
    model = create_emotional_command_model(spectrograms[0].shape)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # EarlyStopping and ReduceLROnPlateau callback setup
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',  # Monitor the loss for early stopping
            patience=5,  # Tuning patience to avoid stopping too early
            restore_best_weights=True,  # Restore the best weights based on val_loss
            mode='min'  # 'min' for monitoring loss, 'max' for accuracy
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',  # Reduce learning rate based on validation loss
            factor=0.5,  # Reduce LR by a factor of 0.5
            patience=3,  # Number of epochs to wait before reducing LR
            mode='min'  # 'min' for loss
        )
    ]

    # Train the model
    model.fit(
        [spectrograms[training_idx], emotional_features[training_idx]],
        labels[training_idx],
        validation_data=(
            [spectrograms[test_idx], emotional_features[test_idx]],
            labels[test_idx]
        ),
        epochs=50,
        batch_size=32,
        callbacks=callbacks
    )

    return model

if __name__ == "__main__":
    # Treina modelo
    model = train_emotional_command_recognition()

    # Exemplo de análise de um comando
    test_file = "mini_speech_commands/stop/0a7c2a8d_nohash_0.wav"

    # Check if the test file exists
    if not os.path.exists(test_file):
        print(f"Arquivo não encontrado: {test_file}")

        # Choose a random file from the dataset
        command = random.choice(COMMANDS)  # Select a random command
        file_list = glob.glob(os.path.join("mini_speech_commands", command, "*.wav"))  # Get list of files
        test_file = random.choice(file_list)  # Choose a random file

        print(f"Usando arquivo aleatório: {test_file}")

    features = extract_emotional_features(test_file)
    print(features)


100%|██████████| 1000/1000 [01:15<00:00, 13.24it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.17it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.00it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.16it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.00it/s]
100%|██████████| 1000/1000 [00:55<00:00, 17.90it/s]
100%|██████████| 1000/1000 [00:54<00:00, 18.22it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.08it/s]


Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 253ms/step - accuracy: 0.2186 - loss: 3.1887 - val_accuracy: 0.3806 - val_loss: 1.6284 - learning_rate: 0.0010
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 229ms/step - accuracy: 0.3225 - loss: 1.6628 - val_accuracy: 0.3831 - val_loss: 1.6772 - learning_rate: 0.0010
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 235ms/step - accuracy: 0.3633 - loss: 1.5257 - val_accuracy: 0.4494 - val_loss: 1.4442 - learning_rate: 0.0010
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 227ms/step - accuracy: 0.4074 - loss: 1.4171 - val_accuracy: 0.5144 - val_loss: 1.2688 - learning_rate: 0.0010
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 227ms/step - accuracy: 0.4521 - loss: 1.2978 - val_accuracy: 0.5481 - val_loss: 1.1434 - learning_rate: 0.0010
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [24]:
import tensorflow as tf
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import glob
from tqdm import tqdm
from tensorflow.keras import layers, models

# Configurações iniciais
COMMANDS = ['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']
SAMPLE_RATE = 16000  # Taxa de amostragem
MAX_DURATION = 1  # Duração máxima em segundos

# Função para carregar áudio com fallback
def load_audio(file_path, sample_rate, max_duration, fallback_file=None):
    if not os.path.exists(file_path):
        print(f"Arquivo não encontrado: {file_path}")
        if fallback_file and os.path.exists(fallback_file):
            print(f"Usando arquivo substituto: {fallback_file}")
            file_path = fallback_file
        else:
            return None
    try:
        audio, sr = librosa.load(file_path, sr=sample_rate, mono=True, duration=max_duration)
        return audio, sr
    except Exception as e:
        print(f"Erro ao carregar o arquivo {file_path}: {e}")
        return None

# Função para extrair características emocionais
def extract_emotional_features(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")  # Arquivos substitutos
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        features = {
            'pitch_mean': np.mean(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'pitch_std': np.std(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'rms_energy': np.mean(librosa.feature.rms(y=audio)[0]),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(audio)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]),
            'spectral_bandwidth': np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]),
            'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]),
            'mfccs': librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        }
        return features
    except Exception as e:
        print(f"Erro ao extrair características de {file_path}: {e}")
        return None

# Função para extrair espectrograma mel
def extract_mel_spectrogram(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        return mel_spectrogram_db
    except Exception as e:
        print(f"Erro ao extrair espectrograma de {file_path}: {e}")
        return None

# Função principal de treinamento
def train_emotional_command_recognition():
    spectrograms, emotional_features, labels = [], [], []
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None

    for command in COMMANDS:
        path = f"mini_speech_commands/{command}"
        if not os.path.exists(path):
            print(f"Diretório não encontrado: {path}")
            continue
        for file in tqdm(glob.glob(os.path.join(path, "*.wav"))):
            features = extract_emotional_features(file)
            if features is None:
                continue
            spectrogram = extract_mel_spectrogram(file)
            if spectrogram is None:
                continue
            spectrograms.append(spectrogram)
            emotional_features.append([features['pitch_mean'], features['pitch_std'], features['rms_energy'], features['zero_crossing_rate']])
            labels.append(COMMANDS.index(command))

    if not spectrograms:
        raise ValueError("Nenhum espectrograma foi extraído.")

    max_rows = max(spec.shape[0] for spec in spectrograms)
    max_cols = max(spec.shape[1] for spec in spectrograms)

    spectrograms = np.array([
        np.pad(spec, ((0, max_rows - spec.shape[0]), (0, max_cols - spec.shape[1])), mode='constant')
        for spec in spectrograms
    ])[..., np.newaxis]
    emotional_features = np.array(emotional_features)
    labels = np.array(labels)

    indices = np.random.permutation(len(spectrograms))
    training_idx = indices[:int(0.8 * len(indices))]
    test_idx = indices[int(0.8 * len(indices)):]

    model = create_emotional_command_model(spectrograms[0].shape)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    model.fit(
        [spectrograms[training_idx], emotional_features[training_idx]],
        labels[training_idx],
        validation_data=( [spectrograms[test_idx], emotional_features[test_idx]], labels[test_idx]),
        epochs=50,
        batch_size=32,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=5),
            tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
        ]
    )
    return model

# Execução
if __name__ == "__main__":
    model = train_emotional_command_recognition()
    test_file = "mini_speech_commands/stop/0a7c2a8d_nohash_0.wav"
    features = extract_emotional_features(test_file)
    print(features)


100%|██████████| 1000/1000 [00:52<00:00, 19.18it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.65it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.74it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.82it/s]
100%|██████████| 1000/1000 [00:51<00:00, 19.44it/s]
100%|██████████| 1000/1000 [00:53<00:00, 18.83it/s]
100%|██████████| 1000/1000 [00:52<00:00, 18.93it/s]
100%|██████████| 1000/1000 [00:51<00:00, 19.36it/s]


Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 232ms/step - accuracy: 0.2291 - loss: 3.3219 - val_accuracy: 0.3919 - val_loss: 1.6209 - learning_rate: 0.0010
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 226ms/step - accuracy: 0.3597 - loss: 1.6277 - val_accuracy: 0.5375 - val_loss: 1.2810 - learning_rate: 0.0010
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 248ms/step - accuracy: 0.4268 - loss: 1.4465 - val_accuracy: 0.5869 - val_loss: 1.2056 - learning_rate: 0.0010
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 235ms/step - accuracy: 0.4672 - loss: 1.3224 - val_accuracy: 0.5019 - val_loss: 1.4374 - learning_rate: 0.0010
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 226ms/step - accuracy: 0.4944 - loss: 1.2313 - val_accuracy: 0.6162 - val_loss: 1.0339 - learning_rate: 0.0010
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [25]:
model.save('emotional_command_model.keras')

In [26]:
import os
print(os.listdir('.'))  # Lista os arquivos no diretório atual

['.config', 'mini_speech_commands', '__MACOSX', 'mini_speech_commands.zip', 'emotional_command_model.keras', 'sample_data']


In [31]:
import random
import glob
import os
import numpy as np
import tensorflow as tf
import librosa
import librosa.display

# Configurações iniciais (mantendo consistência com o código original)
COMMANDS = ['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']
SAMPLE_RATE = 16000
MAX_DURATION = 1

def load_audio(file_path, sample_rate, max_duration, fallback_file=None):
    if not os.path.exists(file_path):
        print(f"Arquivo não encontrado: {file_path}")
        if fallback_file and os.path.exists(fallback_file):
            print(f"Usando arquivo substituto: {fallback_file}")
            file_path = fallback_file
        else:
            return None
    try:
        audio, sr = librosa.load(file_path, sr=sample_rate, mono=True, duration=max_duration)
        return audio, sr
    except Exception as e:
        print(f"Erro ao carregar o arquivo {file_path}: {e}")
        return None

def extract_emotional_features(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        features = {
            'pitch_mean': np.mean(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'pitch_std': np.std(librosa.yin(audio, fmin=100, fmax=800, sr=sr)),
            'rms_energy': np.mean(librosa.feature.rms(y=audio)[0]),
            'zero_crossing_rate': np.mean(librosa.feature.zero_crossing_rate(audio)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]),
            'spectral_bandwidth': np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]),
            'spectral_rolloff': np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]),
            'mfccs': librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        }
        return features
    except Exception as e:
        print(f"Erro ao extrair características de {file_path}: {e}")
        return None

def extract_mel_spectrogram(file_path):
    fallback_files = glob.glob("mini_speech_commands/stop/*.wav")
    fallback_file = fallback_files[0] if fallback_files else None
    data = load_audio(file_path, SAMPLE_RATE, MAX_DURATION, fallback_file)
    if data is None:
        return None

    audio, sr = data
    try:
        mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
        mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        return mel_spectrogram_db
    except Exception as e:
        print(f"Erro ao extrair espectrograma de {file_path}: {e}")
        return None

def get_max_dimensions():
    """
    Calcula as dimensões máximas dos espectrogramas no conjunto de dados
    """
    max_rows = 0
    max_cols = 0

    for command in COMMANDS:
        path = f"mini_speech_commands/{command}"
        if not os.path.exists(path):
            continue

        for file in glob.glob(os.path.join(path, "*.wav")):
            spectrogram = extract_mel_spectrogram(file)
            if spectrogram is not None:
                max_rows = max(max_rows, spectrogram.shape[0])
                max_cols = max(max_cols, spectrogram.shape[1])

    return max_rows, max_cols

def random_file_test(model, num_tests=3):
    """
    Realiza testes com arquivos aleatórios

    Args:
        model: Modelo treinado
        num_tests: Número de testes a realizar
    """
    # Obter dimensões máximas para padding
    max_rows, max_cols = get_max_dimensions()

    results = []

    for i in range(num_tests):
        print(f"\nRealizando teste {i+1}/{num_tests}")

        # Escolher comando aleatório
        random_command = random.choice(COMMANDS)
        command_path = f"mini_speech_commands/{random_command}"

        # Selecionar arquivo aleatório
        files = glob.glob(os.path.join(command_path, "*.wav"))
        if not files:
            print(f"Nenhum arquivo encontrado para {random_command}")
            continue

        test_file = random.choice(files)
        print(f"Arquivo selecionado: {test_file}")

        # Extrair características
        features = extract_emotional_features(test_file)
        spectrogram = extract_mel_spectrogram(test_file)

        if features is None or spectrogram is None:
            print(f"Falha ao processar {test_file}")
            continue

        # Preparar dados para predição
        spectrogram = np.pad(
            spectrogram,
            ((0, max_rows - spectrogram.shape[0]),
             (0, max_cols - spectrogram.shape[1])),
            mode='constant'
        )[np.newaxis,..., np.newaxis]

        emotional_features = np.array([
            [features['pitch_mean'], features['pitch_std'],
             features['rms_energy'], features['zero_crossing_rate']]
        ])

        # Fazer predição
        prediction = model.predict([spectrogram, emotional_features], verbose=0)
        predicted_class = COMMANDS[np.argmax(prediction)]
        true_class = random_command

        # Registrar resultado
        result = {
            'file': test_file,
            'true_class': true_class,
            'predicted_class': predicted_class,
            'probabilities': dict(zip(COMMANDS, prediction[0].tolist())),
            'correct': predicted_class == true_class
        }
        results.append(result)

        # Imprimir resultado do teste
        print(f"\nResultado do teste {i+1}:")
        print(f"Classe Verdadeira: {true_class}")
        print(f"Classe Predita: {predicted_class}")
        print("Probabilidades:")
        for cmd, prob in result['probabilities'].items():
            print(f"{cmd}: {prob*100:.2f}%")
        print(f"Resultado: {'Correto!' if result['correct'] else 'Incorreto.'}")

    # Resumo dos testes
    correct_tests = sum(result['correct'] for result in results)
    print(f"\n=== Resumo dos Testes ===")
    print(f"Total de Testes: {len(results)}")
    print(f"Testes Corretos: {correct_tests}")
    print(f"Precisão: {correct_tests/len(results)*100:.2f}%")

    return results

In [32]:
# Carregar o modelo e realizar os testes
model = tf.keras.models.load_model('emotional_command_model.keras')
resultados = random_file_test(model, num_tests=5)


Realizando teste 1/5
Arquivo selecionado: mini_speech_commands/go/3bfd30e6_nohash_0.wav

Resultado do teste 1:
Classe Verdadeira: go
Classe Predita: go
Probabilidades:
down: 7.24%
go: 62.36%
left: 0.00%
no: 28.13%
right: 2.27%
stop: 0.00%
up: 0.00%
yes: 0.00%
Resultado: Correto!

Realizando teste 2/5
Arquivo selecionado: mini_speech_commands/left/dbaf8fc6_nohash_0.wav

Resultado do teste 2:
Classe Verdadeira: left
Classe Predita: left
Probabilidades:
down: 0.00%
go: 0.00%
left: 93.82%
no: 0.15%
right: 5.34%
stop: 0.00%
up: 0.27%
yes: 0.41%
Resultado: Correto!

Realizando teste 3/5
Arquivo selecionado: mini_speech_commands/left/cb5d2c6e_nohash_1.wav

Resultado do teste 3:
Classe Verdadeira: left
Classe Predita: left
Probabilidades:
down: 0.00%
go: 0.00%
left: 95.55%
no: 0.66%
right: 1.96%
stop: 0.00%
up: 1.78%
yes: 0.05%
Resultado: Correto!

Realizando teste 4/5
Arquivo selecionado: mini_speech_commands/up/f428ca69_nohash_0.wav

Resultado do teste 4:
Classe Verdadeira: up
Classe Predit