<a href="https://colab.research.google.com/github/nobertomaciel/PLN-ANIMA/blob/main/UA2/PLN_speech_recognition_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
#  RECONHECIMENTO DE VOZ - SIMPLIFICADO (CNN)
# ============================================
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from jiwer import wer, cer
import tensorflow as tf
from tensorflow.keras import layers, models

# -------------------------------------------------------
# GERAR DADOS SINTÉTICOS (demonstrativo educacional)
# -------------------------------------------------------
def gerar_exemplo(sr=16000, rng=None):
    """Generate one synthetic (audio, label) pair for the demo dataset.

    The "audio" is Gaussian white noise and the label is drawn at random,
    independently of the audio, so the pair carries no learnable signal.

    Args:
        sr: Number of samples to generate. Defaults to 16000.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) to draw
            from, for reproducible data. When omitted, NumPy's global random
            state is used.

    Returns:
        A tuple ``(audio, label)``: ``audio`` is a 1-D float array with
        ``sr`` samples and ``label`` is the string ``"sim"`` or ``"não"``.
    """
    # The np.random module exposes the same two methods as a Generator, so
    # one code path serves both the global state and an explicit rng.
    source = np.random if rng is None else rng
    audio = source.standard_normal(sr)  # noise standing in for audio
    # choice() returns a NumPy string scalar; hand back a plain str.
    label = str(source.choice(["sim", "não"]))
    return audio, label

# Draw 200 synthetic examples, then split them into parallel lists:
# the raw waveforms and their integer class ids ("sim" -> 0, otherwise 1).
samples = [gerar_exemplo() for _ in range(200)]
audios = [clip for clip, _ in samples]
labels = [0 if word == "sim" else 1 for _, word in samples]

# -------------------------------------------------------
# CONVERTE PARA MEL-SPECTROGRAM
# -------------------------------------------------------
def audio_to_mel(aud, sr=16000):
    """Convert a waveform into a decibel-scaled mel spectrogram.

    Args:
        aud: Audio time series, passed to librosa as ``y``.
        sr: Sampling rate of ``aud`` in Hz. Defaults to 16000, the value
            that used to be hard-coded here.

    Returns:
        The mel power spectrogram converted to decibels, with the clip's
        own maximum power used as the reference level.
    """
    mel = librosa.feature.melspectrogram(y=aud, sr=sr)
    # ref=np.max expresses each clip relative to its own peak power.
    return librosa.power_to_db(mel, ref=np.max)

# One mel spectrogram per clip, stacked into a single array, with a
# trailing axis of size 1 appended as the last dimension.
X = np.expand_dims(np.array([audio_to_mel(clip) for clip in audios]), axis=-1)
y = np.asarray(labels)

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# -------------------------------------------------------
# MODELO CNN SIMPLES
# -------------------------------------------------------
# Small CNN: two conv/pool stages, then a dense head with a 2-way softmax.
# The input shape is declared with an explicit Input layer rather than an
# input_shape= kwarg on the first layer (Keras 3 warns about the latter).
model = models.Sequential([
    layers.Input(shape=X_train[0].shape),
    layers.Conv2D(16, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dense(2, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
# metrics=['accuracy'] additionally records accuracy in `history`
# next to the loss values.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# validation_split=0.2 holds out part of the training data for per-epoch validation.
history = model.fit(X_train, y_train, epochs=5, validation_split=0.2)

# -------------------------------------------------------
# PREDIÇÕES
# -------------------------------------------------------
# Per-class scores for every test sample; the predicted class id is the
# index of the highest score in each row.
pred = model.predict(X_test)
y_pred = pred.argmax(axis=1)

# ============================================
#  BLOCO PADRÃO DE MÉTRICAS (USADO NOS 3 CÓDIGOS)
# ============================================
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from nltk.translate.bleu_score import sentence_bleu

print("\n=== MÉTRICAS ===")
# Classification scores on the held-out test set, keyed by their caption.
scores = {
    "Acurácia:": accuracy_score(y_test, y_pred),
    "F1-score:": f1_score(y_test, y_pred, average="weighted"),
}
acc, f1 = scores["Acurácia:"], scores["F1-score:"]

for caption, score in scores.items():
    print(caption, score)

# ASR output is text, so map the class ids back to their words and score
# the resulting one-word "transcripts" with text metrics.
true_sentences = ["sim" if t == 0 else "não" for t in y_test]
pred_sentences = ["sim" if p == 0 else "não" for p in y_pred]

# Both metrics are called as (reference, hypothesis).
print("WER:", wer(true_sentences, pred_sentences))
print("CER:", cer(true_sentences, pred_sentences))

# Every sentence here is a single token, so only unigram precision is
# meaningful. NLTK's default weights combine 1- to 4-gram precisions, which
# pushes the score to ~0 (with warnings) even for a perfect match, so the
# weights are restricted to unigrams.
bleu = np.mean([sentence_bleu([ts.split()], ps.split(), weights=(1.0,))
                for ts, ps in zip(true_sentences, pred_sentences)])
print("BLEU:", bleu)

# Confusion matrix. labels=[0, 1] keeps it 2x2 even if only one class shows
# up in y_test and y_pred, and display_labels puts the words on the axes
# instead of the raw class ids.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm, display_labels=["sim", "não"])
disp.plot()
plt.title("Matriz de Confusão - Speech Recognition")
plt.show()

# Training vs. validation loss per epoch. Only loss is plotted here; each
# curve is labeled so the legend tells them apart.
plt.plot(history.history['loss'], label="loss")
plt.plot(history.history['val_loss'], label="val_loss")
plt.title("Loss / Val Loss")
plt.xlabel("epoch")
plt.legend()
plt.show()


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 803.2/803.2 kB 7.7 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for openai-whisper (pyproject.toml) ... done
Envie um arquivo de áudio (wav/mp3/m4a)


Saving F042-0000.mp3 to F042-0000.mp3
Carregando modelo...


100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 87.7MiB/s]


Transcrevendo áudio...

=== TRANSCRIÇÃO FINAL ===

 Pesquisa é uma coisa que muda toda a hora.

Arquivo salvo como: F042-0000.mp3.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>