In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import librosa
import soundfile as sf
from IPython import display
import os
import cv2
import gc
import random

In [None]:
#bloco de código para criar os arquivos de áudio com 10 segundos que serão usados

portuguese = '/content/drive/MyDrive/audios/português'
english = '/content/drive/MyDrive/audios/inglês'

dirs_portuguese = os.listdir(portuguese)
dirs_english = os.listdir(english)[0:len(dirs_portuguese)]

k = []
counter = -1
for p, e in zip(dirs_portuguese, dirs_english):
  for init in range(10, 240, 8):
    counter += 1
    try:
      y, sr = librosa.load(portuguese + '/' + p, offset=init, duration=10.0)
      sf.write(f'/content/drive/MyDrive/audios/inputs/portuguese_inputs/{counter}.wav', y, sr)
    except:
      k.append(portuguese + '/' + p)

    try:
      y, sr = librosa.load(english + '/' + e, offset=init, duration=10.0)
      sf.write(f'/content/drive/MyDrive/audios/inputs/english_inputs/{counter}.wav', y, sr)
    except:
      k.append(english + '/' + e)


In [12]:
def get_waveform(file_path):
  audio_binary = tf.io.read_file(file_path)
  waveform, _ = tf.audio.decode_wav(contents=audio_binary)
  waveform = tf.reshape(waveform, (220500))
  return waveform

def show_waveform(waveform):
  plt.plot(waveform)

In [13]:
def get_spectrogram(waveform):
  # Zero-padding for an audio waveform with less than 16,000 samples.
  input_len = 220500
  waveform = waveform[:input_len]
  zero_padding = tf.zeros(
      [220500] - tf.shape(waveform),
      dtype=tf.float32)
  # Cast the waveform tensors' dtype to float32.
  waveform = tf.cast(waveform, dtype=tf.float32)
  # Concatenate the waveform with `zero_padding`, which ensures all audio
  # clips are of the same length.
  equal_length = tf.concat([waveform, zero_padding], 0)
  # Convert the waveform to a spectrogram via a STFT.
  spectrogram = tf.signal.stft(
      equal_length, frame_length=2048, frame_step=128)
  # Obtain the magnitude of the STFT.
  spectrogram = tf.abs(spectrogram)
  # Add a `channels` dimension, so that the spectrogram can be used
  # as image-like input data with convolution layers (which expect
  # shape (`batch_size`, `height`, `width`, `channels`).
  spectrogram = spectrogram[..., tf.newaxis]
  return spectrogram

In [14]:
def create_spectrogram_for_model(directory):
  wave_array = get_waveform(directory)
  spectrogram_of_wave = np.array(get_spectrogram(wave_array))
  normalized_spectrogram = (spectrogram_of_wave - spectrogram_of_wave.min())/(spectrogram_of_wave.max() - spectrogram_of_wave.min())

  return normalized_spectrogram


In [15]:
x = create_spectrogram_for_model('/content/drive/MyDrive/audios/inputs/english_inputs/1.wav')

In [22]:
x.shape

(1707, 1025, 1)

In [42]:
#bloco de código para ler os arquivos de 3s e processá-los
x = []
y = []

portuguese_audios = os.listdir('/content/drive/MyDrive/audios/inputs/portuguese_inputs')
english_audios = os.listdir('/content/drive/MyDrive/audios/inputs/english_inputs')

for idx in range(1, min(len(portuguese_audios), len(english_audios))):
  try:
    spectrogram = create_spectrogram_for_model('/content/drive/MyDrive/audios/inputs/portuguese_inputs/' + str(idx) + '.wav')
    x.append(cv2.resize(spectrogram[:800,:800, 0], (500, 250)))
    y.append(1)
  except:
    pass

  try:
    spectrogram = create_spectrogram_for_model('/content/drive/MyDrive/audios/inputs/english_inputs/' + str(idx) + '.wav')
    x.append(cv2.resize(spectrogram[:800,:800, 0], (500, 250)))
    y.append(0)
  except:
    pass

x = np.array(x)
y = np.array(y)

print(x.shape)
print(y.shape)

(1965, 250, 500)
(1965,)


In [40]:
def division_dataset(X, y, train_ratio=0.5, val_ratio=0.23, test_ratio=0.27):
  """
  função para dividir o dataset completo em treino, validação e teste.
  """
  if train_ratio + test_ratio + val_ratio != 1: ##criar um raise aqui
    return False

  len_dataset = X.shape[0]
  train_x, train_y, val_x, val_y, test_x, test_y = [], [], [], [], [], []

  random.seed(2023)

  for i in range(X.shape[0]):
    r = random.random()
    if r <= train_ratio:
      train_x.append(X[i])
      train_y.append(y[i])
    elif r <= train_ratio + val_ratio:
      val_x.append(X[i])
      val_y.append(y[i])
    else:
      test_x.append(X[i])
      test_y.append(y[i])
    
  return np.array(train_x, dtype='float16'), np.array(train_y, dtype='float16'), np.array(val_x, dtype='float16'), np.array(val_y, dtype='float16'), np.array(test_x, dtype='float16'), np.array(test_y, dtype='float16')

In [43]:
train_x, train_y, val_x, val_y, test_x, test_y = division_dataset(x, y)

In [44]:
train_x.shape, train_y.shape, val_x.shape, val_y.shape, test_x.shape, test_y.shape

((965, 250, 500), (965,), (457, 250, 500), (457,), (543, 250, 500), (543,))

In [70]:
model = tf.keras.Sequential()

model.add(tf.keras.layers.Conv2D(input_shape=(250, 500, 1), filters=32, kernel_size=(3,3), use_bias=True, padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), use_bias=True, padding='same', activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPool2D(pool_size=(3,3)))
model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), use_bias=True, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPool2D(pool_size=(3,3)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=1024, activation=('selu'), kernel_initializer='lecun_normal'))
model.add(tf.keras.layers.Dense(1, activation=('sigmoid')))

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[tf.keras.metrics.CategoricalAccuracy(), 
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.AUC()])

In [46]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_18 (Conv2D)          (None, 250, 500, 32)      320       
                                                                 
 batch_normalization_18 (Bat  (None, 250, 500, 32)     128       
 chNormalization)                                                
                                                                 
 conv2d_19 (Conv2D)          (None, 250, 500, 64)      18496     
                                                                 
 batch_normalization_19 (Bat  (None, 250, 500, 64)     256       
 chNormalization)                                                
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 83, 166, 64)      0         
 2D)                                                             
                                                      

In [71]:
model.fit(train_x, train_y,
            batch_size=16,
            epochs=10,
            validation_data=(val_x, val_y),
            shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f49a32e62b0>

In [48]:
model.evaluate(test_x, test_y, batch_size=1)



[2.5263874530792236, 1.0, 0.0, 0.6095101833343506]

In [None]:
#full english test
k = []
english_x = []
english_y = []
counter = 0
for init in range(10, 230, 10):
    counter += 1
    y, sr = librosa.load('/content/drive/MyDrive/audios/inglês/WOJAK ASKS HIS WIFE FOR MONEY TO INVEST IN THE CRYPTO MARKET.3gpp', offset=init, duration=10.0)
    sf.write(f'file{counter}.wav', y, sr)

for i in range(1, counter):
    spectrogram = create_spectrogram_for_model(f'file{i}.wav')
    english_x.append(cv2.resize(spectrogram, (500, 250)))
    english_y.append(0)


In [None]:
#full test portuguese
k = []
portuguese_x = []
portuguese_y = []
counter = 0
for init in range(10, 230, 10):
    counter += 1
    y, sr = librosa.load('/content/drive/MyDrive/audios/português/Quanto Tempo de Fato um Programador Fica Codificando  Formação DEV cortes.3gpp', offset=init, duration=10.0)
    sf.write(f'pfile{counter}.wav', y, sr)

for i in range(1, counter):
  try:
    spectrogram = create_spectrogram_for_model(f'pfile{i}.wav')
    portuguese_x.append(cv2.resize(spectrogram, (500, 250)))
    portuguese_y.append(1)
  except:
    pass

In [64]:
english_x, english_y = np.array(english_x), np.array(english_y)
portuguese_x, portuguese_y = np.array(portuguese_x), np.array(portuguese_y)

In [66]:
model.evaluate(english_x, english_y, batch_size=1)



[0.006144034676253796, 1.0, 0.0, 0.0]

In [68]:
model.evaluate(portuguese_x, portuguese_y, batch_size=1)



[5.372665882110596, 1.0, 0.0, 0.0]