In [None]:
!pip install hugsvision
from hugsvision.dataio.VisionDataset import VisionDataset
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer
from hugsvision.nnet.VisionClassifierTrainer import VisionClassifierTrainer
from transformers import ViTFeatureExtractor, ViTForImageClassification
from hugsvision.inference.VisionClassifierInference import VisionClassifierInference
import soundfile as sf
import librosa
import librosa.display
import matplotlib.pyplot as plt


In [None]:
# Mount Google Drive at /content/drive; the dataset, checkpoint and output
# paths used in the rest of the notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Crea y entrena modelo
Solo para entrenamiento. Para realizar la inferencia sobre los audios de validación, pasar al siguiente punto.

In [None]:
# Identifier of the pretrained ViT checkpoint that is fine-tuned below.
huggingface_model = 'google/vit-base-patch16-224-in21k'

# Dataset split: training and test images are read from two separate folders.
# (An earlier variant, kept here for reference, split a single folder
# "/content/drive/MyDrive/Dataton/datos/por_carpetas/" with
# VisionDataset.fromImageFolder(test_ratio=0.2, balanced=False, augmentation=False).)
train_dir = "/content/drive/MyDrive/augmented2"
test_dir = "/content/drive/MyDrive/original2"

train, test, id2label, label2id = VisionDataset.fromImageFolders(
    train=train_dir,
    test=test_dir,
)



In [None]:
# Pretrained ViT with a classification head sized to the dataset's label set.
vit_model = ViTForImageClassification.from_pretrained(
    huggingface_model,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

# Matching image preprocessor for the same checkpoint.
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(huggingface_model)

# NOTE(review): no explicit train call follows, so constructing the trainer
# presumably launches the training run itself — confirm against hugsvision.
trainer = VisionClassifierTrainer(
    model_name="ViT_MarineFinal",
    train=train,
    test=test,
    output_dir="/content/drive/MyDrive/models/",
    max_epochs=10,
    batch_size=16,
    lr=1e-5,
    fp16=True,
    model=vit_model,
    feature_extractor=vit_feature_extractor,
)

In [None]:
# Evaluate the trained model on the test split. The two returned values are
# kept as `hyp` and `ref` (presumably predicted and reference labels — TODO
# confirm against the hugsvision documentation).
hyp, ref = trainer.evaluate_f1_score()

# Inferencia

In [None]:
# Locations of the fine-tuned model and its feature extractor on Drive.
# NOTE(review): these point under ".../MyDrive/Modelo/" whereas the training
# cell writes to ".../MyDrive/models/" — confirm the checkpoint was moved there.
model_path = "/content/drive/MyDrive/Modelo/VIT_MARINEFINAL/10_2022-10-25-00-42-10/model"
fe_path = "/content/drive/MyDrive/Modelo/VIT_MARINEFINAL/10_2022-10-25-00-42-10/feature_extractor"

# Load both artefacts, then wrap them in the inference helper used by
# prediction_for_clip.
loaded_feature_extractor = ViTFeatureExtractor.from_pretrained(fe_path)
loaded_model = ViTForImageClassification.from_pretrained(model_path)

classifier = VisionClassifierInference(
    feature_extractor=loaded_feature_extractor,
    model=loaded_model,
)


Definición de las funciones de ventanas para calcular los tiempos de inicio y fin:

In [None]:

PERIOD = 10 # Window size in seconds (increasing it is faster but less precise)
SALTO = 5 # Hop between consecutive window starts, in seconds

def genera_subclips(clip, sr, period=None, salto=None):
    """Split an audio signal into fixed-length, overlapping windows.

    A window of ``period`` seconds is taken every ``salto`` seconds. The first
    window that runs past the end of ``clip`` is zero-padded to full length and
    is the last one returned, so at least one window is always produced (an
    empty clip yields a single all-zero window).

    Args:
        clip: 1-D NumPy array of audio samples.
        sr: Sampling rate in samples per second.
        period: Window length in seconds. Defaults to the module-level PERIOD.
        salto: Hop between window starts in seconds. Defaults to the
            module-level SALTO.

    Returns:
        List of 1-D float32 arrays, each ``int(period * sr)`` samples long.

    Raises:
        ValueError: If the window or the hop is shorter than one sample.
    """
    period = PERIOD if period is None else period
    salto = SALTO if salto is None else salto

    # Slice bounds must be integers even when sr (or the durations) is a float.
    window = int(period * sr)
    hop = int(salto * sr)
    if window <= 0 or hop <= 0:
        # A zero hop would never advance and the loop below would not terminate.
        raise ValueError(
            "period and salto must each span at least one sample "
            f"(got window={window}, hop={hop} samples)"
        )

    audios = []
    start = 0
    while True:
        y_batch = clip[start:start + window].astype(np.float32)
        if len(y_batch) != window:
            # Ran past the end of the clip: pad the tail with zeros and stop.
            y_pad = np.zeros(window, dtype=np.float32)
            y_pad[:len(y_batch)] = y_batch
            audios.append(y_pad)
            break
        audios.append(y_batch)
        start += hop
    return audios


def calcula_anotaciones(estimated_event_list, sr, filename, period=None, salto=None):
    """Merge runs of identical window labels into timed annotations.

    Window ``k`` is taken to start at ``salto * k`` seconds. Each run of
    consecutive equal labels becomes one annotation, except runs labelled
    'silence', which are dropped. A run ends where the next run starts; a run
    that reaches the last window ends at that window's end
    (``salto * (n - 1) + period``).

    Args:
        estimated_event_list: Sequence with one label per window, in order.
        sr: Unused; kept so existing callers keep working.
        filename: Value stored under 'path' in every annotation.
        period: Window length in seconds. Defaults to the module-level PERIOD.
        salto: Hop between window starts in seconds. Defaults to the
            module-level SALTO.

    Returns:
        List of dicts with keys 'path', 'start', 'end' (floats, in seconds)
        and 'label'.
    """
    period = PERIOD if period is None else period
    salto = SALTO if salto is None else salto

    anotaciones = []
    len_events = len(estimated_event_list)
    i = 0
    while i < len_events:
        label = estimated_event_list[i]
        # Advance j to the first window whose label differs from this run's.
        j = i + 1
        while j < len_events and estimated_event_list[j] == label:
            j += 1
        if label != 'silence':
            if j == len_events:
                # The run includes the last window, so it spans that whole window.
                end = salto * (j - 1) + period
            else:
                end = salto * j
            anotaciones.append({'path': filename,
                                'start': float(salto * i),
                                'end': float(end),
                                'label': label})
        i = j

    return anotaciones

def _guarda_espectrograma(audio, sr, file_spectrogram):
    """Render one window's mel spectrogram to an axis-free image file."""
    fig = plt.figure(figsize=[1, 1])
    try:
        # Hide axes and frame so the saved file contains only the picture.
        ax = fig.add_subplot(111)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        # Mel spectrogram converted to decibels relative to its maximum.
        # NOTE(review): rendering settings (y_axis, dpi, figure size) are kept
        # as they were; presumably they match how the training images were
        # produced — confirm before changing any of them.
        S = librosa.feature.melspectrogram(y=audio, sr=sr)
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max), y_axis='linear')
        fig.savefig(file_spectrogram, dpi=500, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if rendering failed; figures left
        # open accumulate memory across the windows of a clip.
        plt.close(fig)


def prediction_for_clip(filename, clip: np.ndarray, sr, threshold=0.5,
                        file_spectrogram='/content/iterator.jpg'):
    """Classify every window of an audio clip and return timed annotations.

    The clip is cut into windows with genera_subclips; each window is rendered
    as a spectrogram image, classified with the module-level ``classifier``,
    and labelled 'silence' when the best score does not exceed ``threshold``.
    The per-window labels are then merged by calcula_anotaciones.

    Args:
        filename: Name stored under 'path' in every annotation.
        clip: Audio samples, either 1-D (mono) or 2-D laid out as
            (frames, channels); multi-channel audio is averaged to mono.
        sr: Sampling rate in samples per second.
        threshold: Minimum best-class score, on a 0-1 scale, required to keep
            the predicted label.
        file_spectrogram: Path of the scratch image that is overwritten for
            each window.

    Returns:
        List of annotation dicts ('path', 'start', 'end', 'label'), with
        'end' never later than the clip's real duration.
    """
    duration = len(clip) / sr
    print('Duración clip: ' + str(len(clip)/sr) + ' segundos')

    y = clip.astype(np.float32)
    if y.ndim > 1:
        # The windowing and spectrogram steps expect a 1-D signal, so average
        # the channels of multi-channel audio into one.
        y = y.mean(axis=1)

    # Cut the audio into windows.
    audios = genera_subclips(y, sr)
    print('Ventanas de ' + str(PERIOD) +' segundos')
    print('N ventanas: ' + str(len(audios)))

    estimated_event_list = []
    for audio in audios:
        _guarda_espectrograma(audio, sr, file_spectrogram)

        # Predict. The scores are divided by 100 before the comparison, i.e.
        # they are presumably percentages — TODO confirm against hugsvision.
        y_pred = classifier.predict(img_path=file_spectrogram, return_str=False)
        labels = list(y_pred.keys())
        outputs = np.array(list(y_pred.values()))
        best = outputs.argmax()
        result_label = labels[best]
        result_score = outputs[best] / 100

        if result_score > threshold:
            estimated_event_list.append(result_label)
        else:
            estimated_event_list.append('silence')

    anotaciones = calcula_anotaciones(estimated_event_list, sr, filename)
    # The final window is zero-padded, so an event reaching it would otherwise
    # be reported as ending after the end of the actual audio.
    for anotacion in anotaciones:
        anotacion['end'] = min(anotacion['end'], duration)
    return anotaciones
      
  

Calcular anotaciones en un conjunto de audios -> **indicar en `submision_audios` la carpeta en la que se encuentran los audios a anotar**

In [None]:
import os

submision_audios = '/content/drive/MyDrive/Dataton/submission'  # Folder containing the audio files to annotate

# Gather every annotation in a plain list and build the DataFrame once at the
# end; concatenating DataFrames inside the loop re-copies all previous rows on
# each iteration. If a file fails midway, the records gathered so far remain
# available in `todas_anotaciones`.
todas_anotaciones = []
# Sort the listing so that files are processed in a reproducible order.
for i, audio_name in enumerate(sorted(os.listdir(submision_audios))):
    print(f'Audio nº {i}')
    clip, sr = sf.read(os.path.join(submision_audios, audio_name))
    todas_anotaciones.extend(prediction_for_clip(audio_name, clip, sr, threshold=0.5))
    print("audio "+ audio_name + " procesado---->" + str(len(todas_anotaciones))+ "anotaciones totales")

# Fixing the columns keeps the expected header even when nothing was detected.
final_df = pd.DataFrame(todas_anotaciones, columns=['path', 'start', 'end', 'label'])

Guardamos el dataset de anotaciones

In [None]:
# Write the annotations to Drive as CSV (without the index column), then
# display the DataFrame as the cell output.
final_df.to_csv('/content/drive/MyDrive/Dataton/pred.csv',index = False)
final_df