In [38]:
import numpy as np
import librosa
import os
import csv
import tensorflow as tf
import keras
import joblib
from sklearn.preprocessing import LabelEncoder
model = keras.models.load_model("acoustic_classification.keras")
#model = tf.saved_model.load("acoustic_classification5")

In [39]:
model.summary()

In [40]:
target_sample_rate = 40000

def resample_audio(file_path, target_sr=target_sample_rate):
    # Загрузка аудиофайла с его исходной частотой дискретизации
    y, sr = librosa.load(file_path, sr=None)
    
    # Если исходная частота не совпадает с целевой, пересэмплируем
    if sr != target_sr:
        print(f"Файл {file_path} имеет частоту {sr} Гц, пересэмплируем до {target_sr} Гц.")
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    else:
        print(f"Файл {file_path} уже имеет целевую частоту {target_sr} Гц.")
    
    return y, target_sr

In [50]:
scaler = joblib.load("scaler")
encoder = LabelEncoder()
marine_mammals = "AtlanticSpottedDolphin BeardedSeal Beluga_WhiteWhale BottlenoseDolphin BowheadWhale ClymeneDolphin \
        CommonDolphin FalseKillerWhale Fin_FinbackWhale Fraser'sDolphin \
        HumpbackWhale KillerWhale MelonHeadedWhale NorthernRightWhale \
        PantropicalSpottedDolphin Short_Finned(Pacific)PilotWhale SpermWhale".split()
y = encoder.fit_transform(marine_mammals)
def classify_audio(audio_path, window_size=2.5, hop_size=1, sample_rate=40000, confidence_threshold=0.3):
    # Загрузка аудио
    audio, sr = resample_audio(audio_path)
    
    window_samples = int(window_size * sample_rate)
    hop_samples = int(hop_size * sample_rate)
    
    segments = []
    
    # Применяем скользящее окно по всей длине записи
    for start in np.arange(0, len(audio) - window_samples + 1, hop_samples):
        segment = audio[int(start):int(start) + window_samples]
        
        #y, sr = librosa.load(segment, mono = True, duration = 30)
        chroma_stft = librosa.feature.chroma_stft(y = segment, sr = sr)
        rmse = librosa.feature.rms(y = segment)
        spec_cent = librosa.feature.spectral_centroid(y = segment, sr = sr)
        spec_bw = librosa.feature.spectral_bandwidth(y = segment, sr = sr)
        rolloff = librosa.feature.spectral_rolloff(y = segment, sr = sr)
        zcr = librosa.feature.zero_crossing_rate(segment)
        mfcc = librosa.feature.mfcc(y = segment, sr = sr)

        features = [
        np.mean(chroma_stft), 
        np.mean(rmse), 
        np.mean(spec_cent), 
        np.mean(spec_bw), 
        np.mean(rolloff), 
        np.mean(zcr)
        ]
        # Добавляем средние значения MFCC ко всем признакам
        features.extend([np.mean(e) for e in mfcc])
        #X_test = scaler.fit_transform(np.array(features).reshape(1, -1))
        X_test = scaler.transform(np.array(features).reshape(1, -1))
        # Получаем предсказание от модели
        predict = model.predict(X_test)
        print(predict)
        classes = np.argmax(predict, axis = 1)
        print(classes)
        predicted_class = encoder.inverse_transform(classes)
        confidence = np.max(predict, axis=1)
        print(confidence)
        print(predicted_class)
        # Если уверенность ниже порога, классифицируем как "шум/тишину"
        if confidence < confidence_threshold:
           predicted_class = "noise/silence"
        
        # Сохраняем информацию о сегменте
        start_time = start / sample_rate
        end_time = (start + window_samples) / sample_rate
        segments.append((start_time, end_time, predicted_class, confidence))
    return segments

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [52]:
def combine_segments(segments):
    combined_segments = []
    min_gap = 0.5
    
    for start, end, cls, conf in segments:
        if not combined_segments:
            combined_segments.append([start, end, cls, conf])
        else:
            last_segment = combined_segments[-1]
            last_end_time, last_class = last_segment[1], last_segment[2]
            # Проверяем, нужно ли объединить
            if last_class == cls and (start - last_end_time) <= min_gap:
                # Объединяем интервалы
                combined_segments[-1][1] = end  # Обновляем конец последнего интервала
            else:
                combined_segments.append([start, end, cls, conf])
    return combined_segments

In [53]:
file_path = "240826-013_K24.wav" #Ссылочку на аудио сюда
with open("prediction_results.txt", "w") as file:
    print("", file=file)
segments = classify_audio(file_path)
combine_segments=combine_segments(segments)
with open("prediction_results.txt", "w") as file:
    for start, end, cls, conf in combine_segments:
        #print(f"Time: {start:.2f} - {end:.2f} s | Class: {cls} | Confidence: {conf}")
        result_line = f"Time: {start:.2f} - {end:.2f} s | Class: {cls} | Confidence: {conf}\n"
        # Записываем строку в файл
        file.write(result_line)

Файл 04081350.wav имеет частоту 48000 Гц, пересэмплируем до 40000 Гц.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[[5.3663345e-14 1.3925291e-17 1.7405380e-16 3.7911085e-18 3.5189340e-13
  8.2758335e-19 5.9985398e-12 1.5413597e-10 1.2089172e-19 1.3296481e-13
  6.0730289e-18 1.0666399e-15 5.6643102e-12 1.2522919e-14 1.7526024e-18
  3.1057090e-09 9.9140376e-01 2.5941448e-17 6.3366275e-16 5.0369041e-12
  2.5977563e-19 1.4544672e-17 1.9747863e-17 6.2267996e-19 4.9337383e-12
  4.0540276e-15 2.6877901e-15 2.7964742e-08 3.9765523e-17 6.3564856e-14
  3.7932429e-10 3.5122377e-04 1.8673880e-17 1.7934881e-16 2.0119138e-19
  1.1116450e-17 5.6213455e-08 1.7845667e-18 1.7851143e-15 2.6477255e-15
  1.8032678e-12 1.7836818e-18 3.3716897e-06 1.8129059e-17 8.2415128e-03]]
[16]
[0.99140376]
["Grampus_Risso'sDolphin"]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[[5.97370935e-14 9.53298690e-18 1.03397960e-16 5.31390710e-18
  4.05795975e-13 1.23831316e-