In [20]:
#필요한 패키지 임포트
import os
import random
import shutil
import subprocess
from glob import glob

from keras.callbacks import EarlyStopping
from keras.layers import Dropout, Conv2D, MaxPool2D, Flatten, Dense
from keras.models import Sequential
from keras.utils import load_img, img_to_array
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from scipy import stats
import seaborn as sns
import tensorflow as tf

In [3]:
def ctft_to_peaks(carr : np.ndarray, threshold = 1, max_peak_num = 10):
    """Locate threshold crossings along axis 0 for every column of ``carr``.

    For each column, the sign of ``carr - threshold`` is differenced along
    axis 0. A step of +2 (below -> above) counts as a rising crossing and a
    step of -2 (above -> below) as a falling crossing; the recorded value is
    the index of the sample just before the crossing.

    Args:
        carr: 2-D array; crossings are searched along axis 0, independently
            for each column.
        threshold: Level the values are compared against. Samples exactly
            equal to it have sign 0 and therefore never produce a +/-2 step.
        max_peak_num: Maximum number of crossings kept per column.

    Returns:
        ``(rising, falling)``: two float arrays of shape
        ``(max_peak_num, carr.shape[1])``. Column ``i`` holds the first
        crossings found in column ``i`` of ``carr`` in increasing index
        order. Unused slots stay 0, so a crossing at index 0 cannot be told
        apart from an empty slot.
    """
    n_columns = carr.shape[1]
    # One row per column of carr; each entry is the change in sign between
    # two neighbouring samples of that column.
    sign_steps = np.diff(np.sign(carr.T - threshold), axis=1)

    def collect(step):
        # Per column, keep the first max_peak_num positions whose sign step
        # equals `step`.
        table = np.zeros(shape=(max_peak_num, n_columns))
        col_ids, positions = np.where(sign_steps == step)
        for col in range(sign_steps.shape[0]):
            hits = positions[col_ids == col]
            keep = min(max_peak_num, len(hits))
            table[:keep, col] = hits[:keep]
        return table

    rising = collect(2)
    falling = collect(-2)
    return (rising, falling)

In [15]:
def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=160, n_mels=128, fmin=20, fmax=8300, top_db=80, duration=1.3):
  """Load an audio file and return a fixed-length mel spectrogram in dB.

  The waveform is forced to exactly ``int(duration * sr)`` samples before
  the spectrogram is computed: shorter clips are reflect-padded on both
  sides, longer clips keep only their beginning.

  Args:
    file_path: Path of the audio file, passed to ``librosa.load``.
    sr: Target sampling rate for ``librosa.load``; ``None`` keeps the
      file's own rate.
    n_fft, hop_length, n_mels, fmin, fmax: Forwarded unchanged to
      ``librosa.feature.melspectrogram``.
    top_db: Forwarded to ``librosa.power_to_db``.
    duration: Clip length in seconds that every waveform is padded or
      truncated to (default 1.3, the value previously hard-coded).

  Returns:
    The output of ``librosa.power_to_db`` applied to the mel spectrogram.
  """
  wav, sr = librosa.load(file_path, sr=sr)
  target_len = int(duration * sr)
  if wav.shape[0] < target_len:
    # Same amount of reflect padding on each side, rounded up so the result
    # is never shorter than the target.
    pad = int(np.ceil((duration * sr - wav.shape[0]) / 2))
    wav = np.pad(wav, pad, mode='reflect')
  # Rounding the padding up can overshoot by a sample; slicing here makes
  # padded and truncated clips exactly the same length.
  wav = wav[:target_len]
  spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)
  spec_db = librosa.power_to_db(spec, top_db=top_db)
  return spec_db

In [16]:
def spec_to_image(spec, eps=1e-6):
  """Convert a spectrogram array into an 8-bit grayscale image array.

  The input is standardised (zero mean, unit variance, with ``eps`` added to
  the standard deviation to avoid dividing by zero) and then min-max scaled
  to the range 0..255.

  Args:
    spec: Numeric numpy array (any shape) holding the spectrogram.
    eps: Small constant added to the standard deviation.

  Returns:
    ``np.uint8`` array with the same shape as ``spec``. A constant input
    (every value identical, e.g. pure silence) yields an all-zero image.
  """
  mean = spec.mean()
  std = spec.std()
  spec_norm = (spec - mean) / (std + eps)
  spec_min, spec_max = spec_norm.min(), spec_norm.max()
  value_range = spec_max - spec_min
  if value_range == 0:
    # Flat input: min-max scaling would be 0/0 and produce NaNs that cast to
    # arbitrary uint8 values, so return a well-defined black image instead.
    return np.zeros(spec_norm.shape, dtype=np.uint8)
  spec_scaled = 255 * (spec_norm - spec_min) / value_range
  return spec_scaled.astype(np.uint8)

In [62]:
def iscaterwaul(filedir):
    """Classify a cat recording as ``"caterwaul"`` or ``"meow"``.

    Steps:
      1. Load the audio with librosa and build a foreground magnitude
         spectrogram using a nearest-neighbour filter and a soft mask.
      2. With ``ctft_to_peaks``, count rising threshold crossings per frame
         and find the spans of frames where the maximum count (5) is reached.
      3. For every span lasting between 0.3 s and 1.3 s, cut that span out of
         the original file with ffmpeg into a temporary ``wav`` directory.
      4. Turn every cut clip into a mel-spectrogram JPEG
         (``get_melspectrogram_db`` + ``spec_to_image``).
      5. Run the Keras model stored at ``/catclass_ai/model_fuck.h5`` on each
         image and sum the predictions.

    All intermediate files live under ``/catclass_ai/temp/<name>``, where
    ``<name>`` is the file name up to its first dot. The directory is wiped
    before use and removed again afterwards, even when a step fails.

    Args:
        filedir: Path of the audio file, using ``/`` as separator.

    Returns:
        ``"caterwaul"`` if the first component of the summed predictions is
        greater than 0, otherwise ``"meow"`` (also when no span qualified).

    Raises:
        FileNotFoundError: If a span qualifies but the ``ffmpeg`` executable
            cannot be found. A failing ffmpeg run itself is not an error; the
            span is simply skipped because no clip is produced.
    """
    clip_name = filedir.split("/")[-1].split(".")[0]
    work_dir = "/catclass_ai/temp/" + clip_name
    wav_dir = work_dir + "/wav"
    img_dir = work_dir + "/img"

    print("Loading File")
    y, sr = librosa.load(filedir)

    print("Filtering Noise")
    # n_fft=512 is reportedly recommended for speech processing; the librosa
    # default is used here.
    S_full, _ = librosa.magphase(librosa.stft(y))
    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(librosa.time_to_frames(2, sr=sr)))
    # The filtered estimate must never exceed the original magnitude.
    S_filter = np.minimum(S_full, S_filter)
    margin_v = 10
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
    S_foreground = mask_v * S_full

    max_peak_num = 5
    rising, _ = ctft_to_peaks(S_foreground, threshold=0.1, max_peak_num=max_peak_num)

    # Keep only the meaningful parts: frames in which every one of the
    # max_peak_num slots is filled (count above max_peak_num - 0.5).
    cnt_mat = (rising > 1e-5).sum(axis=0)
    thres = max_peak_num - 0.5
    steps = np.diff(np.sign(cnt_mat - thres))
    cutpoint_rising = np.where(steps == 2)[0]
    cutpoint_falling = np.where(steps == -2)[0] + 1
    if cnt_mat[0] > thres:
        # The recording starts inside a span, so that span begins at frame 0.
        cutpoint_rising = np.insert(cutpoint_rising, 0, 0)
    if len(cutpoint_rising) != len(cutpoint_falling):
        # The recording ends inside a span; drop the unterminated start.
        cutpoint_rising = cutpoint_rising[:-1]

    print("Making Dir")
    # Clear leftovers of an earlier, interrupted run before recreating the
    # working directories; a bare os.mkdir would fail if they still existed.
    shutil.rmtree(work_dir, ignore_errors=True)
    try:
        os.makedirs(wav_dir, exist_ok=True)
        os.makedirs(img_dir, exist_ok=True)

        print("Cutting File")
        for i in range(len(cutpoint_rising)):
            start = float(librosa.frames_to_time(cutpoint_rising[i], sr=sr))
            length = float(librosa.frames_to_time(cutpoint_falling[i], sr=sr)) - start
            if 1.3 >= length >= 0.3:
                # Argument list instead of a shell string: paths containing
                # spaces or shell metacharacters are passed through verbatim.
                subprocess.run(
                    ["ffmpeg", "-y", "-loglevel", "quiet",
                     "-ss", str(start), "-t", str(length),
                     "-i", str(filedir),
                     wav_dir + "/" + clip_name + "_" + str(i) + ".wav"],
                    check=False,
                )

        print("Converting to img")
        for wav_name in sorted(os.listdir(wav_dir)):
            im = Image.fromarray(spec_to_image(get_melspectrogram_db(wav_dir + "/" + wav_name)))
            im.save(img_dir + "/" + wav_name.split(".")[0] + '.jpeg')

        model = tf.keras.models.load_model('/catclass_ai/model_fuck.h5')

        print("Predicting")
        # Running sum of the per-image model outputs.
        score = np.zeros((1, 2))
        for img_name in sorted(os.listdir(img_dir)):
            # target_size presumably matches the model's input size - TODO confirm.
            img1 = load_img(img_dir + "/" + img_name, target_size=(128, 131))
            X = np.expand_dims(img_to_array(img1), axis=0)
            score = score + model.predict(X, verbose=0)
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    # NOTE(review): any positive accumulated value in the first output counts
    # as "caterwaul". If the model emits softmax probabilities this is true
    # whenever at least one clip was scored - confirm whether a comparison
    # against score[0][1] was intended.
    if score[0][0] > 0:
        return "caterwaul"
    return "meow"

        


In [65]:
# Classify one sample recording; the returned label is shown as the cell output.
iscaterwaul("/catclass_ai/test7.wav")

Loading File
Filtering Noise
Making Dir
Cutting File
Converting to img
Predicting


'caterwaul'

In [13]:
# Scratch cleanup: remove the empty directory left behind by a manual
# os.mkdir() experiment. Guarded so the cell is a no-op when the directory is
# absent (e.g. on Restart Kernel -> Run All) instead of raising
# FileNotFoundError.
if os.path.isdir("/catclass_ai/temp/asdf"):
    os.rmdir("/catclass_ai/temp/asdf")

In [12]:
# Scratch cell: build the per-file temp directory name (file name up to its
# first dot) and create it. makedirs(exist_ok=True) keeps the cell safe to
# re-run; a bare os.mkdir raises FileExistsError the second time.
filedir = '/catclass_ai/asdf.wav'
os.makedirs("/catclass_ai/temp/"+filedir.split("/")[-1].split(".")[0], exist_ok=True)