In [None]:
import ast
import os
import random
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import librosa
import librosa.display
import IPython.display as ipd
import albumentations as A
import albumentations.pytorch.transforms as T
import timm

from torchlibrosa.augmentation import SpecAugmentation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torchvision import models
from torchaudio import transforms

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Folder that holds the CSV files read below; defined once so the location
# only has to be changed in a single place.
DATA_DIR = Path("C:/UCI MDS Spring Quarter 2022/CS 274P/Final Project")

meta = pd.read_csv(DATA_DIR / "train_metadata.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
taxonomy = pd.read_csv(DATA_DIR / "eBird_Taxonomy_v2021.csv")

In [None]:
## seeding

# Output directory; created if it does not already exist.
# `exist_ok=True` avoids the check-then-create race of a separate os.path.exists() test.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)
   

### Seeding purposes    
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, the
            ``PYTHONHASHSEED`` environment variable, NumPy and PyTorch.
    """
    random.seed(seed)                         # Python's built-in RNG
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)                      # NumPy's global RNG
    torch.manual_seed(seed)                   # PyTorch RNGs

    # Reproducibility requires deterministic cuDNN kernels and a disabled
    # auto-tuner: with benchmark=True cuDNN may pick a different algorithm on
    # every run, which makes results vary between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# NOTE(review): in the visible notebook `config` is only defined in a later cell,
# so on a fresh kernel (Restart & Run All) this call raises NameError unless the
# config cell has been executed first -- consider moving the config cell above this one.
set_seed(config.seed)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero the bias when one exists.

    Works for any layer exposing a ``weight`` tensor (e.g. Linear or
    convolutional layers); a missing or ``None`` bias is left untouched.
    """
    bias = getattr(layer, 'bias', None)
    nn.init.xavier_uniform_(layer.weight)
    if bias is not None:
        bias.data.fill_(0.)

# batchnorm layer
def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: weight to 1, bias to 0."""
    for param, value in ((bn.weight, 1.), (bn.bias, 0.)):
        param.data.fill_(value)


def init_weights(model):
    """Initialise a single module in place, chosen by its class name.

    * ``Conv2d``    -- Xavier-uniform weights (gain sqrt(2)), zero bias.
    * ``BatchNorm`` -- weights ~ N(1, 0.02), zero bias.
    * ``GRU``       -- orthogonal init for every parameter with more than one dimension.
    * ``Linear``    -- weights ~ N(0, 0.01), zero bias.

    Modules of any other class are left unchanged. Layers created without a
    bias (or batch norm without affine parameters) are handled: the missing
    tensors are simply skipped.

    Args:
        model: The ``nn.Module`` to initialise.
    """
    classname = model.__class__.__name__
    if classname.find("Conv2d") != -1:
        nn.init.xavier_uniform_(model.weight, gain=np.sqrt(2))
        if model.bias is not None:
            model.bias.data.fill_(0)
    elif classname.find("BatchNorm") != -1:
        # weight/bias are None when the layer was built with affine=False.
        if model.weight is not None:
            model.weight.data.normal_(1.0, 0.02)
        if model.bias is not None:
            model.bias.data.fill_(0)
    elif classname.find("GRU") != -1:
        for weight in model.parameters():
            # Only matrices can be made orthogonal; 1-D bias vectors are skipped.
            if len(weight.size()) > 1:
                nn.init.orthogonal_(weight.data)
    elif classname.find("Linear") != -1:
        model.weight.data.normal_(0, 0.01)
        if model.bias is not None:
            model.bias.data.zero_()
        
def do_mixup(x, mixup_lambda):
    """Blend each even-indexed sample of ``x`` with the odd-indexed sample after it.

    Sample ``2k`` is weighted by ``mixup_lambda[2k]`` and sample ``2k + 1`` by
    ``mixup_lambda[2k + 1]``; the result has half as many samples as ``x``.
    """
    # Move the batch axis last so the per-sample weights broadcast over it.
    even_part = x[0::2].transpose(0, -1) * mixup_lambda[0::2]
    odd_part = x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    mixed = even_part + odd_part
    return mixed.transpose(0, -1)

def interpolate(x, ratio):
    """Upsample ``x`` along its second axis by repeating every step ``ratio`` times.

    Used to undo the loss of temporal resolution caused by CNN downsampling.
    ``x`` has shape (batch_size, time_steps, classes_num); the result has shape
    (batch_size, time_steps * ratio, classes_num).
    """
    n_batch, n_steps, n_classes = x.shape
    repeated = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return repeated.reshape(n_batch, n_steps * ratio, n_classes)

def pad_framewise_output(framewise_output, frames_num):
    """Extend ``framewise_output`` to ``frames_num`` frames by repeating its last frame.

    Returns a tensor of shape (batch_size, frames_num, classes_num).
    """
    missing = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1:, :]
    # Padding tensor: the final frame copied once per missing frame.
    tail = last_frame.repeat(1, missing, 1)
    return torch.cat((framewise_output, tail), dim=1)


# attention block for deep multiple instance learning
class AttBlock(nn.Module):
    """Attention-pooling head used for multiple-instance learning over time.

    Two parallel 1x1 ``Conv1d`` layers read the same input: ``att`` produces
    attention scores that are softmax-normalised over the time axis, and
    ``cla`` produces per-step class scores. The clip-level output is the
    attention-weighted sum of the class scores over time.

    Args:
        n_in: Number of input channels.
        n_out: Number of output channels (classes).
        activation: ``'linear'`` (class scores used as-is) or ``'sigmoid'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    _ACTIVATIONS = ('linear', 'sigmoid')

    def __init__(self, n_in, n_out, activation='linear'):
        super(AttBlock, self).__init__()

        # Fail fast: an unknown name would otherwise only surface during forward().
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {self._ACTIVATIONS}")

        self.activation = activation
        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)

        self.init_weights()

    def init_weights(self):
        """Initialise both 1x1 convolutions with ``init_layer``."""
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Pool ``x`` of shape (n_samples, n_in, n_time) over time.

        Returns:
            A tuple ``(pooled, norm_att, cla)``: ``pooled`` is the
            attention-weighted sum over time with shape (n_samples, n_out);
            ``norm_att`` and ``cla`` keep the time axis.
        """
        # Scores are clamped to [-10, 10] before the softmax over the time axis.
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to the class scores."""
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        # Previously fell through and returned None, which produced a confusing
        # TypeError later in forward().
        raise ValueError(
            f"Unsupported activation {self.activation!r}; expected one of {self._ACTIVATIONS}")

In [None]:
## Fine tuning a pretrained model.

class BirdCLEFPretrain(nn.Module):
    """Sound-event-detection model: a timm EfficientNet encoder followed by an attention head.

    The forward pass normalises the input over the mel axis, optionally applies
    SpecAugment and mixup (training only), encodes the spectrogram with the
    backbone, and pools the resulting sequence with ``AttBlock`` to obtain
    clip-level and frame-level predictions.

    Args:
        mel_bins: Size of the mel axis of the input; used as the channel count
            of the input batch-norm layer.
        pretrained: Whether timm should load pretrained backbone weights.
        num_classes: Number of output classes.
        in_channels: Number of input image channels passed to the backbone.
    """

    def __init__(self, mel_bins: int, pretrained: bool, num_classes : int, in_channels= 1):
        # TODO(author): open question kept from the original notebook --
        # why is num_classes = 24 in EX005?
        super().__init__()

        # Honour the caller's `pretrained` flag. It used to be hard-coded to True,
        # which forced a weight download even when a checkpoint is loaded afterwards.
        base_model = timm.create_model(
            "tf_efficientnet_b0_ns", pretrained=pretrained, in_chans=in_channels)

        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(mel_bins)

        # Keep everything except the backbone's last two child modules
        # (presumably its global pooling and classifier -- confirm against timm).
        layers = list(base_model.children())[:-2]
        self.encoder = nn.Sequential(*layers)

        # Width of the backbone's feature vector, read from its final linear layer.
        if hasattr(base_model, "fc"):
            in_features = base_model.fc.in_features
        else:
            in_features = base_model.classifier.in_features

        self.fc1 = nn.Linear(in_features, in_features, bias=True)

        self.att_block = AttBlock(
            in_features, num_classes, activation='sigmoid'
        )

        self.init_weight()

    def init_weight(self):
        """Initialise the layers that do not belong to the backbone."""
        init_bn(self.bn0)
        init_layer(self.fc1)

    def preprocess(self, input, mixup_lambda=None):
        """Normalise and (in training mode) augment a batch of spectrograms.

        Args:
            input: Tensor of shape (batch_size, channels, time_steps, mel_bins).
            mixup_lambda: Optional per-sample mixup weights; ignored outside
                training. The default is ``None`` rather than a value read from
                ``config`` so the class can be defined before ``config`` exists.

        Returns:
            ``(x, frames_num)`` where ``x`` has its last two axes swapped and
            ``frames_num`` is the size of the input's axis 2.
        """
        x = input
        frames_num = x.shape[2]

        # BatchNorm2d normalises over axis 1, so move the mel axis there and back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        # Swap the last two axes: (batch, channels, time_steps, mel_bins)
        # -> (batch, channels, mel_bins, time_steps).
        x = x.transpose(2, 3)

        return x, frames_num

    def forward(self, input, mixup_lambda=None):
        """Run the model.

        Returns:
            A dict with keys ``'framewise_output'``, ``'clipwise_output'``,
            ``'logit'`` and ``'framewise_logit'``.
        """
        x, frames_num = self.preprocess(input, mixup_lambda=mixup_lambda)

        x = self.encoder(x)

        # NOTE(review): after the transpose in preprocess() the last axis derives
        # from the time axis, so this mean collapses time and the remaining
        # sequence axis derives from the mel bins -- confirm this is intended.
        # Left unchanged because trained checkpoints depend on this computation.
        x = torch.mean(x, dim=3)

        # Smooth along the remaining sequence axis with max + average pooling.
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)          # channels last, as nn.Linear expects
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)          # back to channels first for Conv1d
        x = F.dropout(x, p=0.5, training=self.training)

        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)

        # Raw (pre-sigmoid) class scores, computed once and reused for both the
        # clip-level logit and the segment-level logits.
        raw_scores = self.att_block.cla(x)
        logit = torch.sum(norm_att * raw_scores, dim=2)
        segmentwise_logit = raw_scores.transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        # Stretch the segment-level predictions back to the input frame count.
        interpolate_ratio = frames_num // segmentwise_output.size(1)

        framewise_output = interpolate(segmentwise_output, interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            'framewise_output': framewise_output,
            'clipwise_output': clipwise_output,
            'logit': logit,
            'framewise_logit': framewise_logit
        }

        return output_dict

In [None]:
# Mel Spec Image Transforms
# Per-channel normalisation statistics, in RGB order.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Training pipeline: random flip, then (half of the time) one of two
# dropout-style augmentations, then normalisation.
_train_steps = [
    A.HorizontalFlip(p=0.5),
    A.OneOf(
        [
            A.Cutout(max_h_size=5, max_w_size=16),
            A.CoarseDropout(max_holes=4),
        ],
        p=0.5,
    ),
    A.Normalize(mean, std),
]

# Validation / inference pipeline: normalisation only.
_valid_steps = [A.Normalize(mean, std)]

spec_transforms = {
    'train': A.Compose(_train_steps),
    'valid': A.Compose(_valid_steps),
}

In [None]:
class config:
    """Static settings shared by the data pipeline, the model and inference."""

    #####################
    # Global #
    #####################
    EXP_NO = 'EXP001'
    seed = 1
    epochs = 5
    base_dir = 'C:/UCI MDS Spring Quarter 2022/CS 274P/Final Project/train_audio'
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'

    #####################
    # Dataset #
    #####################
    duration = 5
    n_mels = 224
    sample_rate = 32000
    fmin = 500          # Min frequency; most birds vocalize between 500Hz and 12.5kHz
    fmax = 12500        # Max frequency
    n_fft = 2048        # length of fft window
    hop_length = 512    # hop between successive fft windows

    #####################
    # Dataloader Params
    #####################
    loader_params = {
        "train": {
            "batch_size": 16, #64
            "num_workers": 0,
            "shuffle": True
        },
        "valid": {
            "batch_size": 32, #128
            "num_workers": 0,
            "shuffle": False
        }
    }

    #####################
    # Model #
    #####################
    base_model_name = "tf_efficientnet_b0_ns" # We should see the different types of models
    pooling = "max" # not sure why we need to set this yet
    # Class names are the entry names found in base_dir. os.listdir returns them
    # in arbitrary order, so they are sorted to keep the index -> label mapping
    # identical on every machine and filesystem.
    # NOTE(review): this order must match the label order used when the
    # checkpoint was trained -- confirm against the training notebook.
    classes = sorted(os.listdir(base_dir))
    num_classes = 152
    EARLY_STOPPING = True
    EVALUATION = 'AUC'
    pretrained = False
    in_channels = 3
    folds = [0] # [0,1,2,3,4]
    num_folds = 5
    LR = 1e-3
    # Tied to n_mels so the model's input batch norm always matches the spectrogram height.
    mel_bins = n_mels
    mixup_lambda = None

    img_size = 224


In [None]:
# Grab Mel Spectrograms
def get_mel_spects(y):
    """Return the dB-scaled mel spectrogram of waveform ``y`` as float32.

    All spectrogram parameters (sample rate, mel count, frequency range, FFT
    size and hop length) are read from ``config``.
    """
    mel_kwargs = dict(
        sr=config.sample_rate,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax,
        n_fft=config.n_fft,
        hop_length=config.hop_length,
    )
    power_spec = librosa.feature.melspectrogram(y=y, **mel_kwargs)
    return librosa.power_to_db(power_spec).astype(np.float32)

# Converts to RGB inputs for Pretrained model
def mono_to_color(X, eps=1e-6):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is copied into three identical channels along a new last axis,
    standardised with its global mean/std, and then min-max scaled to
    [0, 255]. When the value range is not larger than ``eps`` an all-zero
    image is returned.
    """
    rgb = np.stack([X] * 3, axis=-1)

    # Standardise with global statistics.
    rgb = (rgb - rgb.mean()) / (rgb.std() + eps)

    # Rescale to [0, 255].
    lo, hi = rgb.min(), rgb.max()
    span = hi - lo
    if not span > eps:
        return np.zeros_like(rgb, dtype=np.uint8)

    scaled = 255 * (rgb - lo) / span
    return scaled.astype(np.uint8)

In [None]:
class TestDataset(Dataset):
    """Dataset yielding one normalised mel-spectrogram image per row of ``df``.

    Args:
        df: Frame with a ``row_id`` column and a ``seconds`` column holding the
            end time (in seconds) of each window.
        clip: 1-D waveform of the recording the rows refer to.
    """

    def __init__(self, df: pd.DataFrame, clip: np.ndarray):
        self.df = df
        # The waveform is stored three times back to back, so a window that runs
        # past the end of the recording continues into the next copy instead of
        # coming back short.
        self.clip = np.concatenate([clip, clip, clip])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return ``{"image": ..., "row_id": ...}`` for the ``idx``-th row."""
        sr = config.sample_rate
        # Positional lookup: works regardless of the frame's index labels.
        sample = self.df.iloc[idx]
        row_id = sample.row_id

        # Each row covers the `config.duration` seconds that end at `seconds`.
        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - config.duration)

        image = self.clip[sr * start_seconds:sr * end_seconds].astype(np.float32)
        image = np.nan_to_num(image)

        image = get_mel_spects(image)
        image = mono_to_color(image)   # already uint8, as the transforms expect

        # Reverse the axis order of the transformed image (channel axis first).
        image = spec_transforms['valid'](image=image)['image'].T

        return {
            "image": image,
            "row_id": row_id,
        }

In [None]:
model_paths = ['C:/UCI MDS Spring Quarter 2022/CS 274P/Final Project/Saved_Models/model.pth']

# NOTE: this rebinds the name `models` (also imported from torchvision at the
# top of the notebook); later cells read this list through the global name.
models = []

for p in model_paths:
    model = BirdCLEFPretrain(
        mel_bins = config.mel_bins,
        pretrained = config.pretrained,
        num_classes = config.num_classes,
        in_channels = config.in_channels)

    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(p, map_location=device))
    # Inputs are moved to `device` at prediction time, so the model must live there too.
    model.to(device)
    model.eval()
    models.append(model)

In [None]:
# Sample rate (Hz) targeted for the test soundscapes.
TARGET_SR = 32000
# Folder containing the test soundscape recordings.
TESTDIR = Path("C:/UCI MDS Spring Quarter 2022/CS 274P/Final Project/test_soundscapes/")

In [None]:
# glob() yields files in arbitrary order; sort so soundscapes are processed in
# the same order on every run.
all_audios = sorted(TESTDIR.glob("*.ogg"))
sample_submission = pd.read_csv("C:/UCI MDS Spring Quarter 2022/CS 274P/Final Project/sample_submission.csv")
# Preview only the first rows instead of dumping the whole frame.
sample_submission.head()

In [None]:
def prediction_for_clip(test_df: pd.DataFrame,
                        clip: np.ndarray,
                        models,
                        threshold=0.05,
                        threshold_long=None):
    """Predict the classes present in every window of one recording.

    Args:
        test_df: Frame with ``row_id`` and ``seconds`` columns, one row per window.
        clip: 1-D waveform of the recording.
        models: Iterable of models whose ``'clipwise_output'`` is averaged.
        threshold: A class is reported when its mean probability is >= this value.
        threshold_long: Must be ``None``; the long-window vote it used to
            control is not available (see Raises).

    Returns:
        Dict mapping ``str(row_id)`` to ``"nocall"`` or to the space-separated
        names (from ``config.classes``) of the detected classes.

    Raises:
        NotImplementedError: If ``threshold_long`` is not ``None``. No
            long-window probabilities are produced by the models, so the
            combined vote cannot be computed (it previously crashed with an
            AttributeError on an empty list).
    """
    if threshold_long is not None:
        raise NotImplementedError(
            "threshold_long is not supported: the models produce no long-window predictions")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision only makes sense on CUDA.
    use_amp = device.type == "cuda"

    # Inputs are moved to `device` below, so make sure every model is there
    # too and in inference mode.
    for model in models:
        model.to(device)
        model.eval()

    dataset = TestDataset(df=test_df, clip=clip)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    prediction_dict = {}
    for data in tqdm(loader):
        row_id = data['row_id'][0]
        image = data['image'].to(device)

        with torch.no_grad():
            probas = []
            for model in models:
                with torch.cuda.amp.autocast(enabled=use_amp):
                    output = model(image)
                probas.append(output['clipwise_output'].detach().cpu().numpy().reshape(-1))
            probas = np.array(probas)

        # Average over models, then keep every class that reaches the threshold.
        events = probas.mean(0) >= threshold
        labels = np.argwhere(events).reshape(-1).tolist()

        if len(labels) == 0:
            prediction_dict[str(row_id)] = "nocall"
        else:
            prediction_dict[str(row_id)] = " ".join(config.classes[x] for x in labels)
    return prediction_dict

In [None]:
def prediction(test_audios,
               threshold=0.05,
               threshold_long=None):
    """Run ``prediction_for_clip`` on every recording and merge the results.

    Args:
        test_audios: Iterable of ``pathlib.Path`` objects pointing at audio files.
        threshold: Forwarded to ``prediction_for_clip``.
        threshold_long: Forwarded to ``prediction_for_clip``.

    Returns:
        Dict mapping each row id (``<file name without extension>_<end second>``)
        to its predicted label string. Empty when ``test_audios`` is empty.

    Uses the notebook-level ``models`` list.
    """
    clip_seconds = 60   # windows are generated for the first 60 seconds of each file

    prediction_dicts = {}
    # Silence library warnings only for the duration of this call instead of
    # changing the warning filter for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for audio_path in test_audios:
            # Load as mono at the sample rate the rest of the pipeline assumes;
            # librosa's default would resample to 22050 Hz.
            clip, _ = librosa.load(audio_path, sr=config.sample_rate, mono=True)

            file_stem = "_".join(audio_path.name.split(".")[:-1])
            seconds = list(range(config.duration, clip_seconds + 1, config.duration))
            row_ids = [f"{file_stem}_{second}" for second in seconds]

            test_df = pd.DataFrame({
                "row_id": row_ids,
                "seconds": seconds
            })
            prediction_dict = prediction_for_clip(test_df,
                                                  clip=clip,
                                                  models=models,
                                                  threshold=threshold,
                                                  threshold_long=threshold_long)
            prediction_dicts.update(prediction_dict)
    return prediction_dicts

In [None]:
threshold = 0.05
threshold_long = None # 0.05

prediction_dicts = prediction(test_audios=all_audios,
                              threshold=threshold,
                              threshold_long=threshold_long)
print(f"Predicted {len(prediction_dicts)} segments")

# A submission row_id is split on "_" into four parts; parts 0, 1 and 3 form
# the key used in `prediction_dicts`, and part 2 is the bird being asked about.
for i, sample in enumerate(sample_submission.row_id):
    parts = sample.split("_")
    key = "_".join((parts[0], parts[1], parts[3]))
    target_bird = parts[2]
    if key in prediction_dicts:
        # Compare against whole labels: a plain substring test on the joined
        # string would also match a code contained inside a longer code.
        predicted_birds = prediction_dicts[key].split()
        sample_submission.iat[i, 1] = target_bird in predicted_birds
sample_submission.to_csv("submission.csv", index=False)