In [1]:
torch.cuda.is_available()

NameError: name 'torch' is not defined

In [5]:
from pathlib import Path
import yaml

In [6]:
import numpy as np

In [7]:
import pandas as pd

In [8]:
sample_submission = pd.read_csv(
    f"../../data/input/sample_submission.csv"
)

In [9]:
DATADIR = Path("../../data/input/test_soundscapes/")
all_audios = list(DATADIR.glob("*.ogg"))
ogg_name_list = [ogg_name.name for ogg_name in all_audios]

In [10]:
import torch
import torchaudio
import torchvision

class Bird2023TestDataset(torch.utils.data.Dataset):
    def __init__(self, cfg, ogg_name_list):
        self.cfg = cfg
        self.ogg_name_list = ogg_name_list
        self.audio_length = cfg["audio"]["sample_rate"] * self.cfg["audio"]["duration"]
        self.step = cfg["audio"]["sample_rate"] * 5

        """
        # Create a zero tensor of shape (len(X), num_columns_X)
        self.y = torch.zeros(
            (len(self.df), len(self.df.columns)), dtype=torch.float32
        )
        """

        # Define a normalization transformation for the data
        self.normalize = torchvision.transforms.Compose(
            [
                torchvision.transforms.Normalize(
                    mean=self.cfg["model"]["mean"], std=self.cfg["model"]["std"]
                )
            ]
        )

    def __len__(self):
        return len(self.ogg_name_list)

    def min_max_0_1(self, x):
        return (x - x.min()) / (x.max() - x.min())

    def audio_to_mel_specgram(self, audio):
        # Compute the mel spectrogram of the waveform
        mel_specgram = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.cfg["audio"]["sample_rate"],
            n_fft=self.cfg["mel_specgram"]["n_fft"],
            win_length=self.cfg["mel_specgram"]["win_length"],
            hop_length=self.cfg["mel_specgram"]["hop_length"],
            n_mels=self.cfg["mel_specgram"]["n_mels"],
        )(audio)
        # Convert the mel spectrogram to a decibel scale
        mel_specgram = torchaudio.transforms.AmplitudeToDB()(mel_specgram)

        # Expand to n channels
        if self.cfg["model"]["in_chans"] > 1:
            mel_specgram = mel_specgram.repeat(self.cfg["model"]["in_chans"], 1, 1)

        # Apply min-max normalization to scale values between 0 and 1
        mel_specgram = self.min_max_0_1(mel_specgram)
        # Apply z-normalization to scale values to have zero mean and unit variance
        mel_specgram = self.normalize(mel_specgram)

        return mel_specgram

    def __getitem__(self, index):
        # Retrieve the filename of the audio file at the given index
        file_path = self.ogg_name_list[index]
        # Load the audio waveform and its sample rate from the file path
        waveform, sample_rate = torchaudio.load(
            "../" + self.cfg["general"]["input_path"] + "/test_soundscapes/" + file_path
        )

        # Resample the waveform if the sample rate is not equal to the target sample rate
        if sample_rate != self.cfg["audio"]["sample_rate"]:
            waveform = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=self.cfg["audio"]["sample_rate"]
            )

        waveforms = []
        for i in range(self.audio_length, waveform.shape[1] + self.step, self.step):
            start = max(0, i - self.audio_length)
            end = start + self.audio_length
            waveforms.append(waveform[:, start:end])

        if waveforms[-1].shape[1] < self.audio_length:
            waveforms = waveforms[:, :-1]

        mel_specgrams = [self.audio_to_mel_specgram(waveform) for waveform in waveforms]
        mel_specgrams = torch.stack(mel_specgrams)

        return mel_specgrams

In [11]:
with open("/Users/moritake/data_science/kaggle/birdclef-2023/src/configs/debug.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

In [12]:
dataset = Bird2023TestDataset(cfg, ogg_name_list)

In [13]:
batch_size=32

In [14]:
sub_df = pd.DataFrame(columns=sample_submission.columns)

In [15]:
for i, data in enumerate(dataset):
    preds = []
    ogg_name = ogg_name_list[i][:-4]
    for start in range(0, len(data), batch_size):
        #preds.append(data[0][start:start+batch_size])
        preds.append(torch.zeros([len(data[start:start+batch_size]), 264]))
    preds = torch.cat(preds)
    row_ids = [f'{ogg_name}_{(i+1)*5}' for i in range(len(preds))]
    df = pd.DataFrame(columns=sample_submission.columns)
    df["row_id"] = row_ids
    df[df.columns[1:]] = preds
    sub_df = pd.concat([sub_df,df]).reset_index(drop=True)

In [16]:
sub_df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,soundscape_29201_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,soundscape_29201_15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,soundscape_29201_20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,soundscape_29201_25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,soundscape_29201_copy_580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236,soundscape_29201_copy_585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237,soundscape_29201_copy_590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,soundscape_29201_copy_595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
preds.shape

torch.Size([120, 3, 128, 313])

In [96]:
preds = torch.cat(preds)

In [97]:
len(preds)

240

In [98]:
row_ids = [f"{'a'}_{(i+1)*5}" for i in range(len(preds))]

In [94]:
row_ids

['a_5',
 'a_10',
 'a_15',
 'a_20',
 'a_25',
 'a_30',
 'a_35',
 'a_40',
 'a_45',
 'a_50',
 'a_55',
 'a_60',
 'a_65',
 'a_70',
 'a_75',
 'a_80',
 'a_85',
 'a_90',
 'a_95',
 'a_100',
 'a_105',
 'a_110',
 'a_115',
 'a_120',
 'a_125',
 'a_130',
 'a_135',
 'a_140',
 'a_145',
 'a_150',
 'a_155',
 'a_160',
 'a_165',
 'a_170',
 'a_175',
 'a_180',
 'a_185',
 'a_190',
 'a_195',
 'a_200',
 'a_205',
 'a_210',
 'a_215',
 'a_220',
 'a_225',
 'a_230',
 'a_235',
 'a_240',
 'a_245',
 'a_250',
 'a_255',
 'a_260',
 'a_265',
 'a_270',
 'a_275',
 'a_280',
 'a_285',
 'a_290',
 'a_295',
 'a_300',
 'a_305',
 'a_310',
 'a_315',
 'a_320',
 'a_325',
 'a_330',
 'a_335',
 'a_340',
 'a_345',
 'a_350',
 'a_355',
 'a_360',
 'a_365',
 'a_370',
 'a_375',
 'a_380',
 'a_385',
 'a_390',
 'a_395',
 'a_400',
 'a_405',
 'a_410',
 'a_415',
 'a_420',
 'a_425',
 'a_430',
 'a_435',
 'a_440',
 'a_445',
 'a_450',
 'a_455',
 'a_460',
 'a_465',
 'a_470',
 'a_475',
 'a_480',
 'a_485',
 'a_490',
 'a_495',
 'a_500',
 'a_505',
 'a_510',


In [80]:
sample_submission.columns

Index(['row_id', 'abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1',
       'afecuc1', 'affeag1', 'afgfly1', 'afghor1',
       ...
       'yebsto1', 'yeccan1', 'yefcan', 'yelbis1', 'yenspu1', 'yertin1',
       'yesbar1', 'yespet1', 'yetgre1', 'yewgre1'],
      dtype='object', length=265)

In [84]:
df = pd.DataFrame(columns=sample_submission.columns)
df["row_id"] = row_ids

In [85]:
df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,,,,,,,,,,...,,,,,,,,,,
1,soundscape_29201_10,,,,,,,,,,...,,,,,,,,,,
2,soundscape_29201_15,,,,,,,,,,...,,,,,,,,,,
3,soundscape_29201_20,,,,,,,,,,...,,,,,,,,,,
4,soundscape_29201_25,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,soundscape_29201_copy_580,,,,,,,,,,...,,,,,,,,,,
236,soundscape_29201_copy_585,,,,,,,,,,...,,,,,,,,,,
237,soundscape_29201_copy_590,,,,,,,,,,...,,,,,,,,,,
238,soundscape_29201_copy_595,,,,,,,,,,...,,,,,,,,,,


In [72]:
[row_id for row_id in row_ids]

[['soundscape_29201_5',
  'soundscape_29201_10',
  'soundscape_29201_15',
  'soundscape_29201_20',
  'soundscape_29201_25',
  'soundscape_29201_30',
  'soundscape_29201_35',
  'soundscape_29201_40',
  'soundscape_29201_45',
  'soundscape_29201_50',
  'soundscape_29201_55',
  'soundscape_29201_60',
  'soundscape_29201_65',
  'soundscape_29201_70',
  'soundscape_29201_75',
  'soundscape_29201_80',
  'soundscape_29201_85',
  'soundscape_29201_90',
  'soundscape_29201_95',
  'soundscape_29201_100',
  'soundscape_29201_105',
  'soundscape_29201_110',
  'soundscape_29201_115',
  'soundscape_29201_120',
  'soundscape_29201_125',
  'soundscape_29201_130',
  'soundscape_29201_135',
  'soundscape_29201_140',
  'soundscape_29201_145',
  'soundscape_29201_150',
  'soundscape_29201_155',
  'soundscape_29201_160'],
 ['soundscape_29201_165',
  'soundscape_29201_170',
  'soundscape_29201_175',
  'soundscape_29201_180',
  'soundscape_29201_185',
  'soundscape_29201_190',
  'soundscape_29201_195',
  'so

In [42]:
cfg["test_loader"]["batch_size"] = 1
cfg["test_loader"]["num_workers"] = 0

In [43]:
test_dataloader = torch.utils.data.DataLoader(
    dataset, **cfg["test_loader"],
)

In [44]:
for i in test_dataloader:
    #i = i.squeeze()
    #print(i.shape)
    print(i)

['soundscape_29201_5', 'soundscape_29201_10', 'soundscape_29201_15', 'soundscape_29201_20', 'soundscape_29201_25', 'soundscape_29201_30', 'soundscape_29201_35', 'soundscape_29201_40', 'soundscape_29201_45', 'soundscape_29201_50', 'soundscape_29201_55', 'soundscape_29201_60', 'soundscape_29201_65', 'soundscape_29201_70', 'soundscape_29201_75', 'soundscape_29201_80', 'soundscape_29201_85', 'soundscape_29201_90', 'soundscape_29201_95', 'soundscape_29201_100', 'soundscape_29201_105', 'soundscape_29201_110', 'soundscape_29201_115', 'soundscape_29201_120', 'soundscape_29201_125', 'soundscape_29201_130', 'soundscape_29201_135', 'soundscape_29201_140', 'soundscape_29201_145', 'soundscape_29201_150', 'soundscape_29201_155', 'soundscape_29201_160', 'soundscape_29201_165', 'soundscape_29201_170', 'soundscape_29201_175', 'soundscape_29201_180', 'soundscape_29201_185', 'soundscape_29201_190', 'soundscape_29201_195', 'soundscape_29201_200', 'soundscape_29201_205', 'soundscape_29201_210', 'soundscape

In [38]:
i[1][0].shape

AttributeError: 'tuple' object has no attribute 'shape'

In [17]:
sample_submission

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,soundscape_29201_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,soundscape_29201_15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
sample_submission = pd.read_csv(
    f"../../data/input/sample_submission.csv"
)

In [16]:
train

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,0
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,...,0,0,0,0,0,0,0,0,0,0
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,...,0,0,0,0,0,0,0,0,0,0
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,...,0,0,0,0,0,0,0,0,0,0
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,...,0,0,0,0,0,0,0,0,0,1
16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1
16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1
16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
df_test = pd.DataFrame(
     [(path.stem, *path.stem.split("_"), path) for path in Path(Config.test_path).glob("*.ogg")],
    columns = ["filename", "name" ,"id", "path"]
)
print(df_test.shape)
df_test.head()

In [2]:
# Read csv
train = pd.read_csv(f"../../data/input/train_metadata.csv")
sample_submission = pd.read_csv(
    f"../../data/input/sample_submission.csv"
)

In [3]:
# Retrieve target
birds = sample_submission.columns[1:]

In [4]:
birds

Index(['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1', 'afecuc1',
       'affeag1', 'afgfly1', 'afghor1', 'afmdov1',
       ...
       'yebsto1', 'yeccan1', 'yefcan', 'yelbis1', 'yenspu1', 'yertin1',
       'yesbar1', 'yespet1', 'yetgre1', 'yewgre1'],
      dtype='object', length=264)

In [5]:
# Label encoding
train = pd.concat([train, pd.get_dummies(train["primary_label"])], axis=1)

In [6]:
train

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,0
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,...,0,0,0,0,0,0,0,0,0,0
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,...,0,0,0,0,0,0,0,0,0,0
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,...,0,0,0,0,0,0,0,0,0,0
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,...,0,0,0,0,0,0,0,0,0,1
16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1
16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1
16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Fix order
new_columns = list(train.columns.difference(birds)) + list(birds)


In [9]:
list(train.columns.difference(birds))

['author',
 'common_name',
 'filename',
 'latitude',
 'license',
 'longitude',
 'primary_label',
 'rating',
 'scientific_name',
 'secondary_labels',
 'type',
 'url']

In [8]:
new_columns

['author',
 'common_name',
 'filename',
 'latitude',
 'license',
 'longitude',
 'primary_label',
 'rating',
 'scientific_name',
 'secondary_labels',
 'type',
 'url',
 'abethr1',
 'abhori1',
 'abythr1',
 'afbfly1',
 'afdfly1',
 'afecuc1',
 'affeag1',
 'afgfly1',
 'afghor1',
 'afmdov1',
 'afpfly1',
 'afpkin1',
 'afpwag1',
 'afrgos1',
 'afrgrp1',
 'afrjac1',
 'afrthr1',
 'amesun2',
 'augbuz1',
 'bagwea1',
 'barswa',
 'bawhor2',
 'bawman1',
 'bcbeat1',
 'beasun2',
 'bkctch1',
 'bkfruw1',
 'blacra1',
 'blacuc1',
 'blakit1',
 'blaplo1',
 'blbpuf2',
 'blcapa2',
 'blfbus1',
 'blhgon1',
 'blhher1',
 'blksaw1',
 'blnmou1',
 'blnwea1',
 'bltapa1',
 'bltbar1',
 'bltori1',
 'blwlap1',
 'brcale1',
 'brcsta1',
 'brctch1',
 'brcwea1',
 'brican1',
 'brobab1',
 'broman1',
 'brosun1',
 'brrwhe3',
 'brtcha1',
 'brubru1',
 'brwwar1',
 'bswdov1',
 'btweye2',
 'bubwar2',
 'butapa1',
 'cabgre1',
 'carcha1',
 'carwoo1',
 'categr',
 'ccbeat1',
 'chespa1',
 'chewea1',
 'chibat1',
 'chtapa3',
 'chucis1',
 'cibwar

In [None]:
train = train.reindex(columns=new_columns)

In [1]:
import torch
import torchaudio
import requests
import matplotlib.pyplot as plt



In [2]:
metadata = torchaudio.info("data/input/train_audio/abethr1/XC128013.ogg")
print(metadata)

RuntimeError: No audio I/O backend is available.