In [1]:
import os
import shutil
import warnings
warnings.filterwarnings(action="ignore")
import numpy as np
import pandas as pd
import torchaudio
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import json
import geopandas
import librosa
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import soundfile
import tensorflow as tf
import torch
from PIL import Image
from torchaudio.transforms import MelSpectrogram
from tqdm import tqdm

In [2]:
SEED = 42
IMG_SIZE = 260
SAMPLE_RATE = 32000
N_FFT = 2048
SIGNAL_LENGTH = 5  # seconds
FREQ_MIN = 0
FREQ_MAX = 15000
MAX_AUDIO_FILES = 100
WIN_LENGHT = 1024
AUDIO_PATH = "/app/_data/train_short_audio/"
calls_to_drop = [
    "XC509721.ogg",
    "XC428067.ogg",
    "XC523831.ogg",
    "XC523960.ogg",
    "XC237870.ogg",
    "XC129924.ogg",
    "XC576851.ogg",
    "XC579430.ogg",
    "XC590621.ogg",
]

In [3]:
def get_audio_info(filepath):
    with soundfile.SoundFile(filepath) as f:
        sr = f.samplerate
        frames = f.frames
        duration = float(frames) / sr
    return {"frames": frames, "duration": duration}

In [4]:
train_soundscapes_labels = pd.read_csv("/app/_data/train_soundscape_labels_orig.csv")
train_metadata = pd.read_csv("/app/_data/train_metadata_orig.csv")
test = pd.read_csv("/app/_data/test.csv")
test_dates = pd.read_csv("/app/_data/test_soundscapes/test_set_recording_dates.csv")

# train_soundscapes_labels

In [5]:
train_soundscapes_labels.head()

Unnamed: 0,row_id,site,audio_id,seconds,birds
0,7019_COR_5,COR,7019,5,nocall
1,7019_COR_10,COR,7019,10,nocall
2,7019_COR_15,COR,7019,15,nocall
3,7019_COR_20,COR,7019,20,nocall
4,7019_COR_25,COR,7019,25,nocall


In [6]:
train_soundscapes_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   row_id    2400 non-null   object
 1   site      2400 non-null   object
 2   audio_id  2400 non-null   int64 
 3   seconds   2400 non-null   int64 
 4   birds     2400 non-null   object
dtypes: int64(2), object(3)
memory usage: 93.9+ KB


In [7]:
train_soundscapes_labels.describe()

Unnamed: 0,audio_id,seconds
count,2400.0,2400.0
mean,26920.7,302.5
std,17346.241083,173.235161
min,2782.0,5.0
25%,11074.0,153.75
50%,24238.0,302.5
75%,43419.5,451.25
max,57610.0,600.0


## info about files

In [8]:
info_df = pd.DataFrame()
for filename in os.listdir("/app/_data/train_soundscapes/"):
    filename_full = filename
    filename = filename[:-4].split("_")
    audio_info = get_audio_info("/app/_data/train_soundscapes/"+filename_full)
    info_df = info_df.append(
        {
            "audio_id": np.int(filename[0]),
            'filename': filename_full,
            "date": filename[-1],
            "file_path": "/app/_data/train_soundscapes/" + filename_full,
            'frames': audio_info['frames'],
            'duration': audio_info['duration']
        },
        ignore_index=True,
    )

In [9]:
train_soundscapes_labels = train_soundscapes_labels.merge(
    info_df, on="audio_id", how="outer"
)

In [10]:
train_soundscapes_labels.head(2)

Unnamed: 0,row_id,site,audio_id,seconds,birds,date,duration,file_path,filename,frames
0,7019_COR_5,COR,7019,5,nocall,20190904,600.0,/app/_data/train_soundscapes/7019_COR_20190904...,7019_COR_20190904.ogg,19200000.0
1,7019_COR_10,COR,7019,10,nocall,20190904,600.0,/app/_data/train_soundscapes/7019_COR_20190904...,7019_COR_20190904.ogg,19200000.0


In [11]:
train_soundscapes_labels["date"] = pd.to_datetime(
    train_soundscapes_labels["date"], format="%Y%m%d"
)

In [12]:
train_soundscapes_labels["month"] = train_soundscapes_labels["date"].dt.month
train_soundscapes_labels["year"] = train_soundscapes_labels["date"].dt.year

## coordinates of sites

In [13]:
list_sites = []
for file_name in os.listdir("/app/_data/test_soundscapes/txt"):
    if file_name[-3:] == "txt":
        with open("/app/_data/test_soundscapes/txt/" + file_name, "r") as f:
            site = f.readlines()
            list_sites.append([file_name.split("_")[0], site[-2:]])

In [14]:
coord_sites = pd.DataFrame()
for i in list_sites:
    ids = i[0]
    coord_sites.loc[ids, "longitude"] = i[1][1].split(":")[1].split("\n")[0].split()[0]
    coord_sites.loc[ids, "latitude"] = i[1][0].split(":")[1].split("\n")[0].split()[0]
coord_sites = coord_sites.astype("float")
coord_sites["site"] = coord_sites.index

In [15]:
coord_sites

Unnamed: 0,longitude,latitude,site
COR,-84.51,10.12,COR
SNE,-119.95,38.49,SNE
SSW,-76.45,42.47,SSW
COL,-75.85,5.57,COL


In [16]:
train_soundscapes_labels = train_soundscapes_labels.merge(
    coord_sites, on="site", how="left"
)

## birds labels

In [17]:
for i in train_soundscapes_labels.index.tolist():
    l = train_soundscapes_labels.loc[i, "birds"].split()
    if len(l) > 1:
        train_soundscapes_labels.loc[i, "primary_label"] = str(l[0])
        train_soundscapes_labels.loc[i, "secondary_labels"] = " ".join(l[1:])
    else:
        train_soundscapes_labels.loc[i, "primary_label"] = str(l[0])

In [18]:
train_soundscapes_labels["end_sec"] = train_soundscapes_labels["seconds"]
train_soundscapes_labels["start_sec"] = train_soundscapes_labels["end_sec"] - 5

In [19]:
train_soundscapes_labels = train_soundscapes_labels[
    [
        'filename',
        "primary_label",
        "secondary_labels",
        "row_id",
        "date",
        "file_path",
        "longitude",
        "latitude",
        "month",
        "year",
        "start_sec",
        "end_sec",
        'duration',
        'site',
        'birds'
    ]
]

In [20]:
train_soundscapes_labels["rating"] = 6


### save prepared dataset

In [45]:
train_soundscapes_labels.to_csv(
    "/app/_data/train_soundscape_labels_full.csv", index=False
)

In [22]:
train_soundscapes_labels.head()

Unnamed: 0,filename,primary_label,secondary_labels,row_id,date,file_path,longitude,latitude,month,year,start_sec,end_sec,duration,site,birds,rating
0,7019_COR_20190904.ogg,nocall,,7019_COR_5,2019-09-04,/app/_data/train_soundscapes/7019_COR_20190904...,-84.51,10.12,9,2019,0,5,600.0,COR,nocall,6
1,7019_COR_20190904.ogg,nocall,,7019_COR_10,2019-09-04,/app/_data/train_soundscapes/7019_COR_20190904...,-84.51,10.12,9,2019,5,10,600.0,COR,nocall,6
2,7019_COR_20190904.ogg,nocall,,7019_COR_15,2019-09-04,/app/_data/train_soundscapes/7019_COR_20190904...,-84.51,10.12,9,2019,10,15,600.0,COR,nocall,6
3,7019_COR_20190904.ogg,nocall,,7019_COR_20,2019-09-04,/app/_data/train_soundscapes/7019_COR_20190904...,-84.51,10.12,9,2019,15,20,600.0,COR,nocall,6
4,7019_COR_20190904.ogg,nocall,,7019_COR_25,2019-09-04,/app/_data/train_soundscapes/7019_COR_20190904...,-84.51,10.12,9,2019,20,25,600.0,COR,nocall,6


## stat

In [23]:
train_soundscapes_labels["site"].value_counts()

COR    1200
SSW    1200
Name: site, dtype: int64

In [24]:
train_soundscapes_labels["filename"].value_counts()

54955_SSW_20170617.ogg    120
18003_COR_20190904.ogg    120
7843_SSW_20170325.ogg     120
21767_COR_20190904.ogg    120
2782_SSW_20170701.ogg     120
50878_COR_20191004.ogg    120
28933_SSW_20170408.ogg    120
51010_SSW_20170513.ogg    120
44957_COR_20190923.ogg    120
11254_COR_20190904.ogg    120
7954_COR_20190923.ogg     120
10534_SSW_20170429.ogg    120
31928_COR_20191004.ogg    120
42907_SSW_20170708.ogg    120
14473_SSW_20170701.ogg    120
26746_COR_20191004.ogg    120
26709_SSW_20170701.ogg    120
20152_SSW_20170805.ogg    120
57610_COR_20190904.ogg    120
7019_COR_20190904.ogg     120
Name: filename, dtype: int64

In [25]:
len(train_soundscapes_labels["birds"].unique())
train_soundscapes_labels["birds"].value_counts()

109

nocall                    1529
rucwar                     149
bobfly1                     77
reevir1                     57
rewbla                      52
                          ... 
balori comgra                1
norwat                       1
cangoo gockin sonspa         1
rucwar runwre1 yehcar1       1
gockin rewbla                1
Name: birds, Length: 109, dtype: int64

 - there are 108 bird species in train soundscapes and 'nocall'.
 - the most frequent label is 'nocall',
 - sometimes there are 2 or more birds in labels,
 - train sounscapes from 2 sites: COR and SSW,
 - each file devided into 120 5-seconds segments

# train_metadata

In [26]:
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url
0,acafly,['amegfi'],"['begging call', 'call', 'juvenile']",35.386,-84.125,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605
1,acafly,[],['call'],9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209
2,acafly,[],['call'],5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032
3,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974
4,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981


In [27]:
train_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62874 entries, 0 to 62873
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   primary_label     62874 non-null  object 
 1   secondary_labels  62874 non-null  object 
 2   type              62874 non-null  object 
 3   latitude          62874 non-null  float64
 4   longitude         62874 non-null  float64
 5   scientific_name   62874 non-null  object 
 6   common_name       62874 non-null  object 
 7   author            62874 non-null  object 
 8   date              62874 non-null  object 
 9   filename          62874 non-null  object 
 10  license           62874 non-null  object 
 11  rating            62874 non-null  float64
 12  time              62874 non-null  object 
 13  url               62874 non-null  object 
dtypes: float64(3), object(11)
memory usage: 6.7+ MB


In [28]:
train_metadata["month"] = train_metadata["date"].apply(lambda x: x.split("-")[1])
train_metadata["month"] = train_metadata["month"].astype("int")
train_metadata["year"] = train_metadata["date"].apply(lambda x: x.split("-")[0])
train_metadata["year"] = train_metadata["year"].astype("int")

In [29]:
train_metadata["type"] = train_metadata["type"].str.replace(r"[\[\]\'\"\(\)?]", "")
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].str.replace(
    r"[\[\]\',]", ""
)

In [30]:
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,month,year
0,acafly,amegfi,"begging call, call, juvenile",35.386,-84.125,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605,8,2012
1,acafly,,call,9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209,12,2000
2,acafly,,call,5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032,1,2012
3,acafly,whwbec1,call,4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974,6,2009
4,acafly,whwbec1,call,4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981,6,2009


## info about audio files

In [31]:
for ids in tqdm(train_metadata.index.tolist()):
    bird = train_metadata.loc[ids, "primary_label"]
    path = AUDIO_PATH + bird
    record = train_metadata.loc[ids, "filename"]
    audio_info = get_audio_info(os.path.join(path, record))
    train_metadata.loc[ids, "file_path"] = os.path.join(path, record)
    train_metadata.loc[ids, "frames"] = audio_info["frames"]
    train_metadata.loc[ids, "duration"] = audio_info["duration"]

100%|██████████| 62874/62874 [04:23<00:00, 238.68it/s]


## info about species

In [32]:
train_metadata[["scientific_name", "common_name", 'primary_label']].value_counts()

scientific_name            common_name                 primary_label
Cardinalis cardinalis      Northern Cardinal           norcar           500
Loxia curvirostra          Red Crossbill               redcro           500
Passer domesticus          House Sparrow               houspa           500
Pipilo maculatus           Spotted Towhee              spotow           500
Toxostoma curvirostre      Curve-billed Thrasher       cubthr           500
                                                                       ... 
Pionus senilis             White-crowned Parrot        whcpar            12
Melozone leucotis          White-eared Ground-Sparrow  wegspa1           10
Campylorhynchus rufinucha  Rufous-naped Wren           runwre1            9
Amazilia saucerottei       Steely-vented Hummingbird   stvhum2            8
Psittacara finschi         Crimson-fronted Parakeet    crfpar             8
Length: 397, dtype: int64

 - There are 397 species in dataset.
 - The most frequent species are presented in 500 audio files, the most rare - only in 8-10.

In [33]:
train_metadata["secondary_labels"].value_counts()

                                      41358
rewbla                                  292
amerob                                  235
houspa                                  218
norcar                                  205
                                      ...  
bkcchi wewpew yebcha                      1
whwdov bncfly bnhcow cogdov trokin        1
lazbun wesmea                             1
pinsis pygnut wewpew daejun               1
eletro thbkin norcar stbori cacwre        1
Name: secondary_labels, Length: 10918, dtype: int64

Sometimes in "secondary_labels" there are more than one bird

In [34]:
list_secondary_labels = []
for i in train_metadata["secondary_labels"].values:
    list_secondary_labels.extend(i.split(" "))
len(set(list_secondary_labels))

394

In [35]:
set(train_metadata['primary_label']) - set(list_secondary_labels)
set(list_secondary_labels) - set(train_metadata['primary_label'])

{'bongul', 'grhcha1', 'heptan', 'rocpig', 'runwre1'}

{'', 'rocpig1'}

#### In secondary labels only 394 birds. In secondary labels there is bird "rocpig1' and in primary labels "rocpig". What's the difference?

In [36]:
train_metadata[train_metadata['secondary_labels'].str.contains('rocpig1')]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,month,year,file_path,frames,duration
4044,barswa,rocpig1,song,28.3686,-108.9283,Hirundo rustica,Barn Swallow,Richard E. Webster,2013-08-16,XC337983.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,05:18,https://www.xeno-canto.org/337983,8,2013,/app/_data/train_short_audio/barswa/XC337983.ogg,4320479.0,135.014969
4050,barswa,rocpig1,alarm call,58.4151,14.1515,Hirundo rustica,Barn Swallow,Patrik Åberg,2016-07-10,XC343870.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,04:40,https://www.xeno-canto.org/343870,7,2016,/app/_data/train_short_audio/barswa/XC343870.ogg,3139327.0,98.103969
12758,cangoo,rocpig1,begging call,51.5701,-0.103,Branta canadensis,Canada Goose,nick talbot,2018-07-15,XC425088.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,10:30,https://www.xeno-canto.org/425088,7,2018,/app/_data/train_short_audio/cangoo/XC425088.ogg,543800.0,16.99375
15958,cliswa,rocpig1 solsan whtdov,call,27.1621,-109.3975,Petrochelidon pyrrhonota,Cliff Swallow,Richard E. Webster,2019-07-20,XC512327.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,06:42,https://www.xeno-canto.org/512327,7,2019,/app/_data/train_short_audio/cliswa/XC512327.ogg,2089376.0,65.293
21499,eucdov,rocpig1 grtgra,"male, song",14.5884,-88.581,Streptopelia decaocto,Eurasian Collared-Dove,Oliver Komar,2017-09-06,XC385693.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,06:00,https://www.xeno-canto.org/385693,9,2017,/app/_data/train_short_audio/eucdov/XC385693.ogg,1126435.0,35.201094
21560,eucdov,rocpig1,"flight call, male, song",49.5345,26.2093,Streptopelia decaocto,Eurasian Collared-Dove,Ruslan Mazuryk,2019-06-02,XC481106.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,13:18,https://www.xeno-canto.org/481106,6,2019,/app/_data/train_short_audio/eucdov/XC481106.ogg,5771936.0,180.373
27180,grtgra,rocpig1,"male, song",14.5884,-88.581,Quiscalus mexicanus,Great-tailed Grackle,Oliver Komar,2017-09-06,XC385692.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,06:00,https://www.xeno-canto.org/385692,9,2017,/app/_data/train_short_audio/grtgra/XC385692.ogg,273799.0,8.556219
29386,houspa,rocpig1,"call, male",44.0635,1.9515,Passer domesticus,House Sparrow,Cedric Mroczko,2020-05-26,XC564291.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,09:10,https://www.xeno-canto.org/564291,5,2020,/app/_data/train_short_audio/houspa/XC564291.ogg,369093.0,11.534156
35142,moudov,rewbla rocpig1 cangoo saypho killde amerob,"song, wing whistle",40.0439,-105.1856,Zenaida macroura,Mourning Dove,Eric DeFonso,2013-04-05,XC172439.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,08:32,https://www.xeno-canto.org/172439,4,2013,/app/_data/train_short_audio/moudov/XC172439.ogg,3523232.0,110.101


In [37]:
train_metadata[train_metadata['primary_label'].str.contains('rocpig')]

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,month,year,file_path,frames,duration
44225,rocpig,,song,-20.8004,-42.8884,Columba livia,Rock Pigeon,Noé Eiterer,2011-05-22,XC113783.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.0,14:30,https://www.xeno-canto.org/113783,5,2011,/app/_data/train_short_audio/rocpig/XC113783.ogg,226987.0,7.093344
44226,rocpig,,"aberrant, song",52.2776,20.9679,Columba livia,Rock Pigeon,Jarek Matusiak,2013-02-16,XC121680.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,14:00,https://www.xeno-canto.org/121680,2,2013,/app/_data/train_short_audio/rocpig/XC121680.ogg,1441184.0,45.037000
44227,rocpig,,"male, song",56.1364,47.2440,Columba livia,Rock Pigeon,Albert Lastukhin,2013-03-10,XC126804.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,12:30,https://www.xeno-canto.org/126804,3,2013,/app/_data/train_short_audio/rocpig/XC126804.ogg,2376093.0,74.252906
44228,rocpig,,"male, song",56.0828,47.2900,Columba livia,Rock Pigeon,Albert Lastukhin,2013-04-17,XC129886.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,16:00,https://www.xeno-canto.org/129886,4,2013,/app/_data/train_short_audio/rocpig/XC129886.ogg,1650584.0,51.580750
44229,rocpig,,song,56.0888,46.6264,Columba livia,Rock Pigeon,Albert Lastukhin,2013-07-24,XC144228.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.0,05:30,https://www.xeno-canto.org/144228,7,2013,/app/_data/train_short_audio/rocpig/XC144228.ogg,1068961.0,33.405031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44348,rocpig,rewbla yehbla,call,49.2720,-123.1965,Columba livia,Rock Pigeon,Peter Ward and Ken Hall,1990-07-08,XC613537.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,07:00,https://www.xeno-canto.org/613537,7,1990,/app/_data/train_short_audio/rocpig/XC613537.ogg,1781795.0,55.681094
44349,rocpig,,song,15.6690,-96.5732,Columba livia,Rock Pigeon,Manuel Grosselet,2021-01-12,XC615049.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,10:00,https://www.xeno-canto.org/615049,1,2021,/app/_data/train_short_audio/rocpig/XC615049.ogg,810458.0,25.326812
44350,rocpig,,song,58.3749,26.6876,Columba livia,Rock Pigeon,Uku Paal,2021-01-19,XC615956.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,10:00,https://www.xeno-canto.org/615956,1,2021,/app/_data/train_short_audio/rocpig/XC615956.ogg,525410.0,16.419062
44351,rocpig,,song,26.9280,75.7930,Columba livia,Rock Pigeon,Mike Nelson,2011-03-09,XC74354.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,7:00am,https://www.xeno-canto.org/74354,3,2011,/app/_data/train_short_audio/rocpig/XC74354.ogg,388319.0,12.134969


___rocpig___ - Columba livia, ___rocpig1___ - also Columba livia

In [38]:
train_metadata[train_metadata["secondary_labels"].str.contains("rocpig1")][
    "secondary_labels"
].values

array(['rocpig1', 'rocpig1', 'rocpig1', 'rocpig1 solsan whtdov',
       'rocpig1 grtgra', 'rocpig1', 'rocpig1', 'rocpig1',
       'rewbla rocpig1 cangoo saypho killde amerob'], dtype=object)

In [39]:
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].replace(
    "rocpig1", "rocpig"
)

In [40]:
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].replace(
    {
        "rocpig1 solsan whtdov": "rocpig solsan whtdov",
        "rocpig1 grtgra": "rocpig grtgra",
        "rewbla rocpig1 cangoo saypho killde amerob": "rewbla rocpig cangoo saypho killde amerob",
    }
)

In [41]:
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].replace(
    "", np.nan
)

In [42]:
train_metadata["start_sec"] = 0
train_metadata["end_sec"] = 5

In [43]:
train_metadata.to_csv("/app/_data/train_metadata_full.csv", index=False)

In [44]:
train_metadata = train_metadata[
    [
        "primary_label",
        "secondary_labels",
        "latitude",
        "longitude",
        "date",
        "filename",
        "rating",
        "month",
        "year",
        "file_path",
        "frames",
        "duration",
        "start_sec",
        "end_sec",
    ]
]

# concat

In [46]:
all_audio = pd.concat(
    [train_metadata, train_soundscapes_labels], axis=0, ignore_index=True
)

In [47]:
all_audio.head()

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,frames,duration,start_sec,end_sec,row_id,site,birds
0,acafly,amegfi,35.386,-84.125,2012-08-12,XC109605.ogg,2.5,8,2012,/app/_data/train_short_audio/acafly/XC109605.ogg,2037586.0,63.674563,0,5,,,
1,acafly,,9.1334,-79.6501,2000-12-26,XC11209.ogg,3.0,12,2000,/app/_data/train_short_audio/acafly/XC11209.ogg,532933.0,16.654156,0,5,,,
2,acafly,,5.7813,-75.7452,2012-01-10,XC127032.ogg,3.0,1,2012,/app/_data/train_short_audio/acafly/XC127032.ogg,1508450.0,47.139063,0,5,,,
3,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129974.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129974.ogg,450177.0,14.068031,0,5,,,
4,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129981.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129981.ogg,1301142.0,40.660688,0,5,,,


## primary_label and secondary_labels

In [48]:
all_audio["secondary_labels"] = all_audio["secondary_labels"].replace("", np.nan)

In [49]:
dict_birds = {}
for i, bird in enumerate(sorted(all_audio["primary_label"].unique())):
    dict_birds[bird] = i
all_audio["label_id"] = all_audio["primary_label"].replace(dict_birds)

In [53]:
file_json = open("/app/_data/dict_birds.json", "w")
json.dump(dict_birds, file_json)
file_json.close()

In [50]:
for i in tqdm(all_audio.index.tolist()):
    if type(all_audio.loc[i, "secondary_labels"]) != float:
        labels = all_audio.loc[i, "secondary_labels"].split()
        list_ids = []
        for bird in labels:
            list_ids.append(str(dict_birds[bird]))
        all_audio.loc[i, "secondary_labels_id"] = " ".join(list_ids)

100%|██████████| 65274/65274 [00:27<00:00, 2389.11it/s]


### weights

In [51]:
dict_weights = (1 / all_audio["primary_label"].value_counts()).to_dict()
all_audio["class_weights"] = all_audio["primary_label"].replace(dict_weights)
all_audio["num_intervals"] = all_audio["duration"] // 5

### preparing additional data

In [52]:
all_audio["sin_month"] = np.sin(2 * np.pi * all_audio["month"] / 12)
all_audio["cos_month"] = np.cos(2 * np.pi * all_audio["month"] / 12)
all_audio["sin_longitude"] = np.sin(2 * np.pi * (all_audio["longitude"]) / 360)
all_audio["cos_longitude"] = np.cos(2 * np.pi * (all_audio["longitude"]) / 360)
all_audio["norm_latitude"] = (all_audio["latitude"] + 90) / 180

In [50]:
all_audio.sample()

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,...,row_id,sin_month,cos_month,sin_longitude,cos_longitude,norm_latitude,label_id,class_weights,num_intervals,secondary_labels_id
34777,meapar,,-1.7375,-51.4556,2005-11-08,XC84848.ogg,4.0,11,2005,/app/_data/train_short_audio/meapar/XC84848.ogg,...,,-0.5,0.866025,-0.782126,0.623121,0.490347,210,0.005155,7.0,


In [51]:
all_audio[["start_sec", "end_sec", "rating"]] = all_audio[
    ["start_sec", "end_sec", "rating"]
].astype("float16")
all_audio[["year", "label_id"]] = all_audio[["year", "label_id"]].astype("int16")

In [52]:
all_audio = all_audio.query("filename not in @calls_to_drop").reset_index(drop=True)

In [53]:
all_audio.to_csv("/app/_data/all_audio_initial.csv", index=False)

In [54]:
all_audio

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,...,row_id,sin_month,cos_month,sin_longitude,cos_longitude,norm_latitude,label_id,class_weights,num_intervals,secondary_labels_id
0,acafly,amegfi,35.3860,-84.1250,2012-08-12,XC109605.ogg,2.5,8,2012,/app/_data/train_short_audio/acafly/XC109605.ogg,...,,-8.660254e-01,-0.500000,-0.994748,0.102359,0.696589,0,0.007576,12.0,5
1,acafly,,9.1334,-79.6501,2000-12-26,XC11209.ogg,3.0,12,2000,/app/_data/train_short_audio/acafly/XC11209.ogg,...,,-2.449294e-16,1.000000,-0.983729,0.179659,0.550741,0,0.007576,3.0,
2,acafly,,5.7813,-75.7452,2012-01-10,XC127032.ogg,3.0,1,2012,/app/_data/train_short_audio/acafly/XC127032.ogg,...,,5.000000e-01,0.866025,-0.969210,0.246234,0.532118,0,0.007576,9.0,
3,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129974.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129974.ogg,...,,1.224647e-16,-1.000000,-0.968706,0.248211,0.525954,0,0.007576,2.0,371
4,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129981.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129981.ogg,...,,1.224647e-16,-1.000000,-0.968706,0.248211,0.525954,0,0.007576,8.0,371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65260,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,54955_SSW_580,1.224647e-16,-1.000000,-0.972166,0.234294,0.735944,220,0.000654,120.0,
65261,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,54955_SSW_585,1.224647e-16,-1.000000,-0.972166,0.234294,0.735944,168,0.003040,120.0,
65262,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,54955_SSW_590,1.224647e-16,-1.000000,-0.972166,0.234294,0.735944,168,0.003040,120.0,
65263,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,54955_SSW_595,1.224647e-16,-1.000000,-0.972166,0.234294,0.735944,220,0.000654,120.0,


## Dates sites

In [51]:
import pandas as pd

In [52]:
dates = pd.read_csv("/app/_data/test_soundscapes/test_set_recording_dates.csv")

In [53]:
dates["date"] = pd.to_datetime(dates["date"], format="%Y%m%d")
dates["month"] = dates["date"].dt.month
dates["year"] = dates["date"].dt.year

In [54]:
monts_sites = dates.groupby("site")[["month", "year"]].describe()

In [55]:
monts_sites

Unnamed: 0_level_0,month,month,month,month,month,month,month,month,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
site,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
COL,4.0,10.25,0.957427,9.0,9.75,10.5,11.0,11.0,4.0,2019.0,0.0,2019.0,2019.0,2019.0,2019.0,2019.0
COR,5.0,9.4,0.547723,9.0,9.0,9.0,10.0,10.0,5.0,2019.0,0.0,2019.0,2019.0,2019.0,2019.0,2019.0
SNE,4.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,4.0,2018.0,0.0,2018.0,2018.0,2018.0,2018.0,2018.0
SSW,12.0,5.5,2.067058,2.0,3.75,6.0,7.0,8.0,12.0,2017.0,0.0,2017.0,2017.0,2017.0,2017.0,2017.0


In [56]:
dates_sites = (
    monts_sites["month"][["min", "max"]]
    .merge(monts_sites["year"][["min", "max"]], on="site", suffixes=["_month", "_year"])
    .astype("int")
)

In [57]:
dates_sites

Unnamed: 0_level_0,min_month,max_month,min_year,max_year
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
COL,9,11,2019,2019
COR,9,10,2019,2019
SNE,5,5,2018,2018
SSW,2,8,2017,2017


In [58]:
dates_sites["min_date"] = (
    dates_sites["min_year"].astype("str")
    + "0"
    + dates_sites["min_month"].astype("str")
    + "15"
)

In [59]:
dates_sites["max_date"] = (
    dates_sites["min_year"].astype("str")
    + dates_sites["max_month"].astype("str")
    + "15"
)

In [60]:
dates_sites["max_date"] = pd.to_datetime(dates_sites["max_date"], format="%Y%m%d")
dates_sites["min_date"] = pd.to_datetime(dates_sites["min_date"], format="%Y%m%d")

In [63]:
dates_sites.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, COL to SSW
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   min_month  4 non-null      int64         
 1   max_month  4 non-null      int64         
 2   min_year   4 non-null      int64         
 3   max_year   4 non-null      int64         
 4   min_date   4 non-null      datetime64[ns]
 5   max_date   4 non-null      datetime64[ns]
dtypes: datetime64[ns](2), int64(4)
memory usage: 224.0+ bytes


In [64]:
dates_sites.to_csv("/app/_data/dates_sites.csv")

In [65]:
dates_sites

Unnamed: 0_level_0,min_month,max_month,min_year,max_year,min_date,max_date
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
COL,9,11,2019,2019,2019-09-15,2019-11-15
COR,9,10,2019,2019,2019-09-15,2019-10-15
SNE,5,5,2018,2018,2018-05-15,2018-05-15
SSW,2,8,2017,2017,2017-02-15,2017-08-15


In [27]:
def choose_ids(distance_delta=500, months_delta=2, years_delta=5):
    import pandas as pd

    distances_df = pd.read_csv("/app/_data/distances.csv")
    dates_sites = pd.read_csv("/app/_data/dates_sites.csv")
    #     columns = ["dist_COR", "dist_SNE", "dist_SSW", "dist_COL"]
    #     df_ = pd.DataFrame(
    #         columns=[
    #             "dist_COR",
    #             "dist_SNE",
    #             "dist_SSW",
    #             "dist_COL",
    #             "filename",
    #             "month",
    #             "year",
    #         ]
    #     )
    df = distances_df.query(
        "dist_COR <= @distance_delta or dist_SNE <= @distance_delta or dist_SSW <= @distance_delta or dist_COL <= @distance_delta"
    ).reset_index(drop=True)
    #     for i in distances_df.index.tolist():
    #         for col in ["dist_COR", "dist_SNE", "dist_SSW", 'dist_COL']:
    #             site = col.split('_')[1]
    #             d_sites = dates_sites[dates_sites['site']==site]
    #             m_min = d_sites['min_month']
    #             m_max = d_sites['min_month']
    #             year = d_sites['min_year']
    #             df = distances_df.query('@col <= distance_delta and month<= ')

    return df

In [34]:
distances_df = pd.read_csv("/app/_data/distances.csv")
distances_df.shape

(65272, 7)

In [37]:
choose_ids(distance_delta=600)

Unnamed: 0,dist_COR,dist_SNE,dist_SSW,dist_COL,filename,month,year
0,338.317387,3212.209334,2303.470628,357.643978,XC11209.ogg,12,2000
1,670.282828,3564.116428,2525.970162,16.212663,XC127032.ogg,1,2012
2,714.907051,3624.555047,2602.335865,63.586812,XC129974.ogg,6,2009
3,714.907051,3624.555047,2602.335865,63.586812,XC129981.ogg,6,2009
4,714.907051,3624.555047,2602.335865,63.586812,XC130056.ogg,11,2007
...,...,...,...,...,...,...,...
30340,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30341,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30342,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30343,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017


In [14]:
dates_sites = pd.read_csv("/app/_data/dates_sites.csv", parse_dates=[5, 6])

In [15]:
dates_sites

Unnamed: 0,site,min_month,max_month,min_year,max_year,min_date,max_date
0,COL,9,11,2019,2019,2019-09-15,2019-11-15
1,COR,9,10,2019,2019,2019-09-15,2019-10-15
2,SNE,5,5,2018,2018,2018-05-15,2018-05-15
3,SSW,2,8,2017,2017,2017-02-15,2017-08-15


In [26]:
import datetime

In [None]:
datetime.timedelta()

In [25]:
# dates_sites['max_date']+dates_sites['max_date']
dates_sites["max_date"] + pd.Timedelta(90, "day")

0   2020-02-13
1   2020-01-13
2   2018-08-13
3   2017-11-13
Name: max_date, dtype: datetime64[ns]

In [101]:
mon = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

In [102]:
mon[11] - 3

9

In [112]:
for i in range(12):
    d = 2
    start = mon[mon[i] - (d + 1)]
    middle = mon[mon[i] - (d)]
    end = mon[(mon[i] + (d - 1)) - 12]
    print(mon[i], start, end)

1 11 3
2 12 4
3 1 5
4 2 6
5 3 7
6 4 8
7 5 9
8 6 10
9 7 11
10 8 12
11 9 1
12 10 2


In [75]:
choose_ids()

Unnamed: 0,dist_COR,dist_SNE,dist_SSW,dist_COL,filename,month,year
1,338.317387,3212.209334,2303.470628,357.643978,XC11209.ogg,12,2000
2,670.282828,3564.116428,2525.970162,16.212663,XC127032.ogg,1,2012
3,714.907051,3624.555047,2602.335865,63.586812,XC129974.ogg,6,2009
4,714.907051,3624.555047,2602.335865,63.586812,XC129981.ogg,6,2009
5,714.907051,3624.555047,2602.335865,63.586812,XC130056.ogg,11,2007
...,...,...,...,...,...,...,...
65267,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65268,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65269,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65270,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
