In [1]:
import os
import shutil
import warnings

import numpy as np
import pandas as pd
import torchaudio

warnings.filterwarnings(action="ignore")
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
import json

import geopandas
import librosa
import librosa.display
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import soundfile
import tensorflow as tf
import torch

%matplotlib inline
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from torchaudio.transforms import MelSpectrogram
from tqdm import tqdm

In [2]:
# Notebook-wide configuration constants.
SEED = 42
IMG_SIZE = 260
SAMPLE_RATE = 32000
N_FFT = 2048
SIGNAL_LENGTH = 5  # seconds
FREQ_MIN = 500
FREQ_MAX = 15000
MAX_AUDIO_FILES = 100
# NOTE(review): "LENGHT" is a misspelling of "LENGTH"; renaming it requires
# updating every place that reads this constant.
WIN_LENGHT = 128
# Root folder of the short recordings; file paths are built below as
# AUDIO_PATH + <primary_label> / <filename>.
AUDIO_PATH = "/app/_data/train_short_audio/"
# File names that are removed from `all_audio` before it is written to disk.
calls_to_drop = ["XC579430.ogg", "XC590621.ogg"]

In [3]:
# Load the raw competition tables from the local data folder.
train_soundscape_labels = pd.read_csv("/app/_data/train_soundscape_labels_orig.csv")
train_metadata = pd.read_csv("/app/_data/train_metadata_orig.csv")
test = pd.read_csv("/app/_data/test.csv")
# NOTE: the same CSV is read again further down into a frame called `dates`.
test_dates = pd.read_csv("/app/_data/test_soundscapes/test_set_recording_dates.csv")

# train_soundscape_labels

In [4]:
train_soundscape_labels

Unnamed: 0,row_id,site,audio_id,seconds,birds
0,7019_COR_5,COR,7019,5,nocall
1,7019_COR_10,COR,7019,10,nocall
2,7019_COR_15,COR,7019,15,nocall
3,7019_COR_20,COR,7019,20,nocall
4,7019_COR_25,COR,7019,25,nocall
...,...,...,...,...,...
2395,54955_SSW_580,SSW,54955,580,nocall
2396,54955_SSW_585,SSW,54955,585,grycat
2397,54955_SSW_590,SSW,54955,590,grycat
2398,54955_SSW_595,SSW,54955,595,nocall


In [5]:
# Build one row per soundscape file from its file name, which has the form
# "<audio_id>_<site>_<yyyymmdd>.<ext>".
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop),
# and the np.int alias was removed in NumPy 1.24, so plain int() is used.
soundscape_dir = "/app/_data/train_soundscapes/"
soundscape_records = []
for file_name in sorted(os.listdir(soundscape_dir)):
    name_parts = os.path.splitext(file_name)[0].split("_")
    soundscape_records.append(
        {
            "audio_id": int(name_parts[0]),
            "date": name_parts[-1],
            "file_path": os.path.join(soundscape_dir, file_name),
        }
    )
# Explicit columns keep the later merge on "audio_id" valid even if the
# folder is empty.
ss_df = pd.DataFrame(soundscape_records, columns=["audio_id", "date", "file_path"])

In [6]:
# Attach the date and file path of each soundscape recording to its label rows.
# how="outer" keeps label rows without a matching file and files without labels.
train_soundscape_labels = pd.merge(
    train_soundscape_labels,
    ss_df,
    how="outer",
    on="audio_id",
)

In [7]:
# Convert the yyyymmdd token taken from the file name into a real datetime.
soundscape_date = pd.to_datetime(train_soundscape_labels["date"], format="%Y%m%d")
train_soundscape_labels["date"] = soundscape_date

In [8]:
# For every *txt file describing a test site, remember the site code (the part
# of the file name before the first "_") together with the last two lines of
# the file.
list_sites = []
for file_name in os.listdir("/app/_data/test_soundscapes/txt"):
    if not file_name.endswith("txt"):
        continue
    with open("/app/_data/test_soundscapes/txt/" + file_name, "r") as site_file:
        site_lines = site_file.readlines()
    list_sites.append([file_name.split("_")[0], site_lines[-2:]])

In [9]:
def _value_after_colon(line):
    """Return the first whitespace-separated token that follows the first ':'."""
    return line.split(":")[1].split("\n")[0].split()[0]


# One row per site, indexed by site code. Of the two stored lines, the second
# is read as the longitude and the first as the latitude.
coord_sites = pd.DataFrame()
for site_code, last_lines in list_sites:
    coord_sites.loc[site_code, "longitude"] = _value_after_colon(last_lines[1])
    coord_sites.loc[site_code, "latitude"] = _value_after_colon(last_lines[0])
# The values were stored as text; convert them before adding the string column.
coord_sites = coord_sites.astype("float")
coord_sites["site"] = coord_sites.index

In [10]:
coord_sites

Unnamed: 0,longitude,latitude,site
COR,-84.51,10.12,COR
SNE,-119.95,38.49,SNE
SSW,-76.45,42.47,SSW
COL,-75.85,5.57,COL


In [11]:
# Add the longitude/latitude of the recording site to every label row.
train_soundscape_labels = pd.merge(
    train_soundscape_labels,
    coord_sites,
    how="left",
    on="site",
)

In [12]:
# Split the recording date into separate month and year columns.
soundscape_dt = train_soundscape_labels["date"].dt
train_soundscape_labels["year"] = soundscape_dt.year
train_soundscape_labels["month"] = soundscape_dt.month
# Keep the original column order (month before year).
_cols = [c for c in train_soundscape_labels.columns if c not in ("month", "year")]
train_soundscape_labels = train_soundscape_labels[_cols + ["month", "year"]]

In [13]:
# Split the space-separated `birds` string once per row: the first token is
# the primary label, the remaining tokens (if any) are the secondary labels.
# The vectorised form replaces a per-row .loc loop and guarantees that the
# `secondary_labels` column exists even when no row lists more than one bird.
bird_tokens = train_soundscape_labels["birds"].str.split()
train_soundscape_labels["primary_label"] = bird_tokens.str[0]
# Rows with a single bird produce an empty string here; store NaN instead so
# "no secondary labels" is represented as a missing value.
train_soundscape_labels["secondary_labels"] = (
    bird_tokens.str[1:].str.join(" ").replace("", np.nan)
)

In [14]:
# NOTE(review): this writes the soundscape label table to a file named
# train_metadata.csv, which is easy to confuse with the `train_metadata` frame
# (loaded from train_metadata_orig.csv) -- confirm the file name is intended.
train_soundscape_labels.to_csv('/app/_data/train_metadata.csv', index=False)

In [26]:
# train_soundscape_labels = pd.read_csv(''/app/_data/train_metadata.csv'')

# train_metadata

In [27]:
train_metadata

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,secondary_labels1
0,acafly,['amegfi'],"['begging call', 'call', 'juvenile']",35.3860,-84.1250,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605,['amegfi']
1,acafly,[],['call'],9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209,[]
2,acafly,[],['call'],5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032,[]
3,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974,['whwbec1']
4,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981,['whwbec1']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62869,yetvir,[],"['adult', 'male', 'song']",30.2150,-97.6505,Vireo flavifrons,Yellow-throated Vireo,Caleb Helsel,2020-07-10,XC591680.ogg,Creative Commons Attribution-NonCommercial-Sha...,1.0,08:30,https://www.xeno-canto.org/591680,[]
62870,yetvir,[],"['life stage uncertain', 'sex uncertain', 'song']",42.3005,-72.5877,Vireo flavifrons,Yellow-throated Vireo,Christopher McPherson,2019-05-31,XC600085.ogg,Creative Commons Attribution-NonCommercial-Sha...,5.0,09:30,https://www.xeno-canto.org/600085,[]
62871,yetvir,"['amered', 'eawpew', 'norcar', 'reevir1']","['adult', 'male', 'song']",42.3005,-72.5877,Vireo flavifrons,Yellow-throated Vireo,Christopher McPherson,2020-06-02,XC602701.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.5,08:30,https://www.xeno-canto.org/602701,"['amered', 'eawpew', 'norcar', 'reevir1']"
62872,yetvir,[],['uncertain'],32.2357,-99.8811,Vireo flavifrons,Yellow-throated Vireo,Brad Banner,2019-04-27,XC614733.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,17:30,https://www.xeno-canto.org/614733,[]


In [28]:
# Dates are "yyyy-mm-dd" strings; take the pieces apart textually and store
# month and year as integers.
date_parts = train_metadata["date"].str.split("-")
train_metadata["month"] = date_parts.str[1].astype("int")
train_metadata["year"] = date_parts.str[0].astype("int")

In [29]:
# Both columns hold list literals as text (e.g. "['call', 'song']"); strip the
# list punctuation so only the plain tokens remain.
# regex=True is passed explicitly: the patterns are character classes, and
# since pandas 2.0 Series.str.replace treats the pattern as a literal string
# by default, which would silently leave the columns unchanged.
train_metadata["type"] = train_metadata["type"].str.replace(
    r"[\[\]\'\"\(\)?]", "", regex=True
)
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].str.replace(
    r"[\[\]\',]", "", regex=True
)

In [30]:
def get_audio_info(filepath):
    """Read basic properties of an audio file.

    Opens ``filepath`` with ``soundfile.SoundFile`` and returns a dict with
    the number of frames (``"frames"``), the sample rate (``"sr"``) and the
    length obtained by dividing frames by sample rate (``"duration"``).
    """
    with soundfile.SoundFile(filepath) as audio:
        sample_rate, n_frames = audio.samplerate, audio.frames
    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [31]:
# Build the path of every short recording (AUDIO_PATH/<primary_label>/<filename>)
# and read its audio properties. The results are collected first and assigned
# as whole columns, which avoids four scalar .loc writes per row.
file_paths = [
    os.path.join(AUDIO_PATH, bird, record)
    for bird, record in zip(train_metadata["primary_label"], train_metadata["filename"])
]
audio_infos = pd.DataFrame(
    [get_audio_info(file_path) for file_path in file_paths],
    index=train_metadata.index,
    columns=["frames", "sr", "duration"],
)
train_metadata["file_path"] = file_paths
for info_column in ("frames", "sr", "duration"):
    # Stored as float64, matching what element-wise .loc assignment produced.
    train_metadata[info_column] = audio_infos[info_column].astype("float64")

In [32]:
# Show every secondary_labels value that mentions "rocpig1".
mentions_rocpig1 = train_metadata["secondary_labels"].str.contains("rocpig1")
train_metadata.loc[mentions_rocpig1, "secondary_labels"].values

array(['rocpig1', 'rocpig1', 'rocpig1', 'rocpig1 solsan whtdov',
       'rocpig1 grtgra', 'rocpig1', 'rocpig1', 'rocpig1',
       'rewbla rocpig1 cangoo saypho killde amerob'], dtype=object)

In [33]:
# Rename the label "rocpig1" to "rocpig" wherever it appears as a whole token.
# Series.replace only matches entire cell values, so it misses strings that
# list several birds; a word-boundary regex covers single- and multi-label
# values alike.
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].str.replace(
    r"\brocpig1\b", "rocpig", regex=True
)

In [34]:
# Series.replace matches whole cell values, so each multi-label string that
# contains "rocpig1" is mapped explicitly to its "rocpig" counterpart.
train_metadata["secondary_labels"] = train_metadata["secondary_labels"].replace(
    {
        "rocpig1 solsan whtdov": "rocpig solsan whtdov",
        "rocpig1 grtgra": "rocpig grtgra",
        "rewbla rocpig1 cangoo saypho killde amerob": "rewbla rocpig cangoo saypho killde amerob",
    }
)

In [35]:
# Copy of secondary_labels in which an empty string is stored as NaN.
train_metadata['secondary_labels1'] = train_metadata['secondary_labels'].replace(
    {'': np.nan}
)

In [36]:
train_metadata

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,...,rating,time,url,secondary_labels1,month,year,file_path,frames,sr,duration
0,acafly,amegfi,"begging call, call, juvenile",35.3860,-84.1250,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,...,2.5,09:30,https://www.xeno-canto.org/109605,amegfi,8,2012,/app/_data/train_short_audio/acafly/XC109605.ogg,2037586.0,32000.0,63.674563
1,acafly,,call,9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,...,3.0,?,https://www.xeno-canto.org/11209,,12,2000,/app/_data/train_short_audio/acafly/XC11209.ogg,532933.0,32000.0,16.654156
2,acafly,,call,5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,...,3.0,15:20,https://www.xeno-canto.org/127032,,1,2012,/app/_data/train_short_audio/acafly/XC127032.ogg,1508450.0,32000.0,47.139063
3,acafly,whwbec1,call,4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,...,3.5,07:50,https://www.xeno-canto.org/129974,whwbec1,6,2009,/app/_data/train_short_audio/acafly/XC129974.ogg,450177.0,32000.0,14.068031
4,acafly,whwbec1,call,4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,...,3.5,07:50,https://www.xeno-canto.org/129981,whwbec1,6,2009,/app/_data/train_short_audio/acafly/XC129981.ogg,1301142.0,32000.0,40.660688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62869,yetvir,,"adult, male, song",30.2150,-97.6505,Vireo flavifrons,Yellow-throated Vireo,Caleb Helsel,2020-07-10,XC591680.ogg,...,1.0,08:30,https://www.xeno-canto.org/591680,,7,2020,/app/_data/train_short_audio/yetvir/XC591680.ogg,1424288.0,32000.0,44.509000
62870,yetvir,,"life stage uncertain, sex uncertain, song",42.3005,-72.5877,Vireo flavifrons,Yellow-throated Vireo,Christopher McPherson,2019-05-31,XC600085.ogg,...,5.0,09:30,https://www.xeno-canto.org/600085,,5,2019,/app/_data/train_short_audio/yetvir/XC600085.ogg,2510240.0,32000.0,78.445000
62871,yetvir,amered eawpew norcar reevir1,"adult, male, song",42.3005,-72.5877,Vireo flavifrons,Yellow-throated Vireo,Christopher McPherson,2020-06-02,XC602701.ogg,...,4.5,08:30,https://www.xeno-canto.org/602701,amered eawpew norcar reevir1,6,2020,/app/_data/train_short_audio/yetvir/XC602701.ogg,3259808.0,32000.0,101.869000
62872,yetvir,,uncertain,32.2357,-99.8811,Vireo flavifrons,Yellow-throated Vireo,Brad Banner,2019-04-27,XC614733.ogg,...,4.0,17:30,https://www.xeno-canto.org/614733,,4,2019,/app/_data/train_short_audio/yetvir/XC614733.ogg,525410.0,32000.0,16.419062


In [37]:
train_metadata.to_csv("/app/_data/train_metadata_full.csv", index=False)

In [38]:
# Keep only the columns that are carried into the combined table.
metadata_columns = (
    "primary_label secondary_labels latitude longitude date filename "
    "rating month year file_path frames duration"
).split()
train_metadata = train_metadata[metadata_columns]

# concat

In [39]:
# Keep only the columns that are carried into the combined table.
# .copy() makes the result an independent frame, so the columns added to it
# afterwards are not written to a slice of the original (SettingWithCopy).
train_soundscape_labels = train_soundscape_labels[
    [
        "primary_label",
        "secondary_labels",
        "row_id",
        "date",
        "file_path",
        "longitude",
        "latitude",
        "month",
        "year",
    ]
].copy()

In [40]:
# File name = last component of the path. The .str accessor leaves missing
# paths as NaN instead of raising on a non-string value.
train_soundscape_labels["filename"] = (
    train_soundscape_labels["file_path"].str.split("/").str[-1]
)
# Constant rating assigned to every soundscape row.
train_soundscape_labels["rating"] = 6
# Frames in one labelled window, derived from the config constants
# (32000 * 5 with the current settings) instead of repeating the literals.
train_soundscape_labels["frames"] = SAMPLE_RATE * SIGNAL_LENGTH
# NOTE(review): 600 looks like the length of the whole recording in seconds,
# whereas `frames` above describes a single window -- confirm this is intended.
train_soundscape_labels["duration"] = 600

In [41]:
train_soundscape_labels.to_csv(
    "/app/_data/train_soundscape_labels_full.csv", index=False
)

In [42]:
# Stack the short-recording metadata on top of the soundscape label rows.
# train_metadata stores `date` as "yyyy-mm-dd" text while the soundscape table
# holds datetimes; format the latter the same way so the combined column has a
# single representation instead of mixing strings and Timestamps.
soundscape_rows = train_soundscape_labels.assign(
    date=pd.to_datetime(train_soundscape_labels["date"]).dt.strftime("%Y-%m-%d")
)
all_audio = pd.concat(
    [train_metadata, soundscape_rows], axis=0, ignore_index=True
)

In [43]:
# all_audio1 = pd.read_csv("/app/_data/all_audio.csv")

In [44]:
all_audio

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,frames,duration,row_id
0,acafly,amegfi,35.3860,-84.1250,2012-08-12,XC109605.ogg,2.5,8,2012,/app/_data/train_short_audio/acafly/XC109605.ogg,2037586.0,63.674563,
1,acafly,,9.1334,-79.6501,2000-12-26,XC11209.ogg,3.0,12,2000,/app/_data/train_short_audio/acafly/XC11209.ogg,532933.0,16.654156,
2,acafly,,5.7813,-75.7452,2012-01-10,XC127032.ogg,3.0,1,2012,/app/_data/train_short_audio/acafly/XC127032.ogg,1508450.0,47.139063,
3,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129974.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129974.ogg,450177.0,14.068031,
4,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129981.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129981.ogg,1301142.0,40.660688,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65269,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,160000.0,600.000000,54955_SSW_580
65270,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,160000.0,600.000000,54955_SSW_585
65271,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,160000.0,600.000000,54955_SSW_590
65272,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,160000.0,600.000000,54955_SSW_595


In [45]:
# all_audio["secondary_labels"] = all_audio["secondary_labels"].fillna("")

In [46]:
# for i in all_audio.index.tolist():
#     pr_label = all_audio.loc[i, "primary_label"].split(' ')
#     if len(pr_label)>1:
#         all_audio.loc[i, "primary_label"] = pr_label[0]
#         all_audio.loc[i, "secondary_labels"] = all_audio.loc[i, "secondary_labels"]+' '.join(pr_label[1:])

In [47]:
# Represent "no secondary labels" as NaN rather than an empty string.
all_audio["secondary_labels"] = all_audio["secondary_labels"].replace({'': np.nan})

In [48]:
# all_audio["secondary_labels"] = all_audio["secondary_labels"].fillna("nocall")
# Encode month and longitude as points on a circle (sin/cos pairs) and rescale
# latitude from [-90, 90] to [0, 1].
month_angle = 2 * np.pi * all_audio["month"] / 12
longitude_angle = 2 * np.pi * (all_audio["longitude"]) / 360
all_audio["sin_month"] = np.sin(month_angle)
all_audio["cos_month"] = np.cos(month_angle)
all_audio["sin_longitude"] = np.sin(longitude_angle)
all_audio["cos_longitude"] = np.cos(longitude_angle)
all_audio["norm_latitude"] = (all_audio["latitude"] + 90) / 180

In [49]:
# Map every primary label to an integer id, numbered in alphabetical order.
dict_birds = {
    bird: label_id
    for label_id, bird in enumerate(sorted(all_audio["primary_label"].unique()))
}
# Series.map is the direct dictionary lookup; Series.replace with a large dict
# is much slower and relies on implicit dtype downcasting of the result.
all_audio["label_id"] = all_audio["primary_label"].map(dict_birds)

In [50]:
# Weight of a class = 1 / number of rows carrying that primary label.
dict_weights = (1 / all_audio["primary_label"].value_counts()).to_dict()
# Series.map performs the per-row dictionary lookup directly.
all_audio["class_weights"] = all_audio["primary_label"].map(dict_weights)
# Number of whole SIGNAL_LENGTH-second windows that fit into the duration.
all_audio["num_intervals"] = all_audio["duration"] // SIGNAL_LENGTH
# Every row starts out describing the first window of its recording.
all_audio["start_sec"] = 0
all_audio["end_sec"] = SIGNAL_LENGTH

In [51]:
def _secondary_label_ids(labels):
    """Translate a space-separated label string into space-separated ids.

    Missing values (anything that is not a string) stay NaN. A label that is
    not a key of ``dict_birds`` raises ``KeyError``.
    """
    if not isinstance(labels, str):
        return np.nan
    return " ".join(str(dict_birds[bird]) for bird in labels.split())


# Applied column-wise so that `secondary_labels_id` always exists, even when
# no row has secondary labels, and without a per-row .loc assignment.
all_audio["secondary_labels_id"] = all_audio["secondary_labels"].map(
    _secondary_label_ids
)

In [52]:
# file_json = open("/app/_data/dict_birds.json", "w")
# json.dump(dict_birds, file_json)
# file_json.close()

In [53]:
all_audio.sample()

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,...,cos_month,sin_longitude,cos_longitude,norm_latitude,label_id,class_weights,num_intervals,start_sec,end_sec,secondary_labels_id
56413,wesant1,barant1,4.351,-74.652,2020-03-24,XC561190.ogg,0.0,3,2020,/app/_data/train_short_audio/wesant1/XC561190.ogg,...,6.123234000000001e-17,-0.964336,0.264681,0.524172,354,0.008333,3.0,0,5,22


In [54]:
# Store the small-range numeric columns in compact dtypes.
compact_dtypes = {
    "start_sec": "float16",
    "end_sec": "float16",
    "rating": "float16",
    "year": "int16",
    "label_id": "int16",
}
all_audio = all_audio.astype(compact_dtypes)

In [55]:
# Remove the recordings listed in calls_to_drop and renumber the rows.
is_dropped_call = all_audio["filename"].isin(calls_to_drop)
all_audio = all_audio[~is_dropped_call].reset_index(drop=True)

In [56]:
all_audio.to_csv("/app/_data/all_audio_initial.csv", index=False)

In [57]:
all_audio

Unnamed: 0,primary_label,secondary_labels,latitude,longitude,date,filename,rating,month,year,file_path,...,cos_month,sin_longitude,cos_longitude,norm_latitude,label_id,class_weights,num_intervals,start_sec,end_sec,secondary_labels_id
0,acafly,amegfi,35.3860,-84.1250,2012-08-12,XC109605.ogg,2.5,8,2012,/app/_data/train_short_audio/acafly/XC109605.ogg,...,-0.500000,-0.994748,0.102359,0.696589,0,0.007576,12.0,0.0,5.0,5
1,acafly,,9.1334,-79.6501,2000-12-26,XC11209.ogg,3.0,12,2000,/app/_data/train_short_audio/acafly/XC11209.ogg,...,1.000000,-0.983729,0.179659,0.550741,0,0.007576,3.0,0.0,5.0,
2,acafly,,5.7813,-75.7452,2012-01-10,XC127032.ogg,3.0,1,2012,/app/_data/train_short_audio/acafly/XC127032.ogg,...,0.866025,-0.969210,0.246234,0.532118,0,0.007576,9.0,0.0,5.0,
3,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129974.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129974.ogg,...,-1.000000,-0.968706,0.248211,0.525954,0,0.007576,2.0,0.0,5.0,371
4,acafly,whwbec1,4.6717,-75.6283,2009-06-19,XC129981.ogg,3.5,6,2009,/app/_data/train_short_audio/acafly/XC129981.ogg,...,-1.000000,-0.968706,0.248211,0.525954,0,0.007576,8.0,0.0,5.0,371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65267,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,-1.000000,-0.972166,0.234294,0.735944,220,0.000654,120.0,0.0,5.0,
65268,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,-1.000000,-0.972166,0.234294,0.735944,168,0.003040,120.0,0.0,5.0,
65269,grycat,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,-1.000000,-0.972166,0.234294,0.735944,168,0.003040,120.0,0.0,5.0,
65270,nocall,,42.4700,-76.4500,2017-06-17 00:00:00,54955_SSW_20170617.ogg,6.0,6,2017,/app/_data/train_soundscapes/54955_SSW_2017061...,...,-1.000000,-0.972166,0.234294,0.735944,220,0.000654,120.0,0.0,5.0,


## Dates sites

In [27]:
import pandas as pd

In [28]:
dates = pd.read_csv('/app/_data/test_soundscapes/test_set_recording_dates.csv')

In [29]:
# Parse the yyyymmdd recording date and derive month and year from it.
recording_day = pd.to_datetime(dates['date'], format='%Y%m%d')
dates['date'] = recording_day
dates['month'] = recording_day.dt.month
dates['year'] = recording_day.dt.year

In [30]:
# Per-site summary statistics (count, mean, std, min, quartiles, max) of the
# recording month and year. NOTE(review): "monts" is presumably a typo for
# "months"; the name is kept because a later cell reads it.
monts_sites = dates.groupby('site')[['month', 'year']].describe()

In [31]:
monts_sites

Unnamed: 0_level_0,month,month,month,month,month,month,month,month,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
site,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
COL,4.0,10.25,0.957427,9.0,9.75,10.5,11.0,11.0,4.0,2019.0,0.0,2019.0,2019.0,2019.0,2019.0,2019.0
COR,5.0,9.4,0.547723,9.0,9.0,9.0,10.0,10.0,5.0,2019.0,0.0,2019.0,2019.0,2019.0,2019.0,2019.0
SNE,4.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,4.0,2018.0,0.0,2018.0,2018.0,2018.0,2018.0,2018.0
SSW,12.0,5.5,2.067058,2.0,3.75,6.0,7.0,8.0,12.0,2017.0,0.0,2017.0,2017.0,2017.0,2017.0,2017.0


In [32]:
# One row per site with the first/last recording month and year, as integers.
month_range = monts_sites['month'][['min', 'max']]
year_range = monts_sites['year'][['min', 'max']]
dates_sites = month_range.join(
    year_range, lsuffix='_month', rsuffix='_year'
).astype('int')

In [33]:
dates_sites

Unnamed: 0_level_0,min_month,max_month,min_year,max_year
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
COL,9,11,2019,2019
COR,9,10,2019,2019
SNE,5,5,2018,2018
SSW,2,8,2017,2017


In [34]:
# Build a yyyymmdd string for the 15th of the first recording month.
# zfill(2) pads single-digit months; a hard-coded "0" prefix would produce a
# nine-character string for months 10-12.
dates_sites['min_date'] = (
    dates_sites['min_year'].astype('str')
    + dates_sites['min_month'].astype('str').str.zfill(2)
    + '15'
)

In [35]:
# Build a yyyymmdd string for the 15th of the last recording month.
# Uses max_year (not min_year) to match the column being built, and pads the
# month to two digits: an unpadded month gives ambiguous strings such as
# "2017115", which parses as 5 November rather than 15 January.
dates_sites['max_date'] = (
    dates_sites['max_year'].astype('str')
    + dates_sites['max_month'].astype('str').str.zfill(2)
    + '15'
)

In [36]:
# Turn both yyyymmdd strings into datetimes.
for date_column in ('max_date', 'min_date'):
    dates_sites[date_column] = pd.to_datetime(
        dates_sites[date_column], format='%Y%m%d'
    )


In [40]:
dates_sites.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, COL to SSW
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   min_month  4 non-null      int64         
 1   max_month  4 non-null      int64         
 2   min_year   4 non-null      int64         
 3   max_year   4 non-null      int64         
 4   min_date   4 non-null      datetime64[ns]
 5   max_date   4 non-null      datetime64[ns]
dtypes: datetime64[ns](2), int64(4)
memory usage: 396.0+ bytes


In [38]:
# The index is written too (to_csv default), so `site` becomes the first CSV
# column and comes back as a regular column when the file is read again.
dates_sites.to_csv('/app/_data/dates_sites.csv')

In [7]:
dates_sites

Unnamed: 0_level_0,min_month,max_month,min_year,max_year
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
COL,9,11,2019,2019
COR,9,10,2019,2019
SNE,5,5,2018,2018
SSW,2,8,2017,2017


In [27]:
def choose_ids(distance_delta=500, months_delta=2, years_delta=5):
    """Return the recordings that lie within ``distance_delta`` of any site.

    Reads ``/app/_data/distances.csv`` and keeps every row whose value in at
    least one of the ``dist_COR``, ``dist_SNE``, ``dist_SSW`` or ``dist_COL``
    columns is less than or equal to ``distance_delta``.

    Parameters
    ----------
    distance_delta : float
        Inclusive upper bound applied to the ``dist_*`` columns (same unit as
        those columns).
    months_delta, years_delta : int
        Accepted for interface compatibility but currently unused: filtering
        by recording date is not implemented yet.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``distances.csv`` with a fresh 0..n-1 index.
    """
    import pandas as pd

    site_distance_columns = ["dist_COR", "dist_SNE", "dist_SSW", "dist_COL"]
    distances_df = pd.read_csv("/app/_data/distances.csv")
    # A row qualifies as soon as it is close enough to one of the sites.
    within_reach = (distances_df[site_distance_columns] <= distance_delta).any(axis=1)
    return distances_df[within_reach].reset_index(drop=True)

In [34]:
distances_df = pd.read_csv("/app/_data/distances.csv")
distances_df.shape

(65272, 7)

In [37]:
choose_ids(distance_delta=600)

Unnamed: 0,dist_COR,dist_SNE,dist_SSW,dist_COL,filename,month,year
0,338.317387,3212.209334,2303.470628,357.643978,XC11209.ogg,12,2000
1,670.282828,3564.116428,2525.970162,16.212663,XC127032.ogg,1,2012
2,714.907051,3624.555047,2602.335865,63.586812,XC129974.ogg,6,2009
3,714.907051,3624.555047,2602.335865,63.586812,XC129981.ogg,6,2009
4,714.907051,3624.555047,2602.335865,63.586812,XC130056.ogg,11,2007
...,...,...,...,...,...,...,...
30340,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30341,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30342,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
30343,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017


In [14]:
# Reload the per-site date ranges, parsing the two date columns (positions 5
# and 6 in the file) by name.
dates_sites = pd.read_csv(
    '/app/_data/dates_sites.csv',
    parse_dates=['min_date', 'max_date'],
)

In [15]:
dates_sites

Unnamed: 0,site,min_month,max_month,min_year,max_year,min_date,max_date
0,COL,9,11,2019,2019,2019-09-15,2019-11-15
1,COR,9,10,2019,2019,2019-09-15,2019-10-15
2,SNE,5,5,2018,2018,2018-05-15,2018-05-15
3,SSW,2,8,2017,2017,2017-02-15,2017-08-15


In [26]:
import datetime

In [None]:
datetime.timedelta()

In [25]:
# dates_sites['max_date']+dates_sites['max_date']
dates_sites['max_date']+pd.Timedelta(90,'day')


0   2020-02-13
1   2020-01-13
2   2018-08-13
3   2017-11-13
Name: max_date, dtype: datetime64[ns]

In [101]:
# Month numbers 1..12.
mon = np.arange(1, 13)

In [102]:
mon[11]-3

9

In [112]:
# For every month, print the months `d` steps before and after it, wrapping
# around the year boundary (e.g. January -> November and March).
d = 2  # half-width of the window, in months
for month in mon:
    # Months are 1-based: shift to 0-based, wrap with modulo 12, shift back.
    start = (month - d - 1) % 12 + 1
    end = (month + d - 1) % 12 + 1
    print(month, start, end)


1 11 3
2 12 4
3 1 5
4 2 6
5 3 7
6 4 8
7 5 9
8 6 10
9 7 11
10 8 12
11 9 1
12 10 2


In [75]:
choose_ids()

Unnamed: 0,dist_COR,dist_SNE,dist_SSW,dist_COL,filename,month,year
1,338.317387,3212.209334,2303.470628,357.643978,XC11209.ogg,12,2000
2,670.282828,3564.116428,2525.970162,16.212663,XC127032.ogg,1,2012
3,714.907051,3624.555047,2602.335865,63.586812,XC129974.ogg,6,2009
4,714.907051,3624.555047,2602.335865,63.586812,XC129981.ogg,6,2009
5,714.907051,3624.555047,2602.335865,63.586812,XC130056.ogg,11,2007
...,...,...,...,...,...,...,...
65267,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65268,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65269,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
65270,2280.324735,2283.094690,0.000000,2540.386312,54955_SSW_20170617.ogg,6,2017
