In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from src.config import c

In [2]:
# load the label table (columns used later: filename, birds, start_sec, end_sec)
df = pd.read_csv(filepath_or_buffer="/app/res/n_nocall_labels.csv")

In [3]:
# load the competition metadata and keep only rows whose file appears in the label table
train_metadata = pd.read_csv("/app/_data/competition_data/train_metadata.csv").loc[
    lambda meta: meta["filename"].isin(df["filename"].unique())
]

In [4]:
# keep rows labelled "nocall" whose filename starts with "XC"
# (presumably the short recordings - confirm), then order them by
# filename and start time with a fresh 0..n-1 index
df = (
    df.loc[lambda lab: lab["filename"].str.startswith("XC") & (lab["birds"] == "nocall")]
    .sort_values(by=["filename", "start_sec"])
    .reset_index(drop=True)
)

In [5]:
# sample intervals
#
# For every labelled "nocall" interval, emit windows of LEN_S seconds whose
# start times step through the interval every STRIDE_S seconds, and attach
# the per-file metadata (date, coordinates, rating) to each window.

STRIDE_S = 5
LEN_S = c["AUDIO_TARGET_LEN_S"]

# Rows are collected as plain dicts and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the whole frame on every iteration.
records = []

for ix, row in tqdm(df.iterrows(), total=df.shape[0]):
    # NOTE(review): the stop value is exclusive, so an interval exactly LEN_S
    # long produces no window, and the window ending exactly at end_sec is
    # never emitted - confirm this is intended.
    window_starts = np.arange(row["start_sec"], row["end_sec"] - LEN_S, STRIDE_S)
    if len(window_starts) == 0:
        # nothing to sample from this interval; skip the metadata lookup too
        continue

    # The metadata depends only on the file, not on the window, so look it
    # up once per interval instead of once per window.
    file_info = train_metadata[train_metadata.filename == row.filename].iloc[0]
    date = file_info["date"]

    for start_s in window_starts:
        records.append(
            {
                "filename": row["filename"],
                "_from_s": start_s,
                "_to_s": start_s + LEN_S,
                "_primary_labels": "nocall",
                "__date__": date,
                # NOTE(review): year stays a string while month is cast to
                # int, exactly as before - confirm downstream expects this.
                "_year": date[:4],
                "_month": int(date[5:7]),
                "latitude": file_info["latitude"],
                "longitude": file_info["longitude"],
                "rating": file_info["rating"],
                "_source": "n_nocall_labels",
            }
        )

# an empty record list yields an empty frame, same as the previous starting value
out_df = pd.DataFrame(records)

100%|██████████| 518/518 [00:08<00:00, 60.36it/s]


In [6]:
# make sure every standard dataset column exists; absent ones are filled with ""
standard_cols = c["DATASET_COLS"]
for name in standard_cols:
    if name in out_df:
        continue
    out_df[name] = ""

# keep only the standard columns, in their standard order, and write the CSV
out_df = out_df[standard_cols]
out_df.to_csv("/app/_work/n_nocall.csv", index=False)