In [1]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
from py7zr import unpack_7zarchive
import shutil
from datasets import Dataset
import librosa
import soundfile as sf
import pandas as pd

In [24]:
# Create the Kaggle API client and authenticate it before any download call.
# NOTE(review): presumably picks up credentials from the local Kaggle
# configuration (kaggle.json or environment variables) — confirm.
api = KaggleApi()
api.authenticate()

In [25]:
# Folder that receives the raw competition archive; created on first run and
# left untouched when it already exists.
download_path = "./data_raw"
os.makedirs(download_path, exist_ok=True)

In [26]:
# Download only the training archive (train.7z) of the competition into download_path.
api.competition_download_file('tensorflow-speech-recognition-challenge', path=download_path, file_name = 'train.7z')

Downloading train.7z to ./data_raw


100%|██████████| 1.04G/1.04G [00:03<00:00, 365MB/s]







In [3]:
# shutil has no built-in 7z support, so py7zr's extractor is plugged in as the
# handler for '.7z' files. Registering a name or extension that is already
# registered raises shutil.RegistryError, which used to break this cell every
# time it was re-run in a live kernel; an existing registration is fine to reuse.
try:
    shutil.register_unpack_format('7zip', ['.7z'], unpack_7zarchive)
except shutil.RegistryError:
    pass  # already registered earlier in this kernel session

# Extract the archive next to itself.
shutil.unpack_archive('./data_raw/train.7z', './data_raw/')

In [34]:
# Target classes of the task. A label's position in this list is its integer id.
final_labels = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]
# Lookup tables in both directions: id -> name and name -> id.
idname = dict(enumerate(final_labels))
nameid = dict(zip(idname.values(), idname.keys()))

In [35]:
# Build the manifest of every .wav file in the extracted training set:
# one dict per file with its path relative to data_dir, its class name and id.
data_dir = './data_raw/train/audio'

# Folders inside data_dir that this notebook generates itself. The
# silence-chunking cell writes its 1-second clips to `processed_silence`, so
# without this exclusion a re-run of this cell would pick those clips up and
# label them 'unknown'.
generated_dirs = {'processed_silence'}

data = []
# os.listdir returns entries in arbitrary order; sorting keeps the manifest
# (and the CSVs derived from it) identical across runs and machines.
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    if folder in generated_dirs or not os.path.isdir(folder_path):
        continue

    label = folder
    if label == '_background_noise_':
        # Background recordings stand in for the 'silence' class.
        label_name = 'silence'
    elif label in final_labels:
        label_name = label
    else:
        # Any word outside the target classes is collapsed into 'unknown'.
        label_name = 'unknown'
    label_id = nameid[label_name]

    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith(".wav"):
            # Forward slashes so paths match the official split lists on every OS.
            relative_path = os.path.join(label, file_name).replace("\\", "/")
            data.append({
                "audio_path": relative_path,
                "label": label_name,
                "label_id": label_id
            })

In [36]:
def read_list(filepath):
    """Read a split list file into a set of paths.

    Each non-blank line is stripped of surrounding whitespace and has its
    backslashes replaced by forward slashes; blank lines are skipped.
    """
    entries = set()
    with open(filepath, "r") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.add(cleaned.replace("\\", "/"))
    return entries

# The official list files name the validation and test clips; every clip that
# appears in neither list is training data.
val_list = read_list("./data_raw/train/validation_list.txt")
test_list = read_list("./data_raw/train/testing_list.txt")

train_data, val_data, test_data = [], [], []

for example in data:
    path = example["audio_path"]
    # Validation membership is checked first, exactly as the lists are prioritised.
    if path in val_list:
        bucket = val_data
    elif path in test_list:
        bucket = test_data
    else:
        bucket = train_data
    bucket.append(example)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

Train: 51492, Validation: 6798, Test: 6835


In [37]:
# Destination folder for the per-split CSV manifests; created if missing.
output_dir = "./data"
os.makedirs(output_dir, exist_ok=True)

def save_dataset(dataset, name, output_dir):
    """Write `dataset` to `<output_dir>/<name>.csv`.

    The dataset is converted with its `to_pandas()` method and saved without
    the index column. Returns None.
    """
    csv_file = os.path.join(output_dir, f"{name}.csv")
    dataset.to_pandas().to_csv(csv_file, index=False)

# Persist each split under its own file name inside output_dir.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("validation", val_dataset),
    ("test", test_dataset),
):
    save_dataset(split_dataset, split_name, output_dir)

In [38]:
# Cut every background-noise recording into fixed-length clips so that the
# 'silence' class has examples of the same duration as the spoken commands.
# Each clip is written to disk and recorded as one row of silence_df.
#
# This cell deliberately uses its own names (silence_dir, silence_rows) instead
# of reusing `output_dir` and `data`: those hold the CSV folder and the full
# file manifest from earlier cells, and overwriting them made a later re-run of
# the split/save cells operate on the wrong values.
input_dir = './data_raw/train/audio/_background_noise_'
silence_dir = './data_raw/train/audio/processed_silence'
csv_path = './data/silence_dataset.csv'  # defined for reference; nothing in this cell writes it
sr = 16000
chunk_duration = 1.0
chunk_samples = int(sr * chunk_duration)
silence_label_id = 10  # position of 'silence' in final_labels

os.makedirs(silence_dir, exist_ok=True)

silence_rows = []

# Sorted so the row order of silence_df does not depend on the filesystem.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.wav'):
        continue

    filepath = os.path.join(input_dir, filename)
    signal, _ = librosa.load(filepath, sr=sr)

    # Only whole clips are kept; a trailing remainder shorter than one clip is dropped.
    num_chunks = len(signal) // chunk_samples
    base_name = os.path.splitext(filename)[0]

    for i in range(num_chunks):
        chunk = signal[i * chunk_samples : (i + 1) * chunk_samples]
        out_filename = f"{base_name}_chunk_{i}.wav"
        out_path = os.path.join(silence_dir, out_filename)

        sf.write(out_path, chunk, sr)
        silence_rows.append({
            # Relative to ./data_raw/train/audio, like every other audio_path.
            "audio_path": f'processed_silence/{out_filename}',
            "label": 'silence',
            "label_id": silence_label_id
        })

silence_df = pd.DataFrame(silence_rows)

In [39]:
train_csv = "./data/train.csv"
val_csv = "./data/validation.csv"
test_csv = "./data/test.csv"

# Tag each clip with the recording it was cut from: everything in the path
# that precedes the first "_chunk_" marker.
silence_df["group"] = silence_df["audio_path"].map(lambda p: p.partition("_chunk_")[0])

train_rows, val_rows, test_rows = [], [], []

# Per recording: shuffle with a fixed seed, then cut 80% / 10% / remainder so
# every recording contributes to all three splits.
for _, recording_df in silence_df.groupby("group"):
    shuffled = recording_df.sample(frac=1, random_state=42).drop(columns="group")

    total = len(shuffled)
    train_end = int(0.8 * total)
    val_end = train_end + int(0.1 * total)

    train_rows.append(shuffled.iloc[:train_end])
    val_rows.append(shuffled.iloc[train_end:val_end])
    test_rows.append(shuffled.iloc[val_end:])

new_train_df = pd.concat(train_rows)
new_val_df = pd.concat(val_rows)
new_test_df = pd.concat(test_rows)

# Reload the split manifests written earlier so the silence clips can be merged in.
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)
test_df = pd.read_csv(test_csv)

def remove_background_noise(df, prefixes=("_background_noise_", "processed_silence/")):
    """Return `df` without the rows whose `audio_path` starts with any of `prefixes`.

    By default this drops the raw background-noise recordings and also any
    silence clips that an earlier run of this cell already appended. The
    surrounding cell reads and rewrites the same CSV files, so without the
    second prefix every re-run would append the silence clips once more.

    Args:
        df: DataFrame with an `audio_path` column.
        prefixes: Path prefixes to drop. Pass `("_background_noise_",)` to
            filter only the raw background recordings.

    Returns:
        The filtered DataFrame; the original index is kept.
    """
    # astype(str) keeps the prefix test well-defined for missing values.
    paths = df["audio_path"].astype(str)
    drop = pd.Series(False, index=df.index)
    for prefix in prefixes:
        drop |= paths.str.startswith(prefix)
    return df[~drop]

# For each split: filter the reloaded manifest, then append that split's share
# of the silence clips with a fresh 0..n-1 index.
train_df = pd.concat([remove_background_noise(train_df), new_train_df], ignore_index=True)
val_df = pd.concat([remove_background_noise(val_df), new_val_df], ignore_index=True)
test_df = pd.concat([remove_background_noise(test_df), new_test_df], ignore_index=True)

# Overwrite the split CSVs in place with the merged manifests.
for merged_df, target_csv in (
    (train_df, train_csv),
    (val_df, val_csv),
    (test_df, test_csv),
):
    merged_df.to_csv(target_csv, index=False)

### Wczytanie danych

In [None]:
from AudioDataset import AudioDataset
import torch

# Dataset over the training manifest written by the preprocessing cells.
# NOTE(review): AudioDataset lives in a local module that is not visible here;
# presumably it joins audio_dir with each row's audio_path — confirm.
dataset = AudioDataset(
    csv_path="./data/train.csv",
    audio_dir="./data_raw/train/audio"
)

In [None]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# Smoke test: fetch a single batch and show the shapes of its features and labels.
# Nothing is printed when the loader yields no batches.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    features, labels = first_batch
    print(features.shape)
    print(labels.shape)