In [24]:
import os
# Select the GPU before torch is imported below: enumerate devices in PCI bus
# order and expose only the device with index 1 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from datasets import load_dataset, DatasetDict, concatenate_datasets, ClassLabel
from transformers import WhisperFeatureExtractor, AutoModelForAudioClassification
# WhisperModel, WhisperConfig, 
import torch
import random
# import torch.nn as nn
import numpy as np
from collections import Counter
from transformers import Trainer, TrainingArguments

def set_seed(seed_value=42):
    """Seed all random number generators used here and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA), stores the
    seed in the ``PYTHONHASHSEED`` environment variable and configures the
    cuDNN backend flags.

    Args:
        seed_value: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # relevant when several GPUs are visible
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # Deterministic cuDNN algorithms, with benchmark mode switched off.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


# Seed everything once with the default value
set_seed()

# load & preprocess

In [52]:
# Load every available split of the "xbgoose/dusha" dataset
ds_full = load_dataset("xbgoose/dusha")

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/37 [00:00<?, ?it/s]

In [53]:
# Inspect the loaded splits, their features and row counts
ds_full

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 150352
    })
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 14035
    })
})

In [54]:
# NOTE(review): unique_labels is not used in any of the cells visible here — confirm it is still needed.
unique_labels = list(set(ds_full['train']['emotion']))
class_label = ClassLabel(names=['neutral', 'angry', 'positive', 'sad', 'other']) # names are listed in this order to preserve the original label numbering
# Cast the "emotion" column of both splits to the ClassLabel feature defined above.
ds_full["train"] = ds_full["train"].cast_column("emotion", class_label)
ds_full["test"] = ds_full["test"].cast_column("emotion", class_label)

Casting the dataset:   0%|          | 0/150352 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14035 [00:00<?, ? examples/s]

In [55]:
# Display the ClassLabel feature to check the name order
class_label

ClassLabel(names=['neutral', 'angry', 'positive', 'sad', 'other'], id=None)

In [56]:
# Stratified 50/50 split of the original train set. Only the 'train' half is
# kept below; the other half (train['test']) is not used in this cell.
train = ds_full['train'].train_test_split(test_size=0.5, stratify_by_column="emotion", seed=42)
# Stratified 50/50 split of the original test set into validation and test halves.
val_test = ds_full['test'].train_test_split(test_size=0.5, stratify_by_column="emotion", seed=42)

# Assemble the three working splits.
ds = DatasetDict({
    "train": train['train'],
    "val": val_test['train'],
    "test": val_test['test'],
})
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 75176
    })
    val: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 7017
    })
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 7018
    })
})

In [57]:
# Sanity check: list the distinct label ids present in the train split
np.unique(ds['train']['emotion'])

array([0, 1, 2, 3, 4])

In [58]:
def get_label_dist(ds, split, label_field="emotion"):
    """Print how many examples of each label a split contains.

    Args:
        ds: Mapping from split name to a dataset that supports column access
            by name (``ds[split][label_field]`` must yield the labels).
        split: Name of the split to inspect; also used as the prefix of the
            printed line.
        label_field: Name of the label column. Defaults to ``"emotion"``.

    Returns:
        None. The result is printed as ``<split>: Counter({...})``.
    """
    label_counts = Counter(ds[split][label_field])
    print(split + ":", label_counts)

# Print the label distribution of every split
for split_name in ("train", "val", "test"):
    get_label_dist(ds, split_name)

train: Counter({0: 48908, 3: 10576, 1: 7837, 2: 7114, 4: 741})
val: Counter({0: 4517, 3: 1082, 1: 723, 2: 609, 4: 86})
test: Counter({0: 4517, 3: 1082, 1: 723, 2: 610, 4: 86})


In [59]:
# Make the dataset more balanced
def limit_class_samples(dataset_dict, split='train', label_field='label', class_limits=None, seed=42):
    """Cap the number of examples per class in one split.

    Args:
        dataset_dict: Mapping from split name to dataset. The dataset must
            support column access by name, ``select(indices)`` and
            ``shuffle(seed=...)``.
        split: Name of the split to rebalance.
        label_field: Name of the column holding the class labels.
        class_limits: Mapping ``{class_value: max_count}``. For every listed
            class only the first ``max_count`` rows (in the split's current
            order) are kept; classes that are not listed are kept in full.
            ``None`` or an empty mapping caps nothing.
        seed: Seed passed to ``shuffle`` for the final reordering.

    Returns:
        A shallow copy of ``dataset_dict`` (made with its ``copy()`` method)
        in which ``split`` is replaced by the capped and shuffled split. The
        input mapping itself is not modified.
    """
    limits = class_limits or {}
    split_dataset = dataset_dict[split]

    # One pass over the label column only: bucket row indices by capped class.
    # The other columns of each row are never accessed here.
    capped_rows = {class_value: [] for class_value in limits}
    uncapped_rows = []
    for row_idx, label in enumerate(split_dataset[label_field]):
        if label in capped_rows:
            capped_rows[label].append(row_idx)
        else:
            uncapped_rows.append(row_idx)

    # Capped classes come first, in the order given by class_limits, followed
    # by all rows of the remaining classes.
    kept_rows = []
    for class_value, max_count in limits.items():
        # max(..., 0) keeps a negative limit equivalent to "keep nothing".
        kept_rows.extend(capped_rows[class_value][:max(max_count, 0)])
    kept_rows.extend(uncapped_rows)

    new_split = split_dataset.select(kept_rows).shuffle(seed=seed)

    new_dataset_dict = dataset_dict.copy()
    new_dataset_dict[split] = new_split
    return new_dataset_dict

# Cap class 0 ('neutral') in every split; all other classes are kept in full
neutral_caps = {"train": 10600, "val": 1100, "test": 1100}

new_dataset = ds
for split_name, cap in neutral_caps.items():
    new_dataset = limit_class_samples(new_dataset, split=split_name, label_field='emotion', class_limits={0: cap})

for split_name in neutral_caps:
    get_label_dist(new_dataset, split_name)

Filter:   0%|          | 0/75176 [00:00<?, ? examples/s]

Filter:   0%|          | 0/75176 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7017 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7017 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7018 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7018 [00:00<?, ? examples/s]

train: Counter({0: 10600, 3: 10576, 1: 7837, 2: 7114, 4: 741})
val: Counter({0: 1100, 3: 1082, 1: 723, 2: 609, 4: 86})
test: Counter({0: 1100, 3: 1082, 1: 723, 2: 610, 4: 86})


In [65]:
# Wrap the capped splits in a DatasetDict again
ds = DatasetDict({split_name: new_dataset[split_name] for split_name in ("train", "val", "test")})
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 36868
    })
    val: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 3600
    })
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 3601
    })
})

In [66]:
# Upload the balanced train/val/test splits to the 'nixiieee/dusha_balanced' dataset repo
ds.push_to_hub('nixiieee/dusha_balanced')

Uploading the dataset shards:   0%|          | 0/11 [00:00<?, ?it/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3352 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Map:   0%|          | 0/3351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/686 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nixiieee/dusha_balanced/commit/94ef044a030bc8db41c4fcceb07b349f2fdb8fe0', commit_message='Upload dataset', commit_description='', oid='94ef044a030bc8db41c4fcceb07b349f2fdb8fe0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nixiieee/dusha_balanced', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nixiieee/dusha_balanced'), pr_revision=None, pr_num=None)