In [41]:
import torch
import pandas as pd

In [42]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device for GPU acceleration.")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device for GPU acceleration.")
else:
    device = torch.device("cpu")
    print("No GPU device found. Using CPU.")

Using CUDA device for GPU acceleration.


In [43]:
df = pd.read_csv("./data/train.csv")

In [44]:
class_counts = df['label'].value_counts()
print("Przed downsamplingiem:\n", df['label'].value_counts(normalize=True) * 100)

Przed downsamplingiem:
 label
unknown    63.603722
stop        3.638856
on          3.598317
go          3.592525
yes         3.590595
no          3.577082
right       3.575152
up          3.557778
down        3.555847
left        3.550056
off         3.550056
silence     0.610015
Name: proportion, dtype: float64


In [45]:
target_count = class_counts[class_counts.index != 'unknown'].max()

In [46]:
unknown_df = df[df['label'] == 'unknown']
other_df = df[df['label'] != 'unknown']

unknown_df['sub_label'] = unknown_df['audio_path'].apply(lambda x: x.split('/')[0])
unique_groups = unknown_df['sub_label'].unique()
n_groups = len(unique_groups)
samples_per_group = target_count // n_groups

unknown_df_balanced_samples = (
    unknown_df
    .groupby('sub_label')
    .apply(lambda x: x.sample(n=min(samples_per_group, len(x)), random_state=42))
    .reset_index(drop=True)
)

unknown_df_balanced_samples.drop(columns=['sub_label'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unknown_df['sub_label'] = unknown_df['audio_path'].apply(lambda x: x.split('/')[0])
  .apply(lambda x: x.sample(n=min(samples_per_group, len(x)), random_state=42))


In [47]:
balanced_df = pd.concat([unknown_df_balanced_samples, other_df])

print("\nPo downsamplingu:\n", balanced_df['label'].value_counts(normalize=True) * 100)


Po downsamplingu:
 label
stop       9.096173
unknown    9.018964
on         8.994837
go         8.980360
yes        8.975534
no         8.941756
right      8.936930
up         8.893500
down       8.888674
left       8.874198
off        8.874198
silence    1.524876
Name: proportion, dtype: float64


In [48]:
balanced_df.to_csv("./data/train_balanced.csv", index=False)