In [1]:
import torch
import pandas as pd

In [2]:
# Select the compute backend in priority order: CUDA, then Apple MPS, then CPU.
# The chained conditional is evaluated lazily, so the MPS probe only runs when
# CUDA is not available.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
# Report the choice; the message is looked up by the selected device type.
print(
    {
        "cuda": "Using CUDA device for GPU acceleration.",
        "mps": "Using MPS device for GPU acceleration.",
        "cpu": "No GPU device found. Using CPU.",
    }[device.type]
)

Using CUDA device for GPU acceleration.


In [9]:
# Read the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [10]:
# Count rows per label, most frequent first, on the frame as loaded
# (before any downsampling), and print the table.
class_counts = df.loc[:, "label"].value_counts()
print("Przed downsamplingiem:\n", class_counts)

Przed downsamplingiem:
 label
unknown    32948
stop        1885
on          1864
go          1861
yes         1860
no          1853
right       1852
up          1843
down        1842
left        1839
off         1839
silence      316
Name: count, dtype: int64


In [11]:
# Percentage share of each label in the ORIGINAL frame `df`.
# No downsampling has been applied at this point, so the header must say
# "before" (it previously claimed "after", contradicting the data shown).
print("\nPrzed downsamplingiem (%):\n", df['label'].value_counts(normalize=True) * 100)


Po downsamplingu:
 label
unknown    63.603722
stop        3.638856
on          3.598317
go          3.592525
yes         3.590595
no          3.577082
right       3.575152
up          3.557778
down        3.555847
left        3.550056
off         3.550056
silence     0.610015
Name: proportion, dtype: float64


In [12]:
# Size of the largest class other than 'unknown'; the 'unknown' class is
# reduced to this many rows in the following step.
target_count = class_counts.drop(labels="unknown", errors="ignore").max()

In [13]:
# Split the frame into the 'unknown' rows and every other row.
unknown_df = df[df['label'] == 'unknown']
other_df = df[df['label'] != 'unknown']

# Randomly keep `target_count` 'unknown' rows (fixed seed for reproducibility).
# The size is capped at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement), which would happen if 'unknown' were not the majority class.
unknown_downsampled = unknown_df.sample(
    n=min(int(target_count), len(unknown_df)),
    random_state=42,
)

In [14]:
# Stack the reduced 'unknown' rows on top of all remaining rows.
# Row order and original index values are kept as they are (no shuffle).
balanced_df = pd.concat(objs=(unknown_downsampled, other_df), axis=0)

# Show the per-label share of the combined frame, in percent.
print("\nPo downsamplingu:\n", balanced_df['label'].value_counts(normalize=True).mul(100))


Po downsamplingu:
 label
unknown    9.089156
stop       9.089156
on         8.987897
go         8.973432
yes        8.968610
no         8.934857
right      8.930035
up         8.886639
down       8.881817
left       8.867351
off        8.867351
silence    1.523699
Name: proportion, dtype: float64


In [15]:
# Write the balanced frame to disk; the DataFrame index is not written.
balanced_df.to_csv(path_or_buf="./data/train_balanced.csv", index=False)