In [1]:
import os
import pandas as pd
import librosa.feature
import torch.nn as nn
import torch.optim as optim
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.manifold import TSNE

In [2]:
CLIPS_PATH = os.path.join("clips_filtered")
CSV_PATH = os.path.join("all_validated.tsv")

df = pd.read_csv(CSV_PATH, sep="\t")

df = df[df["gender"].notna()]
df = df[df["gender"].isin(["male_masculine", "female_feminine"])]

df["file_path"] = df["path"].apply(lambda x: os.path.join(CLIPS_PATH, x))
df["label"] = df["gender"].map({"male_masculine": 0, "female_feminine": 1})

df = df[df["file_path"].apply(os.path.exists)]

df = df[["file_path", "label"]]

print(df.head())

                              file_path  label
0  clips17/common_voice_en_40117514.mp3      0
1  clips17/common_voice_en_39603786.mp3      0
2  clips17/common_voice_en_40048623.mp3      0
3  clips17/common_voice_en_39602035.mp3      0
4  clips17/common_voice_en_39593984.mp3      0


In [3]:
def extract_mfcc(file_path, n_mfcc=13, max_len=100):
    y, sr = librosa.load(file_path, sr=22050)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] > max_len:
        mfcc = mfcc[:, :max_len]
    else:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), mode="constant")
    return mfcc

In [4]:
features, labels = [], []

for _, row in df.iterrows():
    mfcc = extract_mfcc(row["file_path"])
    if mfcc is not None:
        features.append(mfcc)
        labels.append(row["label"])

features = np.array(features)
labels = np.array(labels)

np.save("features.npy", features)
np.save("labels.npy", labels)

print(df["label"].value_counts())
print(f"{len(features)} files handled")

features = np.load("features.npy")
labels = np.load("labels.npy")

label
0    225
1    224
Name: count, dtype: int64
449 files handled


In [5]:
class VoiceDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        mfcc = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return mfcc.T, label

In [6]:
input_size = 13
hidden_size = 128
batch_size = 32
lr = 0.001
epochs = 10

dataset = VoiceDataset(features, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [7]:
class GenderClassifier(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GenderClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        x = h_n[-1]
        x = self.dropout(x)
        return self.fc(x)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GenderClassifier(input_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    model.train()
    train_losses = []
    for batch_features, batch_labels in train_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_losses.append(loss.item())
    avg_train_loss = np.mean(train_losses)

    model.eval()
    val_losses = []
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            val_losses.append(loss.item())
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(batch_labels.cpu().numpy())
    avg_val_loss = np.mean(val_losses)

    accuracy = accuracy_score(all_targets, all_preds)
    precision = precision_score(all_targets, all_preds)
    recall = recall_score(all_targets, all_preds)

    print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, "
          f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

Epoch 1: Train Loss: 0.6961, Val Loss: 0.6652, Accuracy: 0.6333, Precision: 0.6857, Recall: 0.5217
Epoch 2: Train Loss: 0.6342, Val Loss: 0.6294, Accuracy: 0.7333, Precision: 0.7115, Recall: 0.8043
Epoch 3: Train Loss: 0.5913, Val Loss: 0.5842, Accuracy: 0.7667, Precision: 0.8205, Recall: 0.6957
Epoch 4: Train Loss: 0.5583, Val Loss: 0.5274, Accuracy: 0.7778, Precision: 0.8250, Recall: 0.7174
Epoch 5: Train Loss: 0.5145, Val Loss: 0.4918, Accuracy: 0.7667, Precision: 0.8205, Recall: 0.6957
Epoch 6: Train Loss: 0.4915, Val Loss: 0.4602, Accuracy: 0.7889, Precision: 0.8462, Recall: 0.7174
Epoch 7: Train Loss: 0.4341, Val Loss: 0.4397, Accuracy: 0.7889, Precision: 0.8649, Recall: 0.6957
Epoch 8: Train Loss: 0.3977, Val Loss: 0.4048, Accuracy: 0.8444, Precision: 0.8810, Recall: 0.8043
Epoch 9: Train Loss: 0.3797, Val Loss: 0.3879, Accuracy: 0.8111, Precision: 0.8718, Recall: 0.7391
Epoch 10: Train Loss: 0.3354, Val Loss: 0.3618, Accuracy: 0.8111, Precision: 0.8537, Recall: 0.7609


In [9]:
def extract_embeddings(model, dataset, batch_size=32):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_embeddings = []
    all_labels = []

    with torch.no_grad():
        for batch_features, batch_labels in dataloader:
            batch_features = batch_features.to(device)
            _, (h_n, _) = model.lstm(batch_features)
            embeddings = h_n[-1]
            embeddings = model.dropout(embeddings)
            all_embeddings.append(embeddings.cpu().numpy())
            all_labels.append(batch_labels.cpu().numpy())

    all_embeddings = np.concatenate(all_embeddings, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    return all_embeddings, all_labels

embeddings, labels = extract_embeddings(model, dataset, batch_size=32)

tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

plt.figure(figsize=(8,6))
scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='viridis', alpha=0.7)
plt.title("tsne: отображение эмбеддингов в 2d")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")

legend1 = plt.legend(*scatter.legend_elements(), title="Класс")
plt.gca().add_artist(legend1)
plt.show()

