In [1]:
from main_new import VoiceIdentificationPipeline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Build the voice-identification pipeline object that every later cell reuses.
# sample_rate is presumably expressed in Hz (i.e. 16 kHz) — TODO confirm in main_new.
pipeline = VoiceIdentificationPipeline(sample_rate=16000)

In [3]:
# Archive containing the Common Voice corpus. The value returned by the
# extraction step is the base path handed to the loading and processing cells.
# NOTE(review): the variable name says "tar_gz" but the file is "pt.tar" —
# confirm which archive format the extractor actually expects.
common_voice_tar_gz = "pt.tar"
dataset_base_path = pipeline.extract_common_voice_dataset(common_voice_tar_gz)

Diretório de extração 'cv-corpus' já existe. Pulando a extração.


In [4]:
# Load the clip metadata from the 'validated.tsv' split, keeping only speakers
# with enough recordings and capping how many clips each speaker contributes.
df_metadata = pipeline.load_common_voice_dataset(
    dataset_base_path,
    split_file='validated.tsv',
    max_samples=None,  # limit on the initial load (presumably None = no limit; confirm)
    min_samples_per_speaker=80,  # keep only speakers with at least this many samples
    max_samples_per_speaker=100,  # subsample so no speaker exceeds this many samples
)

Metadados carregados com sucesso! Número total de amostras: 22131
Número de amostras válidas após limpeza inicial: 22131
Número de locutores únicos após limpeza inicial: 319

Aplicando filtragem: Mantendo locutores com pelo menos 80 amostras...
Aplicando subamostragem: Limitando a 100 amostras por locutor.
Número final de amostras após filtragem: 2146
Número final de locutores únicos: 22
Novas classes (locutores) e suas contagens:
client_id
9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015eb7b12d269cf363dec73496b009c86266b8b7dda93ff257139bab1ddf599719f46ce5cee865a6d80c1    100
261bca862597269f97d1fd29b83962289b6f0bf276dc81d17bc992af4c86c92597be42aab409ba98400aaf045a99240a44d2af249ed7f883a7902e9343505fa5    100
c4b22e1d51a2dd8996727f70ddece23ef448164a1a4a5507e4608173685448b5e4821c1b5010ef68018093b01651ee6480c5573e00d5dab17098d5dbcf0ead0d    100
7d20201a44583f681d3e6f97849ee7d16e50e61c9ac3443892ac935294b48dbc047c7f17ae03a941cdf8e1735e2f3dad517e88e8acb26642c8c03bba2ac2385e    100
328e68bbd38

In [5]:
# Extract the features and labels for the samples listed in df_metadata.
features, labels = pipeline.process_dataset(df_metadata, dataset_base_path)

# Keep both on the pipeline object as well
# (presumably read by the pipeline's own methods — verify in main_new).
pipeline.features, pipeline.labels = features, labels

Processadas 500/2146 amostras...
Processadas 1000/2146 amostras...
Processadas 1500/2146 amostras...
Processadas 2000/2146 amostras...

Processamento concluído: 2146 amostras válidas


In [6]:
# Distinct label values, then the integer-encoded version of every label.
# NOTE(review): unique_labels is not used in the cells visible here — confirm
# whether a later cell needs it.
unique_labels = np.unique(labels)
encoded_labels = pipeline.label_encoder.fit_transform(labels)

# Train/test split.
# Stratifying on the encoded labels is crucial: it keeps the proportion of each
# class (speaker) the same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    encoded_labels,
    stratify=encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fit the scaler on the training split only, then reuse the fitted scaler on the
# test split so no test-set statistics leak into training.
X_train_scaled = pipeline.scaler.fit_transform(X_train)
X_test_scaled = pipeline.scaler.transform(X_test)

# Random forest over the scaled features; class_weight='balanced' reweights
# classes inversely to their frequency in y_train.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train_scaled, y_train)

In [8]:
# Score the held-out split: predict on the scaled test features and compare
# the predictions with the true encoded labels.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
accuracy  # bare last expression so the notebook displays the value

0.958139534883721

In [9]:
# Feature importances reported by the fitted forest (one value per input feature).
# NOTE(review): the bare expression prints the entire array; consider
# displaying only the top-k entries to keep the output readable.
importances = model.feature_importances_
importances

array([0.01939531, 0.01674688, 0.02532601, 0.01412283, 0.01207611,
       0.01187328, 0.00497354, 0.01406836, 0.01530284, 0.01336762,
       0.01638233, 0.01081653, 0.02333796, 0.00486421, 0.00514108,
       0.03700477, 0.00945095, 0.01926626, 0.0152858 , 0.01272898,
       0.02906627, 0.01211069, 0.02085837, 0.01748873, 0.01861456,
       0.01959155, 0.02094841, 0.00898435, 0.00543902, 0.00356334,
       0.00527448, 0.00352329, 0.0056103 , 0.00514248, 0.00870852,
       0.00411157, 0.00553634, 0.00417326, 0.00989516, 0.00776701,
       0.01028733, 0.01600052, 0.0059583 , 0.00670129, 0.00454205,
       0.00337505, 0.00972717, 0.00346109, 0.00722891, 0.01047418,
       0.00435415, 0.00363649, 0.01213034, 0.00360963, 0.02623431,
       0.00453305, 0.00716646, 0.00583696, 0.00538897, 0.00738153,
       0.00870233, 0.01105549, 0.00655309, 0.00939784, 0.00829943,
       0.01018457, 0.00515609, 0.00245233, 0.00332996, 0.00331885,
       0.00227121, 0.00255425, 0.00178348, 0.00229653, 0.00183