In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from common_voice import VoiceIdentificationPipeline

In [2]:
def run_pipeline_to_find_accuracy(
    features_to_use,
    min_samples,
    max_samples,
    common_voice_tar_gz="pt.tar",
    test_size=0.2,
    random_state=42,
):
    """Train a Random Forest speaker classifier and return its hold-out accuracy.

    The function builds a ``VoiceIdentificationPipeline``, extracts and loads
    the Common Voice archive, extracts features, splits the data into
    stratified train/test sets, scales the features, fits a
    ``RandomForestClassifier`` and scores it on the test split.

    Args:
        features_to_use: Forwarded unchanged to ``VoiceIdentificationPipeline``.
            Callers in this notebook pass a 6-element list of 0/1 values;
            presumably each position toggles one feature group -- confirm
            against the pipeline implementation.
        min_samples: Forwarded as ``min_samples_per_speaker`` to the loader.
        max_samples: Forwarded as ``max_samples_per_speaker`` to the loader.
        common_voice_tar_gz: Path of the Common Voice archive handed to
            ``extract_common_voice_dataset``. Defaults to ``"pt.tar"``.
        test_size: Fraction of samples held out for evaluation.
        random_state: Seed used for both the train/test split and the forest.

    Returns:
        The value of ``accuracy_score`` computed on the held-out test split.

    Note:
        The split is stratified on the encoded speaker labels, which requires
        every speaker to have at least two samples.
    """
    pipeline = VoiceIdentificationPipeline(sample_rate=16000, features_to_use=features_to_use)
    dataset_base_path = pipeline.extract_common_voice_dataset(common_voice_tar_gz)
    df_metadata = pipeline.load_common_voice_dataset(
        dataset_base_path,
        # None is passed here; presumably this disables the global sample cap
        # so only the per-speaker limits below apply -- confirm in the pipeline.
        max_samples=None,
        split_file='validated.tsv',
        min_samples_per_speaker=min_samples,
        max_samples_per_speaker=max_samples,
    )

    features, labels = pipeline.process_dataset(df_metadata, dataset_base_path)
    # Keep the extracted data on the pipeline object, as the original code did.
    pipeline.features = features
    pipeline.labels = labels

    encoded_labels = pipeline.label_encoder.fit_transform(labels)

    # Train/test split.
    # Stratifying is crucial to keep the class (speaker) proportions the same
    # in the training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        encoded_labels,
        test_size=test_size,
        random_state=random_state,
        stratify=encoded_labels,
    )

    # Fit the scaler on the training split only and reuse it for the test
    # split, so no test-set statistics leak into training.
    X_train_scaled = pipeline.scaler.fit_transform(X_train)
    X_test_scaled = pipeline.scaler.transform(X_test)

    model = RandomForestClassifier(
        n_estimators=100,
        random_state=random_state,
        n_jobs=-1,
        class_weight='balanced',
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    return accuracy_score(y_test, y_pred)


In [3]:
# Single baseline run: only the feature flag at index 1 is enabled, with a
# per-speaker minimum of 10 and maximum of 30 samples.
run_pipeline_to_find_accuracy(
    features_to_use=[0, 1, 0, 0, 0, 0],
    min_samples=10,
    max_samples=30,
)

Diretório de extração 'cv-corpus' já existe. Pulando a extração.
Metadados carregados com sucesso! Número total de amostras: 22131
Número de amostras válidas após limpeza inicial: 22131
Número de locutores únicos após limpeza inicial: 319

Aplicando filtragem: Mantendo locutores com pelo menos 10 amostras...
Aplicando subamostragem: Limitando a 30 amostras por locutor.
Número final de amostras após filtragem: 2965
Número final de locutores únicos: 134
Novas classes (locutores) e suas contagens:
client_id
9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015eb7b12d269cf363dec73496b009c86266b8b7dda93ff257139bab1ddf599719f46ce5cee865a6d80c1    30
939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810f4ab853aa12872a811941be35b7e91af5df8350d75d3c61d1301aba694c41deb3f38f1414703bc0461    30
a9a5b0da1aadab7621f6ff144245201cafbe186e8871008f87abf78d1f6291c5fc94e7904d54390209efb537ab7dc4fc46c600e40afecd8b52c96ac3e69abf2a    30
30bb34575fe9b8717726a6140ff62bda3f40f45e233f561d56b5977801ed068462b89e1ab8644987c8158

0.006745362563237774

In [3]:
# Sweep configuration: which single feature flags to evaluate, how long the
# feature-flag vector is, and the exclusive upper bound of the min-samples sweep.
FEATURES_TO_TEST = [4]
NUM_FEATURES = 6
MAX_NUM_SAMPLES = 90

# Maps each min-samples threshold to the list of accuracies measured at that
# threshold, in the order the runs were executed.
accuracy_stats = {}
for i in FEATURES_TO_TEST:
    # One-hot flag vector enabling only feature i. An out-of-range index still
    # raises IndexError, as it did with the literal six-element list.
    features_to_use = [0] * NUM_FEATURES
    features_to_use[i] = 1
    for j in range(10, MAX_NUM_SAMPLES, 20):
        min_sample = j
        max_sample = j + 40  # per-speaker cap is 40 above the minimum
        accuracy = run_pipeline_to_find_accuracy(features_to_use, min_sample, max_sample)
        # setdefault creates the list on first use without a bare `except:`
        # that would also swallow unrelated errors.
        accuracy_stats.setdefault(j, []).append(accuracy)

Diretório de extração 'cv-corpus' já existe. Pulando a extração.
Metadados carregados com sucesso! Número total de amostras: 22131
Número de amostras válidas após limpeza inicial: 22131
Número de locutores únicos após limpeza inicial: 319

Aplicando filtragem: Mantendo locutores com pelo menos 10 amostras...
Aplicando subamostragem: Limitando a 50 amostras por locutor.
Número final de amostras após filtragem: 3949
Número final de locutores únicos: 134
Novas classes (locutores) e suas contagens:
client_id
9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015eb7b12d269cf363dec73496b009c86266b8b7dda93ff257139bab1ddf599719f46ce5cee865a6d80c1    50
939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810f4ab853aa12872a811941be35b7e91af5df8350d75d3c61d1301aba694c41deb3f38f1414703bc0461    50
a9a5b0da1aadab7621f6ff144245201cafbe186e8871008f87abf78d1f6291c5fc94e7904d54390209efb537ab7dc4fc46c600e40afecd8b52c96ac3e69abf2a    50
30bb34575fe9b8717726a6140ff62bda3f40f45e233f561d56b5977801ed068462b89e1ab8644987c8158

In [5]:
# Repeat the min-samples sweep with every feature flag enabled and append the
# results to the accuracy_stats dict built by the earlier sweep cell.
vec0 = [1, 1, 1, 1, 1, 1]
features_to_use = vec0
for j in range(10, MAX_NUM_SAMPLES, 20):
    min_sample = j
    max_sample = j + 40  # per-speaker cap is 40 above the minimum
    accuracy = run_pipeline_to_find_accuracy(features_to_use, min_sample, max_sample)
    # setdefault creates the list on first use without a bare `except:`
    # that would also swallow unrelated errors.
    accuracy_stats.setdefault(j, []).append(accuracy)

Diretório de extração 'cv-corpus' já existe. Pulando a extração.
Metadados carregados com sucesso! Número total de amostras: 22131
Número de amostras válidas após limpeza inicial: 22131
Número de locutores únicos após limpeza inicial: 319

Aplicando filtragem: Mantendo locutores com pelo menos 10 amostras...
Aplicando subamostragem: Limitando a 50 amostras por locutor.
Número final de amostras após filtragem: 3949
Número final de locutores únicos: 134
Novas classes (locutores) e suas contagens:
client_id
9d91497cb25399ad6cf6a4d88548fa84ee3c3687363015eb7b12d269cf363dec73496b009c86266b8b7dda93ff257139bab1ddf599719f46ce5cee865a6d80c1    50
939991dfcf12d8250739f3b4e5ed47e0dc4bf8ffb69810f4ab853aa12872a811941be35b7e91af5df8350d75d3c61d1301aba694c41deb3f38f1414703bc0461    50
a9a5b0da1aadab7621f6ff144245201cafbe186e8871008f87abf78d1f6291c5fc94e7904d54390209efb537ab7dc4fc46c600e40afecd8b52c96ac3e69abf2a    50
30bb34575fe9b8717726a6140ff62bda3f40f45e233f561d56b5977801ed068462b89e1ab8644987c8158

In [6]:
# Show the accuracies collected so far. Keys are the min-samples thresholds
# used in the sweeps; each list holds one accuracy per run, in execution order.
accuracy_stats

{10: [0.42658227848101266, 0.8367088607594937],
 30: [0.5824964131994261, 0.9010043041606887],
 50: [0.6752577319587629, 0.9037800687285223],
 70: [0.6761904761904762, 0.9352380952380952]}

In [9]:
# Tabulate the results: one column per min-samples threshold (the dict keys)
# and one row per run (the position within each accuracy list).
# Requires pandas to be imported as `pd` in the notebook's import cell.
accuracy_table = pd.DataFrame(accuracy_stats)
accuracy_table

Unnamed: 0,10,30,50,70
0,0.279747,0.416069,0.522337,0.60381
1,0.002532,0.011478,0.027491,0.032381
2,0.002532,0.011478,0.027491,0.032381
3,0.864557,0.915352,0.946735,0.942857
4,0.002532,0.011478,0.027491,0.032381
5,0.051899,0.077475,0.158076,0.190476
