In [None]:
import random
import numpy as np
import torch
import os

def set_seed(seed):
    """
    设置所有随机种子以确保结果可复现
    
    Args:
        seed (int): 随机种子数值
    """
    random.seed(seed)  # Python的随机种子
    np.random.seed(seed)  # Numpy的随机种子
    torch.manual_seed(seed)  # PyTorch的CPU随机种子
    torch.cuda.manual_seed(seed)  # PyTorch的GPU随机种子
    torch.cuda.manual_seed_all(seed)  # 如果使用多GPU，为所有GPU设置种子
    
    # 设置cudnn的随机种子
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    # 设置Python的hash种子
    os.environ['PYTHONHASHSEED'] = str(seed)
seed = 2003  # 可以设置任何整数
set_seed(seed)

In [1]:
import os
import shutil
from tqdm import tqdm

def organize_files(src_root_dir, dst_root_dir):
    cases = ['case1', 'case2', 'case3', 'case4']

    # Create directories for each case
    for case in cases:
        case_dir = os.path.join(dst_root_dir, case)
        if not os.path.exists(case_dir):
            os.makedirs(case_dir)

    # Traverse the source directory
    for main_dir in tqdm(os.listdir(src_root_dir), desc="Processing directories"):
        main_dir_path = os.path.join(src_root_dir, main_dir)
        if os.path.isdir(main_dir_path):
            for sub_dir in os.listdir(main_dir_path):
                sub_dir_path = os.path.join(main_dir_path, sub_dir)
                if os.path.isdir(sub_dir_path):
                    case_name = sub_dir.split('_')[0]
                    case_dir = os.path.join(dst_root_dir, case_name)

                    # 首先复制原始的case1_1.wav文件
                    if case_name == 'case1':
                        original_wav = os.path.join(sub_dir_path, f"{main_dir}_{sub_dir}.wav")
                        if os.path.exists(original_wav):
                            dst_original_wav = os.path.join(case_dir, f"{main_dir}_{sub_dir}.wav")
                            shutil.copy(original_wav, dst_original_wav)

                    # 然后复制分割后的音频和对应的npy文件
                    for filename in os.listdir(sub_dir_path):
                        if filename.startswith('sample_') and (filename.endswith('.wav') or filename.endswith('.npy')):
                            src_path = os.path.join(sub_dir_path, filename)
                            dst_path = os.path.join(case_dir, f"{main_dir}_{sub_dir}_{filename}")
                            shutil.copy(src_path, dst_path)
src_directory = 'dataset_1k2k3k_withbandpass_extrafeatures_v3'
dst_directory = 'data_1k2k3k_nobandpass_organized_dataset_extrafeatures'
    
print(f"源目录: {src_directory}")
print(f"目标目录: {dst_directory}")

if not os.path.exists(src_directory):
    print(f"源目录不存在: {src_directory}")
    
organize_files(src_directory, dst_directory)
print("文件整理完成。")

源目录: dataset_1k2k3k_withbandpass_extrafeatures_v3
目标目录: data_1k2k3k_nobandpass_organized_dataset_extrafeatures


Processing directories: 100%|██████████| 19/19 [00:41<00:00,  2.18s/it]

文件整理完成。





In [2]:
import os
import shutil

# 定义路径
root_path = 'data_1k2k3k_nobandpass_organized_dataset_extrafeatures'
new_structure_path = 'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset'

# 创建新的目录结构
if not os.path.exists(new_structure_path):
    os.makedirs(new_structure_path)

# 遍历case1到case4
for case in ['case1', 'case2', 'case3', 'case4']:
    case_path = os.path.join(root_path, case)
    for root, dirs, files in os.walk(case_path):
        for file in files:
            if file.endswith('.wav'):
                # 解析文件名获取前缀、case_id和sample_set
                parts = file.split('_')
                prefix = parts[0]  # 获取前缀部分，如A1

                # 跳过prefix为"A3"或"A7"的文件
                if prefix in ['A3', 'A7']:
                    continue

                case_id = f"{parts[1]}_{parts[2]}"
                sample_set = parts[3]
                sample_id = parts[4].split('.')[0]  # 获取sample集编号

                # 创建新的路径
                new_case_path = os.path.join(new_structure_path, case)
                if not os.path.exists(new_case_path):
                    os.makedirs(new_case_path)

                new_prefix_path = os.path.join(new_case_path, prefix)
                if not os.path.exists(new_prefix_path):
                    os.makedirs(new_prefix_path)

                new_case_id_path = os.path.join(new_prefix_path, case_id)
                if not os.path.exists(new_case_id_path):
                    os.makedirs(new_case_id_path)

                new_sample_set_path = os.path.join(new_case_id_path, f'sample_{sample_set}')
                if not os.path.exists(new_sample_set_path):
                    os.makedirs(new_sample_set_path)

                # 移动文件到新的路径
                old_file_path = os.path.join(root, file)
                new_file_path = os.path.join(new_sample_set_path, file)
                shutil.move(old_file_path, new_file_path)

                # npy_file = file.replace('.wav', '.npy')
                # old_npy_path = os.path.join(root, npy_file)
                # if os.path.exists(old_npy_path):  # 检查npy文件是否存在
                #     new_npy_path = os.path.join(new_sample_set_path, npy_file)
                #     shutil.move(old_npy_path, new_npy_path)

print("文件重新组织完成。")


文件重新组织完成。


In [3]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split
import os
import sys

In [4]:
import os
from pathlib import Path
import torchaudio
from tqdm import tqdm

data = []

for case in ['case1', 'case2', 'case3', 'case4']:
    case_path = Path(f'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset/{case}')
    for path in tqdm(case_path.glob("**/*.wav")):
        name = path.stem
        # 获取前缀、case_id和sample_set
        parts = path.parts[-4:]  # 获取最后4部分: case文件夹, 前缀文件夹, case_id文件夹, 和文件名
        prefix = parts[1]  # 前缀文件夹
        case_id = parts[2]  # case_id文件夹
        sample_set = parts[3].split('_')[1]  # 从文件名中提取sample_set

        try:
            # 加载文件
            s = torchaudio.load(path)
            # 加载对应的 .npy 文件
            # npy_path = path.with_suffix('.npy')
            # if npy_path.exists():
            #     energy_features = np.load(npy_path)
            # else:
            #     energy_features = None
            #     print(f"Warning: No .npy file found for {path}")
            data.append({
                "name": name,
                "path": str(path),
                "case": case,
                "prefix": prefix,
                "case_id": case_id,
                "sample_set": sample_set,
                # "energy_features": energy_features
            })
        except Exception as e:
            # 跳过损坏的文件
            pass

# 显示收集到的数据条目数
print(f"Collected {len(data)} items.")


684it [00:04, 137.61it/s]
684it [00:05, 133.17it/s]
988it [00:07, 144.43it/s]

In [5]:
import pandas as pd
df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,path,case,prefix,case_id,sample_set
0,A1_case1_1_sample_10_2,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case1,case1_1,sample_sample,case1
1,A1_case1_1_sample_11_2,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case1,case1_1,sample_sample,case1
2,A1_case1_1_sample_12_2,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case1,case1_1,sample_sample,case1
3,A1_case1_1_sample_13_3,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case1,case1_1,sample_sample,case1
4,A1_case1_1_sample_14_3,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case1,case1_1,sample_sample,case1


In [6]:
import os
import pandas as pd
import torch
import torchaudio
from scipy.io import wavfile
from scipy.signal import resample
from torchaudio.utils import download_asset
from IPython.display import Audio

# Define function to resample audio
def resample_audio(data, orig_sr, target_sr=16000):
    number_of_samples = round(len(data) * float(target_sr) / orig_sr)
    resampled_data = resample(data, number_of_samples)
    return resampled_data

# Load and process your dataframe `df`
file_path_column = "path"
df["status"] = df[file_path_column].apply(lambda path: True if os.path.exists(path) else None)
df = df.dropna(subset=["status"])
df = df.drop("status", axis=1)

# Shuffle and reset index of dataframe
df = df.sample(frac=1).reset_index(drop=True)
print(df.head())  # Check the first few rows after preprocessing

# Process each audio file in the dataframe
for index, row in df.iterrows():
    file_path = row[file_path_column]
    orig_sample_rate, data = wavfile.read(file_path)

    # Resample audio to target sample rate (16000 Hz)
    resampled_data = resample_audio(data, orig_sample_rate, target_sr=16000)

    # Save the resampled data back to a file
    resampled_path = file_path.replace(".wav", "_resampled.wav")
    torchaudio.save(resampled_path, torch.from_numpy(resampled_data).unsqueeze(0), sample_rate=16000)

    # Update the path in the dataframe
    df.at[index, file_path_column] = resampled_path

    # Print progress or any other processing steps
    print(f"Processed {file_path} and saved resampled audio to {resampled_path}")

# Optionally, save the updated dataframe with processed data paths
df.to_csv("processed_audio_data.csv", index=False)


                     name                                               path  \
0  E9_case2_4_sample_20_2  data_1k2k3k_nobandpass_organized_withoutA3A7_d...   
1  E11_case2_1_sample_3_1  data_1k2k3k_nobandpass_organized_withoutA3A7_d...   
2  E3_case4_5_sample_29_2  data_1k2k3k_nobandpass_organized_withoutA3A7_d...   
3  A10_case1_1_sample_3_1  data_1k2k3k_nobandpass_organized_withoutA3A7_d...   
4  E3_case1_1_sample_12_2  data_1k2k3k_nobandpass_organized_withoutA3A7_d...   

    case   prefix        case_id sample_set  
0  case2  case2_4  sample_sample      case2  
1  case2  case2_1  sample_sample      case2  
2  case4  case4_5  sample_sample      case4  
3  case1  case1_1  sample_sample      case1  
4  case1  case1_1  sample_sample      case1  
Processed data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\case2\E9\case2_4\sample_sample\E9_case2_4_sample_20_2.wav and saved resampled audio to data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\case2\E9\case2_4\sample_sample\E9_case2_

Let's explore how many labels (emotions) are in the dataset with what distribution.

In [7]:
# Filter broken and non-existent paths

print(f"Step 0: {len(df)}")

df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
df = df.dropna(subset=["status"])
df = df.drop("status", axis=1)
print(f"Step 1: {len(df)}")

df = df.sample(frac=1)
df = df.reset_index(drop=True)

# Print unique emotions and count
print("Labels: ", df["case"].unique())
print()
print(df.groupby("case").count()[["path"]])


Step 0: 4104
Step 1: 4104
Labels:  ['case3' 'case2' 'case1' 'case4']

       path
case       
case1   684
case2   684
case3  1368
case4  1368


Let's display some random sample of the dataset and run it a couple of times to get a feeling for the audio and the emotional label.

In [8]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
case_label = sample["case"]

print(f"ID Location: {idx}")
print(f"      Case: {case_label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()
speech = librosa.resample(np.asarray(speech), orig_sr = sr, target_sr = 16000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)


ID Location: 2322
      Case: case2



For training purposes, we need to split data into train test sets; in this specific example, we break with a `20%` rate for the test set.

In [9]:
import os
import pandas as pd
import torchaudio
import librosa
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from scipy.ndimage import maximum_filter1d, uniform_filter1d
from tqdm import tqdm
from pathlib import Path

# 添加一列来标识每个参与者
df['participant'] = df['name'].apply(lambda x: x.split('_')[0])

# 确保路径存在
df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
df = df.dropna(subset=["status"])
df = df.drop("status", axis=1)

# 获取所有参与者的唯一列表
participants = df['participant'].unique()

# 随机划分参与者
train_participants, eval_participants = train_test_split(participants, test_size=0.2, random_state=101)

# 根据参与者划分数据集
train_df = df[df['participant'].isin(train_participants)].reset_index(drop=True)
eval_df = df[df['participant'].isin(eval_participants)].reset_index(drop=True)

# 打印参与者信息以确认划分
print("Unique participants in training dataset:", train_df['participant'].unique())
print("Unique participants in evaluation dataset:", eval_df['participant'].unique())

# 保存为 CSV 文件
save_path = "data_1k2k3k_nobandpass_organized_withoutA3A7_dataset"
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
eval_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)

# 打印数据集的形状
print(train_df.shape)
print(eval_df.shape)
# 加载数据集
from datasets import load_dataset

data_files = {
    "train": f"{save_path}/train.csv",
    "validation": f"{save_path}/test.csv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# 指定输入和输出列
input_column = "path"
output_column = "case"

# 加载能量特征
# def load_energy_features(example):
#     try:
#         # 从 wav 文件路径获取对应的 npy 文件路径
#         wav_path = example['path']
#         npy_path = wav_path.replace('_resampled.wav', '.npy')
        
#         # 加载 npy 文件
#         features = np.load(npy_path)
#         example['energy_features'] = features
#         return example
#     except Exception as e:
#         print(f"Error loading energy features from {example}: {e}")
#         example['energy_features'] = None
#         return example

# train_dataset = train_dataset.map(load_energy_features)
# eval_dataset = eval_dataset.map(load_energy_features)

# 检查能量特征加载情况
# print("Train dataset with energy features:")
# print(train_dataset[:5])
# print("Validation dataset with energy features:")
# print(eval_dataset[:5])

# 打印数据集的前几行，检查数据完整性
print("Train dataset preview:")
print(train_dataset[:10])
print("Validation dataset preview:")
print(eval_dataset[:10])

# 打印每个 case 的样本数
print("Sample count per case in training dataset:")
print(train_dataset.to_pandas()[output_column].value_counts())
print("Sample count per case in validation dataset:")
print(eval_dataset.to_pandas()[output_column].value_counts())

# 识别和排序标签列表
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")


Unique participants in training dataset: ['E1' 'A6' 'E6' 'A10' 'E10' 'E7' 'A8' 'A1' 'E3' 'A2' 'E8' 'A9' 'E11' 'A5'
 'A4']
Unique participants in evaluation dataset: ['E4' 'E9' 'E2' 'E5']
(3240, 7)
(864, 7)


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 3240 examples [00:00, 223203.88 examples/s]
Generating validation split: 864 examples [00:00, 136858.59 examples/s]

Train dataset preview:
{'name': ['E1_case3_8_sample_48_2', 'A6_case4_4_sample_23_1', 'E6_case4_6_sample_34_3', 'A10_case1_2_sample_17_3', 'E10_case3_1_sample_6_1', 'E7_case3_6_sample_32_3', 'A8_case2_1_sample_2_1', 'E6_case4_8_sample_48_2', 'A1_case4_8_sample_47_2', 'E3_case2_1_sample_3_1'], 'path': ['data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\\case3\\E1\\case3_8\\sample_sample\\E1_case3_8_sample_48_2_resampled.wav', 'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\\case4\\A6\\case4_4\\sample_sample\\A6_case4_4_sample_23_1_resampled.wav', 'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\\case4\\E6\\case4_6\\sample_sample\\E6_case4_6_sample_34_3_resampled.wav', 'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\\case1\\A10\\case1_2\\sample_sample\\A10_case1_2_sample_17_3_resampled.wav', 'data_1k2k3k_nobandpass_organized_withoutA3A7_dataset\\case3\\E10\\case3_1\\sample_sample\\E10_case3_1_sample_6_1_resampled.wav', 'data_1k2k3k_nobandpass_organized_withoutA3A7_data




In [10]:
# 统计包含能量特征的样本数
# train_with_features = sum(1 for item in train_dataset if item['energy_features'] is not None)
# eval_with_features = sum(1 for item in eval_dataset if item['energy_features'] is not None)
# print(f"Train samples with energy features: {train_with_features} out of {len(train_dataset)}")
# print(f"Validation samples with energy features: {eval_with_features} out of {len(eval_dataset)}")

## Prepare Data for Training

In [11]:
# Loading the created dataset using datasets
from datasets import load_dataset

# 定义数据文件路径
data_files = {
    "train": "data_1k2k3k_nobandpass_organized_withoutA3A7_dataset/train.csv",
    "validation": "data_1k2k3k_nobandpass_organized_withoutA3A7_dataset/test.csv",
}

# 加载数据集
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# 打印数据集信息
print(train_dataset)
print(eval_dataset)


Dataset({
    features: ['name', 'path', 'case', 'prefix', 'case_id', 'sample_set', 'participant'],
    num_rows: 3240
})
Dataset({
    features: ['name', 'path', 'case', 'prefix', 'case_id', 'sample_set', 'participant'],
    num_rows: 864
})


In [12]:
input_column = "path"
output_column = "case"

In [13]:
# we need to distinguish the unique labels in our SER dataset
label_list = train_dataset.unique(output_column)
label_list.sort()  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 4 classes: ['case1', 'case2', 'case3', 'case4']


In order to preprocess the audio into our classification model, we need to set up the relevant Wav2Vec2 assets regarding our language in this case `lighteternal/wav2vec2-large-xlsr-53-greek` fine-tuned by [Dimitris Papadopoulos](https://huggingface.co/lighteternal/wav2vec2-large-xlsr-53-greek). To handle the context representations in any audio length we use a merge strategy plan (pooling mode) to concatenate that 3D representations into 2D representations.

There are three merge strategies `mean`, `sum`, and `max`. In this example, we achieved better results on the mean approach. In the following, we need to initiate the config and the feature extractor from the Dimitris model.

In [14]:
from transformers import AutoConfig, Wav2Vec2Processor

In [15]:
model_name_or_path = "c3f9d884181a224a6ac87bf8885c84d1cff3384f"
pooling_mode = "mean"

In [16]:
# config
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', pooling_mode)

setattr(config, 'use_energy_features', False)
# setattr(config, 'energy_feature_dim', 1)

In [17]:
# processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
from transformers import Wav2Vec2FeatureExtractor
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# target_sampling_rate = processor.feature_extractor.sampling_rate
target_sampling_rate = 16000
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


# Preprocess Data

So far, we downloaded, loaded, and split the SER dataset into train and test sets. The instantiated our strategy configuration for using context representations in our classification problem SER. Now, we need to extract features from the audio path in context representation tensors and feed them into our classification model to determine the emotion in the speech.

Since the audio file is saved in the `.wav` format, it is easy to use **[Librosa](https://librosa.org/doc/latest/index.html)** or others, but we suppose that the format may be in the `.mp3` format in case of generality. We found that the **[Torchaudio](https://pytorch.org/audio/stable/index.html)** library works best for reading in `.mp3` data.

An audio file usually stores both its values and the sampling rate with which the speech signal was digitalized. We want to store both in the dataset and write a **map(...)** function accordingly. Also, we need to handle the string labels into integers for our specific classification task in this case, the **single-label classification** you may want to use for your **regression** or even **multi-label classification**.

In [18]:
def speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):

    if len(label_list) > 0:
        return label_list.index(label) if label in label_list else -1

    return label

# def preprocess_function(examples):
#     speech_list = [speech_file_to_array_fn(path) for path in examples[input_column]]
#     target_list = [label_to_id(label, label_list) for label in examples[output_column]]

#     result = processor(speech_list, sampling_rate=target_sampling_rate)
#     result["labels"] = list(target_list)

#     return result

In [19]:
df.head(2)

Unnamed: 0,name,path,case,prefix,case_id,sample_set,participant
0,E1_case3_8_sample_48_2,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case3,case3_8,sample_sample,case3,E1
1,E4_case2_6_sample_36_3,data_1k2k3k_nobandpass_organized_withoutA3A7_d...,case2,case2_6,sample_sample,case2,E4


In [20]:
from transformers import Wav2Vec2FeatureExtractor

processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# 定义输入和输出列
input_column = "path"
output_column = "case"

# 定义预处理函数
def preprocess_function(examples):
    # 处理音频数据
    audio = [torchaudio.load(path)[0].numpy().squeeze() for path in examples["path"]]
    result = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    result["labels"] = [label_list.index(label) for label in examples["emotion"]]
    
    # 加载能量特征
    # energy_features = []
    # for wav_path in examples["path"]:
    #     try:
    #         # 从wav文件路径构造npy文件路径
    #         npy_path = wav_path.replace('_resampled.wav', '.npy')
    #         if os.path.exists(npy_path):
    #             features = np.load(npy_path)
    #             energy_features.append(features)
    #         else:
    #             print(f"Warning: No .npy file found at {npy_path}")
    #             energy_features.append(None)
    #     except Exception as e:
    #         print(f"Error loading energy features from {wav_path}: {e}")
    #         energy_features.append(None)
            
    # result["energy_features"] = energy_features
    
    return result

# 加载数据集
data_files = {
    "train": "data_1k2k3k_nobandpass_organized_withoutA3A7_dataset/train.csv",
    "validation": "data_1k2k3k_nobandpass_organized_withoutA3A7_dataset/test.csv",
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# 重命名 case 字段为 emotion
train_dataset = train_dataset.rename_column("case", "emotion")
eval_dataset = eval_dataset.rename_column("case", "emotion")

# 获取独特的标签
label_list = train_dataset.unique("emotion")
label_list.sort()  # 排序以确保确定性
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

# 预处理和映射数据集
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=1,
    remove_columns=["case_id", "sample_set", "prefix"]
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=1,
    remove_columns=["case_id", "sample_set", "prefix"]
)

print("Train dataset:")
print(train_dataset)
print("\nEval dataset:")
print(eval_dataset)

A classification problem with 4 classes: ['case1', 'case2', 'case3', 'case4']


Map: 100%|██████████| 3240/3240 [00:20<00:00, 160.52 examples/s]
Map: 100%|██████████| 864/864 [00:04<00:00, 177.22 examples/s]

Train dataset:
Dataset({
    features: ['name', 'path', 'emotion', 'participant', 'input_values', 'attention_mask', 'labels'],
    num_rows: 3240
})

Eval dataset:
Dataset({
    features: ['name', 'path', 'emotion', 'participant', 'input_values', 'attention_mask', 'labels'],
    num_rows: 864
})





In [21]:
idx = 0
print(f"Training input_values: {train_dataset[idx]['input_values']}")
print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['emotion']}")

Training input_values: [-0.23719975352287292, 0.03784312680363655, 0.06766840070486069, 0.2158680260181427, 0.00942351296544075, -0.3207499086856842, -0.08602462708950043, 0.23380739986896515, 0.12406977266073227, 0.01587657257914543, 0.11226576566696167, 0.21059145033359528, 0.019981849938631058, -0.048538610339164734, 0.15583264827728271, 0.18839165568351746, 0.0017722544725984335, -0.033230382949113846, 0.10432080924510956, 0.012225075624883175, -0.18777988851070404, 0.12037404626607895, 0.3729451894760132, 0.07761891931295395, -0.06896517425775528, 0.03170633688569069, 0.16916310787200928, 0.13602487742900848, 0.028270822018384933, 0.05562348663806915, 0.057938385754823685, 0.12209232896566391, 0.3806702494621277, 0.408314973115921, 0.10065194964408875, -0.15076538920402527, -0.2152487188577652, -0.1509162336587906, 0.09993205964565277, 0.2940880358219147, 0.4063582122325897, 0.41306594014167786, 0.11149216443300247, -0.1781863421201706, -0.24741211533546448, -0.10721710324287415, 

Great, now we've successfully read all the audio files, resampled the audio files to 16kHz, and mapped each audio to the corresponding label.

## Model

Before diving into the training part, we need to build our classification model based on the merge strategy.

In [22]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [23]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


## Training

The data is processed so that we are ready to start setting up the training pipeline. We will make use of 🤗's [Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer) for which we essentially need to do the following:

- Define a data collator. In contrast to most NLP models, XLSR-Wav2Vec2 has a much larger input length than output length. *E.g.*, a sample of input length 50000 has an output length of no more than 100. Given the large input sizes, it is much more efficient to pad the training batches dynamically meaning that all training samples should only be padded to the longest sample in their batch and not the overall longest sample. Therefore, fine-tuning XLSR-Wav2Vec2 requires a special padding data collator, which we will define below

- Evaluation metric. During training, the model should be evaluated on the word error rate. We should define a `compute_metrics` function accordingly

- Load a pretrained checkpoint. We need to load a pretrained checkpoint and configure it correctly for training.

- Define the training configuration.

After having fine-tuned the model, we will correctly evaluate it on the test data and verify that it has indeed learned to correctly transcribe speech.

### Set-up Trainer

Let's start by defining the data collator. The code for the data collator was copied from [this example](https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81).

Without going into too many details, in contrast to the common data collators, this data collator treats the `input_values` and `labels` differently and thus applies to separate padding functions on them (again making use of XLSR-Wav2Vec2's context manager). This is necessary because in speech input and output are of different modalities meaning that they should not be treated by the same padding function.
Analogous to the common data collators, the padding tokens in the labels with `-100` so that those tokens are **not** taken into account when computing the loss.

In [24]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [25]:
from transformers import Wav2Vec2FeatureExtractor

processor = Wav2Vec2FeatureExtractor.from_pretrained('c3f9d884181a224a6ac87bf8885c84d1cff3384f')
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
config.energy_feature_dim = 1

Next, the evaluation metric is defined. There are many pre-defined metrics for classification/regression problems, but in this case, we would continue with just **Accuracy** for classification and **MSE** for regression. You can define other metrics on your own.

In [26]:
is_regression = False

In [27]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

    if is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

Now, we can load the pretrained XLSR-Wav2Vec2 checkpoint into our classification model with a pooling strategy.

In [28]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at c3f9d884181a224a6ac87bf8885c84d1cff3384f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The first component of XLSR-Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pretraining and as stated in the [paper](https://arxiv.org/pdf/2006.13979.pdf) does not need to be fine-tuned anymore.
Thus, we can set the `requires_grad` to `False` for all parameters of the *feature extraction* part.

In [29]:
model.freeze_feature_extractor()

In a final step, we define all parameters related to training.
To give more explanation on some of the parameters:
- `learning_rate` and `weight_decay` were heuristically tuned until fine-tuning has become stable. Note that those parameters strongly depend on the Common Voice dataset and might be suboptimal for other speech datasets.

For more explanations on other parameters, one can take a look at the [docs](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer#trainingarguments).

**Note**: If one wants to save the trained models in his/her google drive the commented-out `output_dir` can be used instead.

In [30]:
# from google.colab import drive

# drive.mount('/gdrive')

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="train_result/123k_extrafeature",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=1,
    fp16=True,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,
    seed=2003,
)



In [32]:
# !pip uninstall transformers[torch] -y
# !pip install transformers[torch]

In [33]:
# !pip uninstall transformers -y
# !pip install transformers


For future use we can create our training script, we do it in a simple way. You can add more on you own.

In [34]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_cuda_amq:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_cuda_amq:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


Now, all instances can be passed to Trainer and we are ready to start training!

In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # tokenizer=processor.feature_extractor,
)


### Training

Training will take between 10 and 60 minutes depending on the GPU allocated to this notebook.

In case you want to use this google colab to fine-tune your model, you should make sure that your training doesn't stop due to inactivity. A simple hack to prevent this is to paste the following code into the console of this tab (right mouse click -> inspect -> Console tab and insert code).

\\```javascript
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton,60000);
```

In [36]:
trainer.train()

  return F.conv1d(input, weight, bias, self.stride,
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  2%|▏         | 10/405 [00:03<01:44,  3.77it/s]

{'loss': 1.3353, 'grad_norm': nan, 'learning_rate': 9.777777777777778e-05, 'epoch': 0.02}


                                                
  2%|▏         | 10/405 [00:24<01:44,  3.77it/s] 

{'eval_loss': 1.3681877851486206, 'eval_accuracy': 0.3333333432674408, 'eval_runtime': 20.7911, 'eval_samples_per_second': 41.556, 'eval_steps_per_second': 10.389, 'epoch': 0.02}


  5%|▍         | 20/405 [00:29<03:29,  1.84it/s]

{'loss': 1.3177, 'grad_norm': nan, 'learning_rate': 9.580246913580247e-05, 'epoch': 0.05}


                                                
  5%|▍         | 20/405 [00:49<03:29,  1.84it/s] 

{'eval_loss': 1.359315037727356, 'eval_accuracy': 0.3333333432674408, 'eval_runtime': 20.6322, 'eval_samples_per_second': 41.876, 'eval_steps_per_second': 10.469, 'epoch': 0.05}


  7%|▋         | 30/405 [00:55<03:33,  1.76it/s]

{'loss': 1.2639, 'grad_norm': 24.504886627197266, 'learning_rate': 9.333333333333334e-05, 'epoch': 0.07}


                                                
  7%|▋         | 30/405 [01:17<03:33,  1.76it/s] 

{'eval_loss': 1.3495477437973022, 'eval_accuracy': 0.33564814925193787, 'eval_runtime': 21.5879, 'eval_samples_per_second': 40.022, 'eval_steps_per_second': 10.006, 'epoch': 0.07}


 10%|▉         | 40/405 [01:22<03:32,  1.72it/s]

{'loss': 1.2076, 'grad_norm': 15.640555381774902, 'learning_rate': 9.08641975308642e-05, 'epoch': 0.1}


                                                
 10%|▉         | 40/405 [01:45<03:32,  1.72it/s] 

{'eval_loss': 1.2256073951721191, 'eval_accuracy': 0.37268519401550293, 'eval_runtime': 22.087, 'eval_samples_per_second': 39.118, 'eval_steps_per_second': 9.779, 'epoch': 0.1}


 12%|█▏        | 50/405 [01:50<03:32,  1.67it/s]

{'loss': 1.0912, 'grad_norm': 2.3723907470703125, 'learning_rate': 8.839506172839507e-05, 'epoch': 0.12}


                                                
 12%|█▏        | 50/405 [02:12<03:32,  1.67it/s] 

{'eval_loss': 1.1620126962661743, 'eval_accuracy': 0.5115740895271301, 'eval_runtime': 21.7843, 'eval_samples_per_second': 39.662, 'eval_steps_per_second': 9.915, 'epoch': 0.12}


 15%|█▍        | 60/405 [02:18<03:25,  1.68it/s]

{'loss': 0.9811, 'grad_norm': 13.801350593566895, 'learning_rate': 8.592592592592593e-05, 'epoch': 0.15}


                                                
 15%|█▍        | 60/405 [02:40<03:25,  1.68it/s] 

{'eval_loss': 1.813031554222107, 'eval_accuracy': 0.1666666716337204, 'eval_runtime': 22.2602, 'eval_samples_per_second': 38.814, 'eval_steps_per_second': 9.703, 'epoch': 0.15}


 17%|█▋        | 70/405 [02:45<03:21,  1.66it/s]

{'loss': 0.8295, 'grad_norm': 3.5838510990142822, 'learning_rate': 8.34567901234568e-05, 'epoch': 0.17}


                                                
 17%|█▋        | 70/405 [03:07<03:21,  1.66it/s] 

{'eval_loss': 1.5755404233932495, 'eval_accuracy': 0.1944444477558136, 'eval_runtime': 21.6486, 'eval_samples_per_second': 39.91, 'eval_steps_per_second': 9.978, 'epoch': 0.17}


 20%|█▉        | 80/405 [03:13<03:14,  1.67it/s]

{'loss': 0.8151, 'grad_norm': 15.705230712890625, 'learning_rate': 8.098765432098767e-05, 'epoch': 0.2}


                                                
 20%|█▉        | 80/405 [03:35<03:14,  1.67it/s] 

{'eval_loss': 1.5326124429702759, 'eval_accuracy': 0.23379629850387573, 'eval_runtime': 21.9308, 'eval_samples_per_second': 39.397, 'eval_steps_per_second': 9.849, 'epoch': 0.2}


 22%|██▏       | 90/405 [03:41<03:11,  1.65it/s]

{'loss': 0.7692, 'grad_norm': 11.308257102966309, 'learning_rate': 7.851851851851852e-05, 'epoch': 0.22}


                                                
 22%|██▏       | 90/405 [04:04<03:11,  1.65it/s] 

{'eval_loss': 3.4646763801574707, 'eval_accuracy': 0.1666666716337204, 'eval_runtime': 22.6831, 'eval_samples_per_second': 38.09, 'eval_steps_per_second': 9.523, 'epoch': 0.22}


 25%|██▍       | 100/405 [04:12<03:17,  1.54it/s]

{'loss': 0.7281, 'grad_norm': 3.057086706161499, 'learning_rate': 7.60493827160494e-05, 'epoch': 0.25}


                                                 
 25%|██▍       | 100/405 [04:33<03:17,  1.54it/s]

{'eval_loss': 0.9463376402854919, 'eval_accuracy': 0.5081018805503845, 'eval_runtime': 21.6631, 'eval_samples_per_second': 39.884, 'eval_steps_per_second': 9.971, 'epoch': 0.25}


 27%|██▋       | 110/405 [04:39<02:58,  1.65it/s]

{'loss': 0.9853, 'grad_norm': 3.313979148864746, 'learning_rate': 7.382716049382717e-05, 'epoch': 0.27}


                                                 
 27%|██▋       | 110/405 [05:01<02:58,  1.65it/s]

{'eval_loss': 1.860094428062439, 'eval_accuracy': 0.26620370149612427, 'eval_runtime': 21.7613, 'eval_samples_per_second': 39.704, 'eval_steps_per_second': 9.926, 'epoch': 0.27}


 30%|██▉       | 120/405 [05:07<02:45,  1.73it/s]

{'loss': 0.8116, 'grad_norm': 6.600299835205078, 'learning_rate': 7.135802469135802e-05, 'epoch': 0.3}


                                                 
 30%|██▉       | 120/405 [05:29<02:45,  1.73it/s]

{'eval_loss': 0.7808619141578674, 'eval_accuracy': 0.5578703880310059, 'eval_runtime': 21.9927, 'eval_samples_per_second': 39.286, 'eval_steps_per_second': 9.821, 'epoch': 0.3}


 32%|███▏      | 130/405 [05:34<02:45,  1.66it/s]

{'loss': 0.731, 'grad_norm': 5.13027811050415, 'learning_rate': 6.88888888888889e-05, 'epoch': 0.32}


                                                 
 32%|███▏      | 130/405 [05:56<02:45,  1.66it/s]

{'eval_loss': 0.8142464756965637, 'eval_accuracy': 0.5902777910232544, 'eval_runtime': 21.9842, 'eval_samples_per_second': 39.301, 'eval_steps_per_second': 9.825, 'epoch': 0.32}


 35%|███▍      | 140/405 [06:02<02:35,  1.71it/s]

{'loss': 0.7178, 'grad_norm': 14.9618501663208, 'learning_rate': 6.641975308641975e-05, 'epoch': 0.35}


                                                 
 35%|███▍      | 140/405 [06:24<02:35,  1.71it/s]

{'eval_loss': 0.7901562452316284, 'eval_accuracy': 0.5439814925193787, 'eval_runtime': 21.9394, 'eval_samples_per_second': 39.381, 'eval_steps_per_second': 9.845, 'epoch': 0.35}


 37%|███▋      | 150/405 [06:30<02:30,  1.69it/s]

{'loss': 0.7755, 'grad_norm': 6.610732555389404, 'learning_rate': 6.395061728395062e-05, 'epoch': 0.37}


                                                 
 37%|███▋      | 150/405 [06:52<02:30,  1.69it/s]

{'eval_loss': 1.246552586555481, 'eval_accuracy': 0.3680555522441864, 'eval_runtime': 21.9633, 'eval_samples_per_second': 39.338, 'eval_steps_per_second': 9.835, 'epoch': 0.37}


 40%|███▉      | 160/405 [06:57<02:27,  1.66it/s]

{'loss': 0.6773, 'grad_norm': 3.2345290184020996, 'learning_rate': 6.148148148148148e-05, 'epoch': 0.4}


                                                 
 40%|███▉      | 160/405 [07:19<02:27,  1.66it/s]

{'eval_loss': 0.9070694446563721, 'eval_accuracy': 0.49421295523643494, 'eval_runtime': 21.8772, 'eval_samples_per_second': 39.493, 'eval_steps_per_second': 9.873, 'epoch': 0.4}


 42%|████▏     | 170/405 [07:25<02:20,  1.67it/s]

{'loss': 0.7553, 'grad_norm': 3.3092124462127686, 'learning_rate': 5.901234567901235e-05, 'epoch': 0.42}


                                                 
 42%|████▏     | 170/405 [07:47<02:20,  1.67it/s]

{'eval_loss': 0.7762740850448608, 'eval_accuracy': 0.5509259104728699, 'eval_runtime': 21.991, 'eval_samples_per_second': 39.289, 'eval_steps_per_second': 9.822, 'epoch': 0.42}


 44%|████▍     | 180/405 [07:52<02:14,  1.67it/s]

{'loss': 0.6292, 'grad_norm': 12.196998596191406, 'learning_rate': 5.654320987654321e-05, 'epoch': 0.44}


                                                 
 44%|████▍     | 180/405 [08:14<02:14,  1.67it/s]

{'eval_loss': 0.7673254609107971, 'eval_accuracy': 0.5740740895271301, 'eval_runtime': 21.8151, 'eval_samples_per_second': 39.606, 'eval_steps_per_second': 9.901, 'epoch': 0.44}


 47%|████▋     | 190/405 [08:20<02:08,  1.68it/s]

{'loss': 0.6994, 'grad_norm': 9.184884071350098, 'learning_rate': 5.4074074074074075e-05, 'epoch': 0.47}


                                                 
 47%|████▋     | 190/405 [08:42<02:08,  1.68it/s]

{'eval_loss': 1.1072816848754883, 'eval_accuracy': 0.4513888955116272, 'eval_runtime': 22.5017, 'eval_samples_per_second': 38.397, 'eval_steps_per_second': 9.599, 'epoch': 0.47}


 49%|████▉     | 200/405 [08:49<02:07,  1.61it/s]

{'loss': 0.6013, 'grad_norm': 5.444161891937256, 'learning_rate': 5.160493827160494e-05, 'epoch': 0.49}


                                                 
 49%|████▉     | 200/405 [09:12<02:07,  1.61it/s]

{'eval_loss': 0.9669063687324524, 'eval_accuracy': 0.47800925374031067, 'eval_runtime': 22.873, 'eval_samples_per_second': 37.774, 'eval_steps_per_second': 9.443, 'epoch': 0.49}


 52%|█████▏    | 210/405 [09:20<02:06,  1.54it/s]

{'loss': 0.6012, 'grad_norm': 5.787593364715576, 'learning_rate': 4.913580246913581e-05, 'epoch': 0.52}


                                                 
 52%|█████▏    | 210/405 [09:43<02:06,  1.54it/s]

{'eval_loss': 0.65180504322052, 'eval_accuracy': 0.6180555820465088, 'eval_runtime': 22.7542, 'eval_samples_per_second': 37.971, 'eval_steps_per_second': 9.493, 'epoch': 0.52}


 54%|█████▍    | 220/405 [09:49<01:53,  1.63it/s]

{'loss': 0.573, 'grad_norm': 3.226868152618408, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.54}


                                                 
 54%|█████▍    | 220/405 [10:11<01:53,  1.63it/s]

{'eval_loss': 0.6109054088592529, 'eval_accuracy': 0.6273148059844971, 'eval_runtime': 22.6799, 'eval_samples_per_second': 38.095, 'eval_steps_per_second': 9.524, 'epoch': 0.54}


 57%|█████▋    | 230/405 [10:17<01:46,  1.64it/s]

{'loss': 0.5885, 'grad_norm': 0.6035750508308411, 'learning_rate': 4.4197530864197535e-05, 'epoch': 0.57}


                                                 
 57%|█████▋    | 230/405 [10:40<01:46,  1.64it/s]

{'eval_loss': 0.5558375120162964, 'eval_accuracy': 0.6423611044883728, 'eval_runtime': 22.4074, 'eval_samples_per_second': 38.559, 'eval_steps_per_second': 9.64, 'epoch': 0.57}


 59%|█████▉    | 240/405 [10:45<01:40,  1.65it/s]

{'loss': 0.5559, 'grad_norm': 1.5357674360275269, 'learning_rate': 4.17283950617284e-05, 'epoch': 0.59}


                                                 
 59%|█████▉    | 240/405 [11:07<01:40,  1.65it/s]

{'eval_loss': 0.5591073632240295, 'eval_accuracy': 0.6423611044883728, 'eval_runtime': 21.9381, 'eval_samples_per_second': 39.383, 'eval_steps_per_second': 9.846, 'epoch': 0.59}


 62%|██████▏   | 250/405 [11:13<01:32,  1.67it/s]

{'loss': 0.5924, 'grad_norm': 1.505614161491394, 'learning_rate': 3.925925925925926e-05, 'epoch': 0.62}


                                                 
 62%|██████▏   | 250/405 [11:35<01:32,  1.67it/s]

{'eval_loss': 0.5912346243858337, 'eval_accuracy': 0.6377314925193787, 'eval_runtime': 22.0257, 'eval_samples_per_second': 39.227, 'eval_steps_per_second': 9.807, 'epoch': 0.62}


 64%|██████▍   | 260/405 [11:41<01:27,  1.67it/s]

{'loss': 0.5392, 'grad_norm': 2.1077916622161865, 'learning_rate': 3.6790123456790125e-05, 'epoch': 0.64}


                                                 
 64%|██████▍   | 260/405 [12:03<01:27,  1.67it/s]

{'eval_loss': 0.9449015259742737, 'eval_accuracy': 0.5520833134651184, 'eval_runtime': 22.3555, 'eval_samples_per_second': 38.648, 'eval_steps_per_second': 9.662, 'epoch': 0.64}


 67%|██████▋   | 270/405 [12:09<01:20,  1.67it/s]

{'loss': 0.5689, 'grad_norm': 7.632463455200195, 'learning_rate': 3.432098765432099e-05, 'epoch': 0.67}


                                                 
 67%|██████▋   | 270/405 [12:32<01:20,  1.67it/s]

{'eval_loss': 1.59260892868042, 'eval_accuracy': 0.39351850748062134, 'eval_runtime': 22.2838, 'eval_samples_per_second': 38.773, 'eval_steps_per_second': 9.693, 'epoch': 0.67}


 69%|██████▉   | 280/405 [12:38<01:15,  1.65it/s]

{'loss': 0.5206, 'grad_norm': 3.3575356006622314, 'learning_rate': 3.185185185185185e-05, 'epoch': 0.69}


                                                 
 69%|██████▉   | 280/405 [13:00<01:15,  1.65it/s]

{'eval_loss': 0.5553040504455566, 'eval_accuracy': 0.6678240895271301, 'eval_runtime': 22.0193, 'eval_samples_per_second': 39.238, 'eval_steps_per_second': 9.81, 'epoch': 0.69}


 72%|███████▏  | 290/405 [13:06<01:07,  1.70it/s]

{'loss': 0.7483, 'grad_norm': 1.0880954265594482, 'learning_rate': 2.9382716049382718e-05, 'epoch': 0.72}


                                                 
 72%|███████▏  | 290/405 [13:28<01:07,  1.70it/s]

{'eval_loss': 0.5043160319328308, 'eval_accuracy': 0.6747685074806213, 'eval_runtime': 22.3243, 'eval_samples_per_second': 38.702, 'eval_steps_per_second': 9.676, 'epoch': 0.72}


 74%|███████▍  | 300/405 [13:34<01:04,  1.64it/s]

{'loss': 0.6512, 'grad_norm': 3.7985358238220215, 'learning_rate': 2.6913580246913585e-05, 'epoch': 0.74}


                                                 
 74%|███████▍  | 300/405 [13:56<01:04,  1.64it/s]

{'eval_loss': 0.5870890021324158, 'eval_accuracy': 0.6354166865348816, 'eval_runtime': 21.7191, 'eval_samples_per_second': 39.781, 'eval_steps_per_second': 9.945, 'epoch': 0.74}


 77%|███████▋  | 310/405 [14:01<00:56,  1.69it/s]

{'loss': 0.498, 'grad_norm': 2.153569459915161, 'learning_rate': 2.4444444444444445e-05, 'epoch': 0.77}


                                                 
 77%|███████▋  | 310/405 [14:24<00:56,  1.69it/s]

{'eval_loss': 0.5325269103050232, 'eval_accuracy': 0.6539351940155029, 'eval_runtime': 22.353, 'eval_samples_per_second': 38.653, 'eval_steps_per_second': 9.663, 'epoch': 0.77}


 79%|███████▉  | 320/405 [14:30<00:52,  1.63it/s]

{'loss': 0.5568, 'grad_norm': 1.2446420192718506, 'learning_rate': 2.1975308641975308e-05, 'epoch': 0.79}


                                                 
 79%|███████▉  | 320/405 [14:52<00:52,  1.63it/s]

{'eval_loss': 0.5183055996894836, 'eval_accuracy': 0.6597222089767456, 'eval_runtime': 22.3304, 'eval_samples_per_second': 38.692, 'eval_steps_per_second': 9.673, 'epoch': 0.79}


 81%|████████▏ | 330/405 [14:58<00:45,  1.65it/s]

{'loss': 0.6093, 'grad_norm': 0.8723267912864685, 'learning_rate': 1.950617283950617e-05, 'epoch': 0.81}


                                                 
 81%|████████▏ | 330/405 [15:20<00:45,  1.65it/s]

{'eval_loss': 0.5275468230247498, 'eval_accuracy': 0.6527777910232544, 'eval_runtime': 22.0115, 'eval_samples_per_second': 39.252, 'eval_steps_per_second': 9.813, 'epoch': 0.81}


 84%|████████▍ | 340/405 [15:26<00:38,  1.69it/s]

{'loss': 0.5294, 'grad_norm': 1.0964710712432861, 'learning_rate': 1.7037037037037038e-05, 'epoch': 0.84}


                                                 
 84%|████████▍ | 340/405 [15:48<00:38,  1.69it/s]

{'eval_loss': 0.5313467383384705, 'eval_accuracy': 0.6493055820465088, 'eval_runtime': 22.4451, 'eval_samples_per_second': 38.494, 'eval_steps_per_second': 9.623, 'epoch': 0.84}


 86%|████████▋ | 350/405 [15:54<00:32,  1.68it/s]

{'loss': 0.4644, 'grad_norm': 1.5587849617004395, 'learning_rate': 1.4567901234567902e-05, 'epoch': 0.86}


                                                 
 86%|████████▋ | 350/405 [16:16<00:32,  1.68it/s]

{'eval_loss': 0.5057828426361084, 'eval_accuracy': 0.6574074029922485, 'eval_runtime': 22.0684, 'eval_samples_per_second': 39.151, 'eval_steps_per_second': 9.788, 'epoch': 0.86}


 89%|████████▉ | 360/405 [16:21<00:27,  1.66it/s]

{'loss': 0.5422, 'grad_norm': 2.9523978233337402, 'learning_rate': 1.2098765432098767e-05, 'epoch': 0.89}


                                                 
 89%|████████▉ | 360/405 [16:43<00:27,  1.66it/s]

{'eval_loss': 0.5100970268249512, 'eval_accuracy': 0.65625, 'eval_runtime': 22.014, 'eval_samples_per_second': 39.248, 'eval_steps_per_second': 9.812, 'epoch': 0.89}


 91%|█████████▏| 370/405 [16:49<00:20,  1.70it/s]

{'loss': 0.5068, 'grad_norm': 0.6925785541534424, 'learning_rate': 9.62962962962963e-06, 'epoch': 0.91}


                                                 
 91%|█████████▏| 370/405 [17:11<00:20,  1.70it/s]

{'eval_loss': 0.510997474193573, 'eval_accuracy': 0.6550925970077515, 'eval_runtime': 22.0385, 'eval_samples_per_second': 39.204, 'eval_steps_per_second': 9.801, 'epoch': 0.91}


 94%|█████████▍| 380/405 [17:17<00:14,  1.69it/s]

{'loss': 0.4871, 'grad_norm': 4.792758464813232, 'learning_rate': 7.160493827160494e-06, 'epoch': 0.94}


                                                 
 94%|█████████▍| 380/405 [17:40<00:14,  1.69it/s]

{'eval_loss': 0.5067716240882874, 'eval_accuracy': 0.6574074029922485, 'eval_runtime': 22.7774, 'eval_samples_per_second': 37.932, 'eval_steps_per_second': 9.483, 'epoch': 0.94}


 96%|█████████▋| 390/405 [17:46<00:09,  1.61it/s]

{'loss': 0.5424, 'grad_norm': 0.7286778688430786, 'learning_rate': 4.691358024691358e-06, 'epoch': 0.96}


                                                 
 96%|█████████▋| 390/405 [18:08<00:09,  1.61it/s]

{'eval_loss': 0.5127418041229248, 'eval_accuracy': 0.6585648059844971, 'eval_runtime': 22.4557, 'eval_samples_per_second': 38.476, 'eval_steps_per_second': 9.619, 'epoch': 0.96}


 99%|█████████▉| 400/405 [18:14<00:03,  1.64it/s]

{'loss': 0.5597, 'grad_norm': 1.920746922492981, 'learning_rate': 2.2222222222222225e-06, 'epoch': 0.99}


                                                 
 99%|█████████▉| 400/405 [18:36<00:03,  1.64it/s]

{'eval_loss': 0.514777421951294, 'eval_accuracy': 0.6643518805503845, 'eval_runtime': 22.263, 'eval_samples_per_second': 38.809, 'eval_steps_per_second': 9.702, 'epoch': 0.99}


100%|██████████| 405/405 [18:43<00:00,  2.77s/it]

{'train_runtime': 1123.0932, 'train_samples_per_second': 2.885, 'train_steps_per_second': 0.361, 'train_loss': 0.7208315537299639, 'epoch': 1.0}





TrainOutput(global_step=405, training_loss=0.7208315537299639, metrics={'train_runtime': 1123.0932, 'train_samples_per_second': 2.885, 'train_steps_per_second': 0.361, 'total_flos': 7.875344185344e+16, 'train_loss': 0.7208315537299639, 'epoch': 1.0})

In [37]:
# trainer.evaluate()
trainer.evaluate(eval_dataset=eval_dataset)

100%|██████████| 216/216 [00:22<00:00,  9.65it/s]


{'eval_loss': 0.5150967240333557,
 'eval_accuracy': 0.6643518805503845,
 'eval_runtime': 22.5042,
 'eval_samples_per_second': 38.393,
 'eval_steps_per_second': 9.598,
 'epoch': 1.0}

In [38]:
trainer.evaluate(eval_dataset=train_dataset)

100%|██████████| 810/810 [01:25<00:00,  9.45it/s]


{'eval_loss': 0.4907015264034271,
 'eval_accuracy': 0.6722221970558167,
 'eval_runtime': 85.844,
 'eval_samples_per_second': 37.743,
 'eval_steps_per_second': 9.436,
 'epoch': 1.0}

In [39]:
import torch
from collections import Counter
from transformers import Trainer, EvalPrediction
import numpy as np

class WeightedVoteTrainer(Trainer):
    def __init__(self, *args, num_votes=5, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_votes = num_votes

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # 进行多次预测
        predictions = []
        confidences = []
        model.train()  # 启用 dropout
        for i in range(self.num_votes):
            torch.manual_seed(i)  # 为每次预测设置不同的随机种子
            inputs_copy = {k: v.clone() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            
            # 添加小的随机噪声到输入
            if 'input_values' in inputs_copy:
                noise = torch.randn_like(inputs_copy['input_values']) * 0.01
                inputs_copy['input_values'] += noise
            
            _, logits, labels = super().prediction_step(model, inputs_copy, prediction_loss_only, ignore_keys)
            probs = torch.softmax(logits, dim=-1)
            confidence, pred = torch.max(probs, dim=-1)
            predictions.append(pred)
            confidences.append(confidence)
        
        model.eval()  # 恢复评估模式
        
        # 将预测结果和置信度堆叠起来
        stacked_preds = torch.stack(predictions, dim=0)
        stacked_confidences = torch.stack(confidences, dim=0)
        
        # 计算加权投票结果
        num_classes = logits.shape[-1]
        weighted_votes = torch.zeros((stacked_preds.shape[1], num_classes), device=stacked_preds.device)
        for i in range(self.num_votes):
            weighted_votes.scatter_add_(1, stacked_preds[i].unsqueeze(1), stacked_confidences[i].unsqueeze(1))
        
        weighted_vote_result = torch.argmax(weighted_votes, dim=1)

        return None, weighted_vote_result, labels

def compute_metrics_with_weighted_vote(eval_pred: EvalPrediction):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    
    # 确保预测和标签都是 NumPy 数组
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()
    
    # 计算准确率
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# 创建新的 Trainer 实例
weighted_vote_trainer = WeightedVoteTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_with_weighted_vote,
    num_votes=6  # 设置投票次数
)

# 进行评估
eval_results = weighted_vote_trainer.evaluate(eval_dataset=eval_dataset)
print(eval_results)

100%|██████████| 216/216 [00:39<00:00,  5.41it/s]

{'eval_model_preparation_time': 0.0037, 'eval_accuracy': 0.6921296296296297, 'eval_runtime': 40.2358, 'eval_samples_per_second': 21.473, 'eval_steps_per_second': 5.368}





In [40]:
import pickle
import torch

# 1. 准备要保存的信息
model_info = {
    'model_state': model.state_dict(),          # 模型权重
    'config': config,                           # 模型配置
    'processor': processor,                     # 特征提取器
    'label_list': label_list,                  # 标签列表
    'training_args': training_args,             # 训练参数
    'pooling_mode': config.pooling_mode,        # 池化模式
}

# 2. 保存模型信息
save_path = "wav2vec2.pkl"
with open(save_path, 'wb') as f:
    pickle.dump(model_info, f)