In [15]:
# === Standard ===
import os
import re
import random
import numpy as np
from pathlib import Path
from collections import defaultdict

# === PyTorch ===
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# === Audio ===
import soundfile as sf

# === CV ===
from sklearn.model_selection import KFold

# === Reproducibility ===
SEED = 1234
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value, in that order.
for _seed_fn in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [2]:
# Single-letter emotion code found in a filename -> (emotion name, class index).
EMOTION_MAP = {
    "W": ("anger", 0),
    "L": ("boredom", 1),
    "E": ("disgust", 2),
    "A": ("fear", 3),
    "F": ("happiness", 4),
    "T": ("sadness", 5),
    "N": ("neutral", 6),
}

N_CLASSES = len(EMOTION_MAP)

# <2-digit speaker><letter + 2-digit text id><emotion letter><optional version letter>.wav
_FILENAME_RE = re.compile(r'(\d{2})([a-z]\d{2})([A-Z])([a-z]?)\.wav')


def parse_filename(filename):
    """Split an utterance filename such as ``03a01Fa.wav`` into its metadata fields.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        A dict with the keys ``filename``, ``speaker_id``, ``text_id``,
        ``emotion_code``, ``emotion``, ``emotion_idx`` and ``version``
        (``None`` when the name carries no version letter), or ``None`` when
        the name does not follow the expected pattern or uses an emotion
        letter that is not listed in ``EMOTION_MAP``.
    """
    # fullmatch: the whole name must fit the pattern, so trailing junk such as
    # "03a01Fa.wav.bak" is rejected instead of being accepted as a prefix match.
    m = _FILENAME_RE.fullmatch(filename)
    if m is None:
        return None

    speaker, text, emotion_code, version = m.groups()

    # The pattern allows any capital letter; treat an unknown code like any
    # other unparseable name rather than failing with KeyError.
    mapped = EMOTION_MAP.get(emotion_code)
    if mapped is None:
        return None
    emotion, emotion_idx = mapped

    return {
        "filename": filename,
        "speaker_id": speaker,
        "text_id": text,
        "emotion_code": emotion_code,
        "emotion": emotion,
        "emotion_idx": emotion_idx,
        "version": version or None
    }

In [3]:
# Switch the working directory to the attached `dnn-models` input; presumably this
# is what lets a later cell do `from dnn_models import ...` -- TODO confirm.
%cd /kaggle/input/dnn-models

/kaggle/input/dnn-models


In [4]:
# Directory holding the utterance .wav files.
EMODB_ROOT = Path("/kaggle/input/berlin-database-of-emotional-speech-emodb/wav")

# Collect the parsed metadata of every .wav whose name parse_filename accepts.
# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted first: the order of `meta` (and of everything derived from
# it, including what the seeded shuffle sees) is then the same on every run.
meta = []
for wav in sorted(EMODB_ROOT.glob("*.wav")):
    info = parse_filename(wav.name)
    if info:
        meta.append(info)

print(f"Loaded {len(meta)} utterances")

Loaded 535 utterances


In [21]:
class EmoDBDataset(Dataset):
    """Chunk-level dataset: each sample is one fixed-length window of an utterance.

    Every utterance in ``items`` is cut into windows of ``cw_len`` milliseconds
    taken every ``cw_shift`` milliseconds; a sample is the pair
    ``(window as float32 tensor, emotion_idx as long tensor)``.

    Args:
        items: Utterance metadata dicts; the keys ``"filename"`` and
            ``"emotion_idx"`` are read.
        wav_root: Directory joined with each filename to locate the audio.
        fs: Sample rate every file is required to have.
        cw_len: Window length in milliseconds.
        cw_shift: Hop between consecutive window starts in milliseconds.
        audio_cache: Optional dict mapping filename -> decoded signal. It is
            filled in place, so passing the same dict to several datasets
            lets them share decoded audio.
    """

    def __init__(self, items, wav_root, fs=16000,
                 cw_len=200, cw_shift=10,
                 audio_cache=None):
        self.items = items
        self.wav_root = wav_root

        # Use a private cache when the caller did not hand one in.
        if audio_cache is None:
            audio_cache = {}
        self.audio_cache = audio_cache

        self.fs = fs
        # Milliseconds -> number of samples.
        self.wlen = int(fs * cw_len / 1000)
        self.wshift = int(fs * cw_shift / 1000)

        # One (item, start_sample) entry per window; built eagerly below.
        self.index = []
        self._prepare_chunks()

    def _load_audio(self, filename):
        """Return the decoded signal for ``filename``, reading it from disk at most once."""
        cache = self.audio_cache
        if filename in cache:
            return cache[filename]

        signal, sr = sf.read(self.wav_root / filename)
        assert sr == self.fs
        cache[filename] = signal
        return signal

    def _prepare_chunks(self):
        """Append an ``(item, start_sample)`` entry to ``self.index`` for every window."""
        for utt in self.items:
            n_samples = len(self._load_audio(utt["filename"]))
            # Window starts are strictly below n_samples - wlen.
            self.index.extend(
                (utt, start)
                for start in range(0, n_samples - self.wlen, self.wshift)
            )

    def __len__(self):
        """Total number of windows across all utterances."""
        return len(self.index)

    def __getitem__(self, idx):
        """Return ``(window, label)`` tensors for the ``idx``-th window."""
        utt, start = self.index[idx]
        waveform = self._load_audio(utt["filename"])
        window = waveform[start:start + self.wlen]

        x = torch.tensor(window, dtype=torch.float32)
        y = torch.tensor(utt["emotion_idx"], dtype=torch.long)
        return (x, y)

In [6]:
# Speaker ids that take part in the cross-validation, and the same ids as an
# array so they can be indexed with the integer index arrays KFold produces.
SPEAKERS = ['03', '08', '09', '10', '11', '12', '13', '14', '15', '16']
speaker_ids = np.array(SPEAKERS)

# Ten splits over ten speakers: every fold holds out exactly one speaker.
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Group the utterance metadata by the speaker it belongs to.
speaker_to_items = defaultdict(list)
for item in meta:
    speaker_to_items[item["speaker_id"]].append(item)

In [7]:
from dnn_models import SincNet, MLP

def build_model(fs=16000, cw_len=200):
    """Build a fresh SincNet -> MLP -> linear classifier and move it to ``device``.

    Args:
        fs: Sample rate of the input audio in Hz.
        cw_len: Length of one input window in milliseconds. The network's
            input size is ``int(fs * cw_len / 1000)`` samples, the same
            formula the dataset and the utterance-level evaluation use, so
            the three stay in agreement when the window is changed.

    Returns:
        An ``nn.Sequential(cnn, dnn, classifier)`` already placed on ``device``;
        the final layer has ``N_CLASSES`` outputs.

    The option dictionaries are handed to ``SincNet`` / ``MLP`` from
    ``dnn_models`` as they are; see that module for what each key means.
    """
    # Samples per input window (3200 with the defaults).
    input_dim = int(fs * cw_len / 1000)

    cnn_options = {
        "cnn_N_filt": [80, 60, 60],
        "cnn_len_filt": [251, 5, 5],
        "cnn_max_pool_len": [3, 3, 3],
        "cnn_act": ["relu", "relu", "relu"],
        "cnn_drop": [0.0, 0.0, 0.0],
        "cnn_use_laynorm": [True, True, True],
        "cnn_use_batchnorm": [False, False, False],
        "cnn_use_laynorm_inp": True,
        "cnn_use_batchnorm_inp": False,
        "input_dim": input_dim,
        "fs": fs
    }

    dnn_options = {
        "input_dim": None,  # filled in below once the CNN output size is known
        "fc_lay": [2048, 2048],
        "fc_drop": [0.5, 0.5],
        "fc_use_laynorm": [True, True],
        "fc_use_batchnorm": [False, False],
        "fc_use_laynorm_inp": False,
        "fc_use_batchnorm_inp": False,
        "fc_act": ["relu", "relu"]
    }

    cnn = SincNet(cnn_options)
    # The MLP consumes whatever feature size the CNN reports.
    dnn_options["input_dim"] = cnn.out_dim
    dnn = MLP(dnn_options)
    # Map the last fully connected layer onto one logit per emotion class.
    classifier = nn.Linear(dnn_options["fc_lay"][-1], N_CLASSES)

    model = nn.Sequential(cnn, dnn, classifier)
    return model.to(device)

In [16]:
def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader`` and return the batch-averaged accuracy.

    Args:
        model: Network to optimise; switched to training mode here.
        loader: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimiser stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Fraction of samples whose arg-max prediction matched the label. The
        predictions come from the forward pass made *before* that batch's
        weight update.
    """
    model.train()
    n_hits = 0
    n_seen = 0

    for inputs, targets in tqdm(loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        criterion(logits, targets).backward()
        optimizer.step()

        n_hits += (logits.argmax(1) == targets).sum().item()
        n_seen += targets.size(0)

    return n_hits / n_seen


def eval_epoch(model, loader):
    """Return the accuracy of ``model`` over every batch in ``loader``.

    The model is put in evaluation mode and gradients are disabled; each
    sample counts once, and a prediction is the arg-max over dimension 1 of
    the model output.
    """
    model.eval()
    n_hits = 0
    n_seen = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            predictions = model(inputs).argmax(1)
            n_hits += (predictions == targets).sum().item()
            n_seen += targets.size(0)

    return n_hits / n_seen

In [18]:
from collections import defaultdict

def eval_utterance_level(model, items, wav_root,
                         fs=16000, cw_len=200, cw_shift=10,
                         batch_size=128):
    """
    Returns utterance-level accuracy using mean posterior voting.

    Each utterance is read from ``wav_root``, cut into windows of ``cw_len``
    milliseconds every ``cw_shift`` milliseconds, and the softmax outputs of
    all its windows are averaged; the arg-max of that average is the
    utterance's prediction.

    Args:
        model: Network mapping a batch of windows to class logits; put into
            evaluation mode here.
        items: Utterance metadata dicts; ``"filename"`` and ``"emotion_idx"``
            are read.
        wav_root: Directory joined with each filename to locate the audio.
        fs: Sample rate every file is required to have.
        cw_len: Window length in milliseconds.
        cw_shift: Hop between window starts in milliseconds.
        batch_size: Number of windows sent through the model at once.

    Returns:
        Fraction of scored utterances that were classified correctly.
        Utterances too short to yield a single window are skipped and do not
        count towards the total.

    Raises:
        ValueError: If no utterance could be scored (``items`` is empty or
            every utterance is too short), instead of an unexplained
            division by zero.
    """
    model.eval()

    wlen = int(fs * cw_len / 1000)
    wshift = int(fs * cw_shift / 1000)

    correct = 0
    total = 0

    with torch.no_grad():
        for item in items:
            signal, sr = sf.read(wav_root / item["filename"])
            assert sr == fs

            # Same window starts as before: strictly below len(signal) - wlen.
            starts = range(0, len(signal) - wlen, wshift)
            if len(starts) == 0:
                continue

            # Stack into one ndarray first; handing torch.tensor a Python list
            # of ndarrays takes a very slow element-by-element path.
            windows = np.stack([signal[beg:beg + wlen] for beg in starts])
            chunks = torch.tensor(windows, dtype=torch.float32).to(device)

            # batch inference, accumulating the per-class posterior mass
            probs_sum = torch.zeros(N_CLASSES, device=device)

            for i in range(0, len(chunks), batch_size):
                out = model(chunks[i:i + batch_size])
                probs_sum += torch.softmax(out, dim=1).sum(dim=0)

            probs_mean = probs_sum / len(chunks)
            pred = probs_mean.argmax().item()

            if pred == item["emotion_idx"]:
                correct += 1
            total += 1

    if total == 0:
        raise ValueError(
            "eval_utterance_level: no utterance produced at least one window; "
            "accuracy is undefined"
        )

    return correct / total

In [22]:
# Training hyper-parameters, named once instead of being scattered as literals.
N_EPOCHS = 30
BATCH_SIZE = 128
LEARNING_RATE = 0.001

fold_accuracies = []
# Decoded audio shared by every dataset built below, so each file is read once.
audio_cache = {}

for fold, (train_idx, test_idx) in enumerate(kf.split(speaker_ids), 1):
    train_speakers = speaker_ids[train_idx]
    test_speakers  = speaker_ids[test_idx]

    # Flatten the per-speaker lists in speaker order (a comprehension avoids
    # the repeated list copying of sum(list_of_lists, [])).
    train_items = [item for s in train_speakers for item in speaker_to_items[s]]
    test_items  = [item for s in test_speakers for item in speaker_to_items[s]]

    train_set = EmoDBDataset(
        train_items,
        EMODB_ROOT,
        audio_cache=audio_cache
    )
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    # NOTE(review): test_set is not used anywhere in this cell (evaluation
    # below works from test_items directly); kept in case another cell needs it.
    test_set = EmoDBDataset(
        test_items,
        EMODB_ROOT,
        audio_cache=audio_cache
    )

    # Fresh model and optimiser for every fold so nothing leaks between folds.
    model = build_model()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    print(f"\n=== Fold {fold} | Train speakers: {train_speakers} | Test speakers: {test_speakers} ===")
    for epoch in range(1, N_EPOCHS + 1):
        tr_acc = train_epoch(model, train_loader, optimizer, criterion)

    te_acc = eval_utterance_level(model, test_items, EMODB_ROOT)
    # Report the last epoch's training accuracy next to the held-out score so
    # the gap between the two is visible per fold.
    print(f"Final-epoch train chunk accuracy: {tr_acc:.3f}")
    print(f"Utterance-level accuracy: {te_acc:.3f}")

    fold_accuracies.append(te_acc)


=== Fold 1 ===

=== Fold 1 | Train speakers: ['03' '08' '09' '10' '11' '12' '13' '15' '16'] | Test speakers: ['14'] ===


100%|██████████| 939/939 [00:45<00:00, 20.44it/s]
100%|██████████| 939/939 [00:45<00:00, 20.46it/s]
100%|██████████| 939/939 [00:45<00:00, 20.45it/s]
100%|██████████| 939/939 [00:45<00:00, 20.42it/s]
100%|██████████| 939/939 [00:45<00:00, 20.49it/s]
100%|██████████| 939/939 [00:46<00:00, 20.41it/s]
100%|██████████| 939/939 [00:45<00:00, 20.42it/s]
100%|██████████| 939/939 [00:46<00:00, 20.38it/s]
100%|██████████| 939/939 [00:46<00:00, 20.36it/s]
100%|██████████| 939/939 [00:46<00:00, 20.41it/s]
100%|██████████| 939/939 [00:45<00:00, 20.41it/s]
100%|██████████| 939/939 [00:45<00:00, 20.43it/s]
100%|██████████| 939/939 [00:46<00:00, 20.41it/s]
100%|██████████| 939/939 [00:45<00:00, 20.45it/s]
100%|██████████| 939/939 [00:46<00:00, 20.36it/s]
100%|██████████| 939/939 [00:46<00:00, 20.37it/s]
100%|██████████| 939/939 [00:46<00:00, 20.21it/s]
100%|██████████| 939/939 [00:46<00:00, 20.40it/s]
100%|██████████| 939/939 [00:46<00:00, 20.35it/s]
100%|██████████| 939/939 [00:46<00:00, 20.04it/s]


Utterance-level accuracy: 0.855

=== Fold 2 ===

=== Fold 2 | Train speakers: ['03' '08' '10' '11' '12' '13' '14' '15' '16'] | Test speakers: ['09'] ===


100%|██████████| 995/995 [00:48<00:00, 20.44it/s]
100%|██████████| 995/995 [00:48<00:00, 20.47it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.48it/s]
100%|██████████| 995/995 [00:48<00:00, 20.47it/s]
100%|██████████| 995/995 [00:48<00:00, 20.43it/s]
100%|██████████| 995/995 [00:48<00:00, 20.44it/s]
100%|██████████| 995/995 [00:48<00:00, 20.45it/s]
100%|██████████| 995/995 [00:48<00:00, 20.47it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.45it/s]
100%|██████████| 995/995 [00:48<00:00, 20.44it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.47it/s]
100%|██████████| 995/995 [00:48<00:00, 20.48it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]
100%|██████████| 995/995 [00:48<00:00, 20.46it/s]


Utterance-level accuracy: 0.581

=== Fold 3 ===

=== Fold 3 | Train speakers: ['03' '08' '09' '10' '11' '12' '13' '14' '15'] | Test speakers: ['16'] ===


100%|██████████| 921/921 [00:45<00:00, 20.43it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.42it/s]
100%|██████████| 921/921 [00:45<00:00, 20.43it/s]
100%|██████████| 921/921 [00:45<00:00, 20.44it/s]
100%|██████████| 921/921 [00:45<00:00, 20.44it/s]
100%|██████████| 921/921 [00:45<00:00, 20.46it/s]
100%|██████████| 921/921 [00:45<00:00, 20.44it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.43it/s]
100%|██████████| 921/921 [00:45<00:00, 20.46it/s]
100%|██████████| 921/921 [00:45<00:00, 20.46it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.44it/s]
100%|██████████| 921/921 [00:45<00:00, 20.45it/s]
100%|██████████| 921/921 [00:45<00:00, 20.44it/s]
100%|██████████| 921/921 [00:45<00:00, 20.43it/s]
100%|██████████| 921/921 [00:45<00:00, 20.43it/s]


Utterance-level accuracy: 0.620

=== Fold 4 ===

=== Fold 4 | Train speakers: ['03' '09' '10' '11' '12' '13' '14' '15' '16'] | Test speakers: ['08'] ===


100%|██████████| 950/950 [00:47<00:00, 19.95it/s]
100%|██████████| 950/950 [00:47<00:00, 19.95it/s]
100%|██████████| 950/950 [00:47<00:00, 19.95it/s]
100%|██████████| 950/950 [00:47<00:00, 19.97it/s]
100%|██████████| 950/950 [00:47<00:00, 19.95it/s]
100%|██████████| 950/950 [00:46<00:00, 20.43it/s]
100%|██████████| 950/950 [00:46<00:00, 20.47it/s]
100%|██████████| 950/950 [00:46<00:00, 20.46it/s]
100%|██████████| 950/950 [00:46<00:00, 20.47it/s]
100%|██████████| 950/950 [00:46<00:00, 20.47it/s]
100%|██████████| 950/950 [00:46<00:00, 20.46it/s]
100%|██████████| 950/950 [00:46<00:00, 20.47it/s]
100%|██████████| 950/950 [00:46<00:00, 20.47it/s]
100%|██████████| 950/950 [00:46<00:00, 20.45it/s]
100%|██████████| 950/950 [00:46<00:00, 20.43it/s]
100%|██████████| 950/950 [00:46<00:00, 20.30it/s]
100%|██████████| 950/950 [00:46<00:00, 20.34it/s]
100%|██████████| 950/950 [00:46<00:00, 20.35it/s]
100%|██████████| 950/950 [00:46<00:00, 20.35it/s]
100%|██████████| 950/950 [00:46<00:00, 20.35it/s]


Utterance-level accuracy: 0.724

=== Fold 5 ===

=== Fold 5 | Train speakers: ['08' '09' '10' '11' '12' '13' '14' '15' '16'] | Test speakers: ['03'] ===


100%|██████████| 986/986 [00:48<00:00, 20.33it/s]
100%|██████████| 986/986 [00:48<00:00, 20.36it/s]
100%|██████████| 986/986 [00:48<00:00, 20.34it/s]
100%|██████████| 986/986 [00:48<00:00, 20.33it/s]
100%|██████████| 986/986 [00:48<00:00, 20.16it/s]
100%|██████████| 986/986 [00:48<00:00, 20.24it/s]
100%|██████████| 986/986 [00:48<00:00, 20.31it/s]
100%|██████████| 986/986 [00:48<00:00, 20.31it/s]
100%|██████████| 986/986 [00:48<00:00, 20.34it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.30it/s]
100%|██████████| 986/986 [00:48<00:00, 20.33it/s]
100%|██████████| 986/986 [00:48<00:00, 20.34it/s]
100%|██████████| 986/986 [00:48<00:00, 20.35it/s]
100%|██████████| 986/986 [00:48<00:00, 20.37it/s]
100%|██████████| 986/986 [00:48<00:00, 20.39it/s]


Utterance-level accuracy: 0.837

=== Fold 6 ===

=== Fold 6 | Train speakers: ['03' '08' '09' '10' '11' '12' '13' '14' '16'] | Test speakers: ['15'] ===


100%|██████████| 979/979 [00:48<00:00, 20.35it/s]
100%|██████████| 979/979 [00:48<00:00, 20.37it/s]
100%|██████████| 979/979 [00:48<00:00, 20.39it/s]
100%|██████████| 979/979 [00:48<00:00, 20.39it/s]
100%|██████████| 979/979 [00:48<00:00, 20.39it/s]
100%|██████████| 979/979 [00:47<00:00, 20.40it/s]
100%|██████████| 979/979 [00:48<00:00, 20.39it/s]
100%|██████████| 979/979 [00:48<00:00, 20.38it/s]
100%|██████████| 979/979 [00:47<00:00, 20.40it/s]
100%|██████████| 979/979 [00:47<00:00, 20.40it/s]
100%|██████████| 979/979 [00:48<00:00, 20.39it/s]
100%|██████████| 979/979 [00:47<00:00, 20.41it/s]
100%|██████████| 979/979 [00:47<00:00, 20.40it/s]
100%|██████████| 979/979 [00:48<00:00, 20.34it/s]
100%|██████████| 979/979 [00:48<00:00, 20.34it/s]
100%|██████████| 979/979 [00:48<00:00, 20.34it/s]
100%|██████████| 979/979 [00:48<00:00, 20.35it/s]
100%|██████████| 979/979 [00:48<00:00, 20.33it/s]
100%|██████████| 979/979 [00:48<00:00, 20.35it/s]
100%|██████████| 979/979 [00:48<00:00, 20.33it/s]


Utterance-level accuracy: 0.571

=== Fold 7 ===

=== Fold 7 | Train speakers: ['03' '08' '09' '10' '12' '13' '14' '15' '16'] | Test speakers: ['11'] ===


100%|██████████| 964/964 [00:47<00:00, 20.28it/s]
100%|██████████| 964/964 [00:47<00:00, 20.29it/s]
100%|██████████| 964/964 [00:47<00:00, 20.29it/s]
100%|██████████| 964/964 [00:47<00:00, 20.29it/s]
100%|██████████| 964/964 [00:47<00:00, 20.27it/s]
100%|██████████| 964/964 [00:47<00:00, 20.29it/s]
100%|██████████| 964/964 [00:47<00:00, 20.36it/s]
100%|██████████| 964/964 [00:47<00:00, 20.32it/s]
100%|██████████| 964/964 [00:47<00:00, 20.32it/s]
100%|██████████| 964/964 [00:47<00:00, 20.23it/s]
100%|██████████| 964/964 [00:47<00:00, 20.22it/s]
100%|██████████| 964/964 [00:47<00:00, 20.22it/s]
100%|██████████| 964/964 [00:47<00:00, 20.32it/s]
100%|██████████| 964/964 [00:47<00:00, 20.33it/s]
100%|██████████| 964/964 [00:47<00:00, 20.30it/s]
100%|██████████| 964/964 [00:47<00:00, 20.32it/s]
100%|██████████| 964/964 [00:47<00:00, 20.34it/s]
100%|██████████| 964/964 [00:47<00:00, 20.32it/s]
100%|██████████| 964/964 [00:47<00:00, 20.36it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]


Utterance-level accuracy: 0.564

=== Fold 8 ===

=== Fold 8 | Train speakers: ['03' '08' '09' '10' '11' '13' '14' '15' '16'] | Test speakers: ['12'] ===


100%|██████████| 1007/1007 [00:49<00:00, 20.40it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.40it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.41it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.40it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.41it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.39it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.40it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.39it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.40it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.37it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.37it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.32it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.33it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.33it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.32it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.33it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.31it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.32it/s]
100%|██████████| 1007/1007 [00:49<00:00, 20.32it/s]
100%|███████

Utterance-level accuracy: 0.600

=== Fold 9 ===

=== Fold 9 | Train speakers: ['03' '08' '09' '10' '11' '12' '14' '15' '16'] | Test speakers: ['13'] ===


100%|██████████| 964/964 [00:47<00:00, 20.33it/s]
100%|██████████| 964/964 [00:47<00:00, 20.24it/s]
100%|██████████| 964/964 [00:47<00:00, 20.25it/s]
100%|██████████| 964/964 [00:47<00:00, 20.29it/s]
100%|██████████| 964/964 [00:47<00:00, 20.34it/s]
100%|██████████| 964/964 [00:47<00:00, 20.31it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.34it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.34it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]
100%|██████████| 964/964 [00:47<00:00, 20.34it/s]
100%|██████████| 964/964 [00:47<00:00, 20.35it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]
100%|██████████| 964/964 [00:47<00:00, 20.37it/s]


Utterance-level accuracy: 0.672

=== Fold 10 ===

=== Fold 10 | Train speakers: ['03' '08' '09' '11' '12' '13' '14' '15' '16'] | Test speakers: ['10'] ===


100%|██████████| 1014/1014 [00:50<00:00, 20.03it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.01it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.04it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.05it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.05it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.08it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.04it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.04it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.06it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.05it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.05it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.05it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.08it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.13it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.14it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.16it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.13it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.17it/s]
100%|██████████| 1014/1014 [00:50<00:00, 20.14it/s]
100%|███████

Utterance-level accuracy: 0.711


In [23]:
# Utterance-level accuracy of each fold, in the order the folds were run.
fold_accuracies

[0.855072463768116,
 0.5813953488372093,
 0.6197183098591549,
 0.7241379310344828,
 0.8367346938775511,
 0.5714285714285714,
 0.5636363636363636,
 0.6,
 0.6721311475409836,
 0.7105263157894737]

In [27]:
# Unweighted mean of the per-fold utterance-level accuracies.
sum(fold_accuracies)/len(fold_accuracies)

0.6734781145771906

In [29]:
# Best single fold. NOTE(review): this is the most favourable held-out speaker,
# not an estimate of overall performance -- the mean over folds is that figure.
max(fold_accuracies)

0.855072463768116