# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Data loading](#Data-loading)
* [Strata](#Strata)
* [Train test split](#Train-test-split)
* [Results](#Results)

# Introduction

To speed development, 5-fold train-test splits are preprocessed and prepared as separate .feather files. Although scaling, dimensionality reduction, and resampling components may be reconfigured downstream, they are precomputed here for initial model selection.

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number generator used by this notebook with `seed`.

    Exports PYTHONHASHSEED and then seeds, in order, the stdlib `random` module,
    NumPy's global generator, and TensorFlow.

    Note: Python reads PYTHONHASHSEED at interpreter start-up, so the value set
    here is only picked up by processes launched afterwards.
    """
    environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random_seed, np_seed, set_seed):
        seeder(seed)


# fix the notebook-wide seed, then drop the seeding helpers from the namespace
SEED = 2021
reset_seeds(SEED)
del environ, random_seed, np_seed, set_seed, reset_seeds

In [2]:
# extensions
# autotime reports the run time of every cell (the "time: ..." lines in the outputs)
%load_ext autotime
# NOTE(review): lab_black and nb_black both look like code-formatting extensions
# (presumably JupyterLab vs. classic notebook) -- confirm that loading both is intended
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import numpy as np
import pandas as pd

# utility
from gc import collect as gc_collect
from os import mkdir
from tqdm.notebook import tqdm

# typing
from sklearn.base import BaseEstimator
from typing import Dict, List, Tuple, Sequence

# faster pandas & sklearn
import swifter
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# preprocessing
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import OneSidedSelection
from sklearn.random_projection import SparseRandomProjection

# cross validator
from sklearn.model_selection import StratifiedGroupKFold

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

# hide warnings
import warnings

warnings.filterwarnings("ignore")
del warnings

time: 5.61 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of FRILL .feather files
FRILL_FEATHERS_FOLDER = "../1.0-mic-extract_FRILL_embeddings"

# Location where this notebook will output
# (one cv_<fold> subfolder per split is written inside it)
DATA_OUT_FOLDER = "."

# Number of cross-validation folds; strata with fewer members than this
# are merged before splitting
N_SPLITS = 5

# assigning to _ keeps the return value of gc.collect out of the cell output
_ = gc_collect()

time: 99.6 ms


# Data loading

In [5]:
# label columns retained from the label files ("id" becomes the index below)
keep_columns = [
    "id",
    "source",
    "speaker_id",
    "speaker_gender",
    "emo",
    "valence",
    "neg",
    "neu",
    "lang1",
    "length",
]

# FRILL embeddings: the dev partition stacked on top of the nondev partition
dev_data = pd.read_feather(f"{FRILL_FEATHERS_FOLDER}/dev_FRILL.feather")
nondev_data = pd.read_feather(f"{FRILL_FEATHERS_FOLDER}/nondev_FRILL.feather")
data = pd.concat([dev_data, nondev_data]).set_index("id")
del dev_data, nondev_data

# matching labels, restricted to keep_columns and stacked in the same order
dev_labels = pd.read_feather(
    f"{FRILL_FEATHERS_FOLDER}/dev_labels.feather", columns=keep_columns
)
nondev_labels = pd.read_feather(
    f"{FRILL_FEATHERS_FOLDER}/nondev_labels.feather", columns=keep_columns
)
labels = pd.concat([dev_labels, nondev_labels]).set_index("id")
del dev_labels, nondev_labels

# embeddings and labels must be row-aligned
assert all(data.index == labels.index)
del keep_columns
_ = gc_collect()

time: 2.23 s


In [6]:
# uncomment lines to undersample data
# labels = labels.sample(n=100, random_state=SEED)
# data = data.loc[labels.index]

assert all(data.index == labels.index)
_ = gc_collect()

time: 100 ms


In [7]:
data.head(1)
data.info()
labels.head(1)
labels.info()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.226529,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233


<class 'pandas.core.frame.DataFrame'>
UInt64Index: 86752 entries, 0 to 87363
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 678.4 MB


Unnamed: 0_level_0,source,speaker_id,speaker_gender,emo,valence,neg,neu,lang1,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,aesdd,aesdd.1,f,ang,0,True,False,ell,medium


<class 'pandas.core.frame.DataFrame'>
UInt64Index: 86752 entries, 0 to 87363
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   source          86752 non-null  category
 1   speaker_id      86752 non-null  category
 2   speaker_gender  86752 non-null  category
 3   emo             86752 non-null  category
 4   valence         86752 non-null  int8    
 5   neg             86752 non-null  bool    
 6   neu             86752 non-null  bool    
 7   lang1           86752 non-null  category
 8   length          86752 non-null  category
dtypes: bool(2), category(6), int8(1)
memory usage: 1.5 MB
time: 247 ms


# Strata

In this section, strata are set up for the train-test split.

In [8]:
# Each sample's stratum label is the concatenation of its stratification fields,
# so only the field combinations that actually occur in the data become strata.
strata_fields = labels.loc[
    :, ["source", "speaker_gender", "emo", "valence", "lang1", "length"]
].astype(str)  # valence is numeric; cast every field to str before joining
# vectorized element-wise concatenation (no separator) instead of a row-wise apply
strata = strata_fields.iloc[:, 0].str.cat(strata_fields.iloc[:, 1:])
strata.name = None  # the result is a plain, unnamed Series of stratum labels
del strata_fields

Dask Apply: 100%|██████████| 16/16 [00:04<00:00,  3.37it/s]


time: 10.2 s


In [9]:
# utility function for identifying strata with exactly i occurrences
def get_solo(i: int, strata_: pd.Series) -> np.ndarray:
    """Return the sorted, unique labels of the strata that have exactly i members.

    Parameters
    ----------
    i : int
        Exact membership count to look for.
    strata_ : pd.Series
        Stratum label of every sample.

    Returns
    -------
    np.ndarray
        The labels that occur exactly ``i`` times in ``strata_``, in ascending
        order; empty when no stratum has that size.
    """
    strata_counts = strata_.value_counts()
    has_i_members = (strata_counts == i).to_numpy()
    # np.unique returns the labels sorted, which keeps the result deterministic
    return np.unique(strata_counts.index[has_i_members].to_numpy())


# get solos, print stuff
def get_onlys(
    strata_: pd.Series, print_me: str = "", n_splits: int = N_SPLITS
) -> List[Dict[int, np.ndarray]]:
    """Report and collect the strata that have fewer than n_splits members.

    Prints print_me (a blank line when it is empty) and then, for every size i
    from 1 to n_splits - 1, how many strata have exactly i members.

    Returns a list of single-entry dicts {i: labels} in ascending i, with one
    entry per size for which at least one stratum exists.
    """
    print(print_me)
    undersized = []
    for i in range(1, n_splits):
        labels_of_size: np.ndarray = get_solo(i, strata_)
        n_found = labels_of_size.size
        print(f"only {i}:", n_found)
        if n_found:  # >= 1 strata with only i samples
            undersized.append({i: labels_of_size})
    return undersized


def process_strata(strata: pd.Series, n_splits: int = N_SPLITS) -> pd.Series:
    """Merge undersized strata until none has fewer than n_splits members.

    Each pass asks get_onlys for the strata whose size lies in 1..n_splits - 1
    and relabels groups of them as a new "stratum_group_<k>" stratum. Passes
    repeat until get_onlys returns an empty list.

    Parameters
    ----------
    strata : pd.Series
        Stratum label of every sample.
    n_splits : int
        Number of cross-validation folds, i.e. the minimum stratum size wanted.

    Returns
    -------
    pd.Series
        The relabelled strata. Every merge rebinds the local name to the result
        of Series.replace rather than editing the Series in place.
    """

    # count numbers the merged groups; get_onlys_calls only feeds the progress message
    count = get_onlys_calls = 0

    # get_onlys returns an empty (falsy) list once no undersized stratum is left
    while onlys := get_onlys(
        strata,
        print_me=f"merge passes performed: {get_onlys_calls}",
        n_splits=n_splits,
    ):
        get_onlys_calls += 1
        if len(onlys) == 1:
            # all undersized strata share the same size (only_key)
            last = onlys[0]
            strata_to_merge: np.ndarray = list(last.values())[0]
            only_key = list(last.keys())[0]
            # ceil(n_splits / only_key): strata of this size needed to reach n_splits members
            tuplet_size = n_splits // only_key + (1 if n_splits % only_key else 0)
            # perform tuplet merge
            # NOTE(review): interval divides by n_splits rather than tuplet_size, so for
            # only_key > 1 fewer tuplets are formed than the labels would allow and the
            # rest falls through to the remainder handling below -- confirm intended
            interval = len(strata_to_merge) // n_splits
            # each tuplet takes one label from each of tuplet_size consecutive slices
            for strata_tuplet in zip(
                *[
                    strata_to_merge[interval * i : interval * (i + 1)]
                    for i in range(tuplet_size)
                ]
            ):
                strata = strata.replace(strata_tuplet, f"stratum_group_{count}")
                count += 1
            # labels not consumed by the tuplet merge
            remainder = strata_to_merge[tuplet_size * interval :]
            if len(remainder) == 1:
                # process remainder unmatched
                # attach the lone stratum to a randomly drawn stratum of the smallest
                # size >= n_splits that exists
                n = n_splits
                strata_counts = strata.value_counts()
                # NOTE(review): never terminates if no stratum has >= n_splits members
                while not (candidates := strata_counts.loc[strata_counts == n]).size:
                    n += 1
                strata = strata.replace(
                    [remainder[0], candidates.sample(n=1, random_state=SEED).index[0]],
                    f"stratum_group_{count}",
                )
                count += 1
            else:
                # self-pair last
                # leftovers are merged two at a time; an odd one out stays as it is
                # and is reported again by get_onlys on the next pass
                remainder = remainder.tolist()
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
        else:
            # several undersized sizes: onlys is ordered by ascending size, so the
            # smallest strata are paired with the largest ones
            # pop_onlys removes the entry at the given position and returns its labels as a list
            pop_onlys = lambda _: list(onlys.pop(_).values())[0].tolist()
            while len(onlys) >= 2:
                # pop the ends
                shortside = pop_onlys(0)
                longside = pop_onlys(-1)
                # merge until one end empty
                # labels left on the longer list are not merged in this pass
                while shortside and longside:
                    strata = strata.replace(
                        (shortside.pop(), longside.pop()), f"stratum_group_{count}"
                    )
                    count += 1
            if onlys:
                # self-pair middle
                # an odd number of sizes leaves one entry, whose strata are paired with each other
                remainder = pop_onlys(0)
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
    return strata


_ = gc_collect()

time: 107 ms


In [10]:
# merge undersized strata so that each stratum can be spread over N_SPLITS folds
STRATA = process_strata(strata, n_splits=N_SPLITS)
del strata
# stratum sizes after merging
STRATA.value_counts()

merge passes performed: 0
only 1: 53
only 2: 39
only 3: 31
only 4: 17
merge passes performed: 1
only 1: 36
only 2: 8
only 3: 0
only 4: 0
merge passes performed: 2
only 1: 28
only 2: 0
only 3: 8
only 4: 0
merge passes performed: 3
only 1: 20
only 2: 0
only 3: 0
only 4: 8
merge passes performed: 4
only 1: 12
only 2: 0
only 3: 0
only 4: 0
merge passes performed: 5
only 1: 0
only 2: 1
only 3: 0
only 4: 0
merge passes performed: 6
only 1: 0
only 2: 0
only 3: 0
only 4: 0


MELDmneu1engmedium    2905
MELDfneu1engmedium    2452
esdmhap2cmnmedium     1750
esdfneu1engmedium     1750
esdfhap2engmedium     1750
                      ... 
MAVmpai0___medium        5
MAVmple2___medium        5
MAVmsad0___medium        5
stratum_group_28         5
stratum_group_29         5
Length: 486, dtype: int64

time: 1.69 s


# Train test split

The cross-validation splits are pre-computed to save time in the train-predict loops. Scaling and dimensionality reduction for each fold are pre-computed as well.

In [11]:
# materialise the (train_idx, test_idx) pairs once: folds are stratified on
# STRATA and grouped by speaker_id
cross_validator = StratifiedGroupKFold(
    n_splits=N_SPLITS, shuffle=True, random_state=SEED
)
prepared_splits = list(
    cross_validator.split(X=data, y=STRATA, groups=labels.speaker_id)
)
del cross_validator


def scaler() -> MinMaxScaler:
    """Return a fresh, unfitted MinMaxScaler with a (-1, 1) feature range."""
    return MinMaxScaler(feature_range=(-1, 1))  # preserve sign


def SRP_dim_red(
    X_train: Sequence, eps: float = 0.1, eps_step: float = 0.05
) -> BaseEstimator:
    """Fit a sparse random projection, relaxing eps until fitting succeeds.

    Fitting is attempted with ``eps`` and, after every ValueError, retried with
    ``eps`` increased by ``eps_step``. The search stops before ``eps`` reaches
    1, which is outside the range the Johnson-Lindenstrauss bound accepts, so
    a persistent error can no longer make the loop run forever.

    Parameters
    ----------
    X_train : Sequence
        Training data the projection is fitted on.
    eps : float
        First embedding-quality value to try (0.1 is the estimator's default).
    eps_step : float
        Increment applied to eps after each failed attempt; must be positive.

    Returns
    -------
    BaseEstimator
        The fitted SparseRandomProjection.

    Raises
    ------
    ValueError
        If eps_step is not positive, or if no attempted eps below 1 produces a
        fitted projection (the last fitting error is chained as the cause).
    """
    if eps_step <= 0:
        raise ValueError(f"eps_step must be positive, got {eps_step!r}")
    last_error = None
    while eps < 1:
        try:
            return SparseRandomProjection(eps=eps, random_state=SEED).fit(X_train)
        except ValueError as error:
            # typically: the automatically chosen n_components exceeds n_features
            last_error = error
            eps += eps_step
    raise ValueError(
        "SparseRandomProjection could not be fitted for any attempted eps below 1"
    ) from last_error


def make_adasyn(strategy):
    """Return the fit_resample method of a fresh ADASYN sampler using `strategy`."""
    sampler = ADASYN(sampling_strategy=strategy, random_state=SEED, n_jobs=-1)
    return sampler.fit_resample


def adasyn_upsample(X_train: Sequence, y_train: Sequence) -> Tuple[Sequence, Sequence]:
    """Oversample the training data with ADASYN, falling back across strategies.

    Sampling strategies are tried in a fixed order and the first one that does
    not raise ValueError supplies the result. The last strategy is attempted
    without a guard, so its ValueError reaches the caller.

    Returns the resampled (X, y) pair produced by ADASYN's fit_resample.
    """
    *guarded, last_resort = ("auto", "not majority", "minority", "all", "not minority")
    for strategy in guarded:
        try:
            return make_adasyn(strategy)(X_train, y_train)
        except ValueError:
            # this strategy is not applicable to the data; move on to the next
            continue
    return make_adasyn(last_resort)(X_train, y_train)


_ = gc_collect()

time: 1.59 s


In [12]:
def iloc_values(series, idx):
    """Return the values of `series` at the integer positions `idx`."""
    return series.iloc[idx].values


# label column used for each classification case
choose_labels = {
    "neg": labels["neg"],
    "neu": labels["neu"],
    "ter": labels["valence"],
}


def write_df(data_obj: Sequence, fname: str, fold_num: int) -> None:
    """Save the data object as <DATA_OUT_FOLDER>/cv_<fold_num>/<fname>.feather.

    Parameters
    ----------
    data_obj : Sequence
        Anything pd.DataFrame accepts (the notebook passes NumPy arrays).
    fname : str
        File name without the ".feather" extension.
    fold_num : int
        Fold number, used to name the cv_<fold_num> output folder.

    Raises
    ------
    FileExistsError
        If the cv_<fold_num> path already exists but is not a directory.
    """
    from os import makedirs  # local import: only mkdir is imported at file level

    df = pd.DataFrame(data_obj)
    # feather needs string column names; a frame built from an array has integer ones
    df.columns = df.columns.astype(str)
    folder_path = f"{DATA_OUT_FOLDER}/cv_{fold_num}"
    # exist_ok tolerates reruns, and missing parent folders are created as well,
    # so a nested DATA_OUT_FOLDER that does not exist yet no longer fails
    makedirs(folder_path, exist_ok=True)
    df.to_feather(f"{folder_path}/{fname}.feather")


# For every fold: save the raw split, then scale, reduce and resample it, fitting
# each transformer on the training rows only, and save the results per label case.
for fold_num, (train_idx, test_idx) in enumerate(tqdm(prepared_splits)):
    # select and save untransformed versions
    X_train = iloc_values(data, train_idx)
    X_test = iloc_values(data, test_idx)
    write_df(X_train, "X_train_untransformed", fold_num)
    write_df(X_test, "X_test_untransformed", fold_num)
    write_df(
        iloc_values(choose_labels["ter"], train_idx),
        "y_train_untransformed",
        fold_num,
    )
    write_df(
        iloc_values(choose_labels["ter"], test_idx),
        "y_test_untransformed",
        fold_num,
    )

    # scale X, reusing the arrays selected above instead of selecting them again
    scale = scaler().fit(X_train)
    X_train, X_test = scale.transform(X_train), scale.transform(X_test)
    del scale
    _ = gc_collect()

    # reduce dimensions
    reducer = SRP_dim_red(X_train)
    X_train, X_test = reducer.transform(X_train), reducer.transform(X_test)
    del reducer
    _ = gc_collect()

    # save X_test
    write_df(X_test, "X_test", fold_num)
    del X_test
    _ = gc_collect()

    # process labels and X_train
    # a tuple keeps the case order fixed; set iteration order depends on string hashing
    for case in ("neg", "neu", "ter"):
        label_set = choose_labels[case]
        # oversample with ADASYN, undersample with OSS
        X_resampled, y_resampled = OneSidedSelection(
            random_state=SEED, n_jobs=-1
        ).fit_resample(*adasyn_upsample(X_train, iloc_values(label_set, train_idx)))
        # save y_test
        write_df(iloc_values(label_set, test_idx), f"y_test_{case}", fold_num)
        del label_set
        _ = gc_collect()
        # save y_train
        write_df(y_resampled, f"y_train_{case}", fold_num)
        del y_resampled
        _ = gc_collect()
        # save X_train
        write_df(X_resampled, f"X_train_{case}", fold_num)
        del X_resampled
        _ = gc_collect()

    _ = gc_collect()

  0%|          | 0/5 [00:00<?, ?it/s]

time: 2h 14min 5s


# Results

Let's peek at the results.

In [13]:
# local import: explicit display is needed because expressions inside a loop
# body are not auto-displayed
from IPython.display import display

# one saved file of each kind, spread over the folds, read back from the folder
# the files were written to
for preview_fold, preview_name in (
    (0, "X_test"),
    (1, "X_train_neg"),
    (2, "y_test_neu"),
    (3, "y_train_ter"),
    (4, "X_test_untransformed"),
    (0, "X_train_untransformed"),
    (1, "y_test_untransformed"),
    (2, "y_train_untransformed"),
):
    preview = pd.read_feather(
        f"{DATA_OUT_FOLDER}/cv_{preview_fold}/{preview_name}.feather"
    )
    display(preview.head(5))
    preview.info()
del preview, preview_fold, preview_name

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1710,1711,1712,1713,1714,1715,1716,1717,1718,1719
0,-0.200318,-0.079591,-0.085694,0.085644,-0.26144,0.207012,0.054372,-0.070321,-0.01618,-0.400144,...,0.123679,-0.202286,0.240911,-0.132419,0.094906,0.236882,-0.19023,-0.00796,0.017991,-0.310528
1,-0.179967,-0.21064,-0.286055,-0.296028,0.081395,0.061064,0.19264,-0.094679,0.319975,-0.236925,...,0.300989,0.012957,-0.007632,-0.200628,0.06714,0.121718,0.000505,-0.058982,0.04539,0.120849
2,-0.201064,-0.05923,-0.56496,-0.034816,-0.155056,0.135351,0.024811,0.007414,0.048422,-0.084529,...,0.122254,-0.102736,-0.050463,-0.064174,0.034877,-0.058158,-0.208921,0.021208,0.206677,-0.17319
3,0.25703,-0.074324,-0.247301,0.049724,0.069506,0.06278,0.265292,-0.12497,0.246321,-0.30202,...,0.020797,0.090042,-0.076838,-0.200394,0.209571,0.046603,0.270125,-0.074805,0.139336,-0.171033
4,-0.117208,0.200171,-0.634259,-0.028843,0.052761,-0.039516,0.333906,-0.066727,-0.069563,-0.240387,...,0.127685,0.133332,0.092234,0.082183,0.113881,0.007495,-0.220317,0.0166,0.141476,-0.222463


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13494 entries, 0 to 13493
Columns: 1720 entries, 0 to 1719
dtypes: float64(1720)
memory usage: 177.1 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1714,1715,1716,1717,1718,1719,1720,1721,1722,1723
0,0.080641,0.035866,0.070995,-0.098683,0.168943,0.082318,-0.22992,0.165145,-0.304624,-0.076637,...,0.040155,0.226847,0.418276,-0.023277,0.208393,0.414519,-0.109358,-0.024321,0.012608,-0.196889
1,-0.627933,0.076679,-0.277732,-0.280622,0.514947,0.149221,-0.156525,-0.029274,0.215574,0.084678,...,0.258122,0.12273,0.252562,-0.151336,0.047228,0.208045,-0.308252,0.027256,-0.327603,0.057225
2,-0.151293,0.265817,0.055539,-0.398155,0.21353,0.431736,0.104021,0.490597,0.277072,-0.164734,...,-0.221955,-0.156192,0.057796,-0.311867,0.019575,0.452932,0.113158,-0.061367,-0.08985,-0.049146
3,0.154847,0.184277,0.017277,-0.220455,0.075703,-0.063982,-0.15108,0.495364,0.325388,0.038834,...,-0.338525,0.002517,-0.09453,-0.111644,0.419923,0.307013,0.0008,-0.236804,-0.052388,0.295008
4,-0.176299,0.114365,-0.568484,-0.292849,0.100491,0.325104,0.079495,0.06078,0.348304,0.155457,...,0.454515,-0.322047,-0.023101,-0.143589,-0.165669,0.525022,-0.455564,-0.113501,-0.361771,0.210915


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72641 entries, 0 to 72640
Columns: 1724 entries, 0 to 1723
dtypes: float64(1724)
memory usage: 955.5 MB


Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19319 entries, 0 to 19318
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       19319 non-null  bool 
dtypes: bool(1)
memory usage: 19.0 KB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95782 entries, 0 to 95781
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       95782 non-null  int8 
dtypes: int8(1)
memory usage: 93.7 KB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,-0.169826,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756
1,-0.028015,-0.041617,0.012599,-0.03146,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.03209,...,0.059661,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.05707,-0.029945
2,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.03298,...,-0.128375,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.04661
3,0.019952,0.123573,0.024742,-0.061553,0.11691,0.040967,0.020223,0.060525,-0.142646,0.022462,...,-0.067434,-0.078454,-0.072601,-0.036682,0.081269,0.006444,0.008728,-0.02337,-0.011679,-0.027678
4,0.127793,0.017307,0.003179,0.085972,-0.086323,-0.081065,-0.059941,0.080929,0.1736,-0.005168,...,-0.081003,0.068492,-0.089526,0.007822,0.057294,-0.040558,0.050517,0.032885,0.011344,-0.071713


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16532 entries, 0 to 16531
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 129.2 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.226529,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233
1,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,-0.169826,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756
2,-0.028015,-0.041617,0.012599,-0.03146,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.03209,...,0.059661,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.05707,-0.029945
3,-0.008712,-0.003798,-0.05217,0.030298,0.049662,0.039806,0.056743,-0.186714,-0.15111,0.021191,...,0.107442,0.053569,-0.067552,-0.017051,-0.109469,-0.082999,-0.049325,0.031638,-0.027172,0.006479
4,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.03298,...,-0.128375,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.04661


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73258 entries, 0 to 73257
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 572.3 MB


Unnamed: 0,0
0,2
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11524 entries, 0 to 11523
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       11524 non-null  int8 
dtypes: int8(1)
memory usage: 11.4 KB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67433 entries, 0 to 67432
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       67433 non-null  int8 
dtypes: int8(1)
memory usage: 66.0 KB
time: 4.47 s


In [14]:
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 8086.062160730362 s
time: 99.9 ms


[^top](#Contents)