# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Load data](#Load-data)
* [Strata](#Strata)
* [Hyperparameters](#Hyperparameters)
* [Models](#Models)
* [Evaluate](#Evaluate)
* [Discussion](#Discussion)

# Introduction

After rounds of feature engineering, visualization & exploration, and tuning various aspects of the classification pipeline, we are about to train new prototypes.

# Imports and configuration

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number source this notebook touches with the same value.

    Writes seed into the PYTHONHASHSEED environment variable and then seeds the
    `random`, `numpy.random` and `tensorflow.random` generators.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
    # so this assignment likely affects child processes only — confirm.
    environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random_seed, np_seed, set_seed):
        seeder(seed)


# seed everything once; the walrus also binds SEED as a notebook-level name for later cells
reset_seeds(SEED := 2022)
# the seeding helpers are not used again, so drop them from the kernel namespace
del environ
del random_seed
del np_seed
del set_seed
del reset_seeds

In [2]:
# extensions
%load_ext autotime
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import numpy as np
import pandas as pd

# utility
from joblib import dump
from gc import collect as gc_collect
from tqdm.notebook import tqdm

# faster
import swifter
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# typing
from typing import List, Dict

# metrics
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# other sklearn
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

time: 2.47 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of FRILL .feather files (the dev/nondev label tables are read from here)
FRILL_FEATHERS_FOLDER = "../1.0-mic-extract_FRILL_embeddings"

# Location of pre-final features (scaled_features_ready_for_selection.feather)
FEATURES_FOLDER = "."

# Location where this notebook will output (models go to its "prototypes" subfolder)
DATA_OUT_FOLDER = "."

_ = gc_collect()

time: 105 ms


# Load data

In [5]:
def load_labels() -> pd.DataFrame:
    """Read the dev and nondev label tables and stack them into one frame indexed by id.

    Only the columns listed in keep_columns are read from each feather file.
    """
    keep_columns = [
        "id",
        "source",
        "speaker_id",
        "speaker_gender",
        "emo",
        "valence",
        "lang1",
        "length",
    ]
    label_files = ("dev_labels.feather", "nondev_labels.feather")
    frames = [
        pd.read_feather(f"{FRILL_FEATHERS_FOLDER}/{label_file}", columns=keep_columns)
        for label_file in label_files
    ]
    # dev rows come first, then nondev rows, exactly in file order
    return pd.concat(frames).set_index("id")


def load_data() -> pd.DataFrame:
    """Read the scaled feature table from FEATURES_FOLDER, indexed by id.

    Column labels are converted to str before the frame is returned.
    """
    feature_path = f"{FEATURES_FOLDER}/scaled_features_ready_for_selection.feather"
    features = pd.read_feather(feature_path)
    features = features.set_index("id")
    features.columns = features.columns.astype(str)
    return features


data = load_data()
# keep only the label rows that have features, in the same row order as the features
labels = load_labels().loc[data.index]
assert (data.index == labels.index).all()
y_true = labels["valence"]
# the two columns the Gaussian naive Bayes models are trained on
gnb_features = ["spherical-LDA1", "spherical-LDA2"]
_ = gc_collect()

time: 206 ms


In [6]:
# column dtypes and non-null counts: first the features, then the aligned labels
data.info()
labels.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 78777 entries, 0 to 87363
Data columns (total 51 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   LDA1                                                                           78777 non-null  float64
 1   LDA2                                                                           78777 non-null  float64
 2   ocLDA_neg                                                                      78777 non-null  float64
 3   ocLDA_neu                                                                      78777 non-null  float64
 4   ocLDA_pos                                                                      78777 non-null  float64
 5   ocSVM_sgdlinear_LDA1                                                           78777 non-null  float64
 6   ocSVM_sgdlinear_LDA2 

# Strata

In [7]:
N_SPLITS = 8

# fields are concatenated for quick permutation omitting non-existent combos
strata = (
    labels.loc[:, ["source", "speaker_gender", "emo", "valence", "lang1", "length"]]
    # valence must be a string before the row-wise join below
    .assign(valence=lambda fields: fields.valence.astype(str))
    .swifter.apply("".join, axis=1)
)

Pandas Apply: 100%|██████████| 78777/78777 [00:00<00:00, 81157.35it/s]

time: 1.08 s





In [8]:
# utility function for identifying strata with only i occurrences
def get_solo(i: int, strata_: pd.Series) -> np.ndarray:
    """Return the labels of the strata that have exactly i members.

    Args:
        i: the member count to look for.
        strata_: one stratum label per sample.

    Returns:
        A sorted array of the distinct stratum labels that occur exactly i
        times in strata_; empty when no stratum has that size.
    """
    strata_counts = strata_.value_counts()
    # np.unique sorts its result, so no shuffle of the matching rows can survive;
    # selecting the labels straight from the counts gives the same array.
    return np.unique(strata_counts.loc[strata_counts == i].index.to_numpy())


# collect the get_solo results for every stratum size below n_splits, printing each count
def get_onlys(
    strata_: pd.Series, print_me: str = "", n_splits: int = N_SPLITS
) -> List[Dict[int, np.ndarray]]:
    """Print print_me, then report and collect the strata smaller than n_splits.

    For every size from 1 to n_splits - 1 the number of strata of that size is
    printed. Sizes with at least one stratum contribute a one-entry dict
    {size: array of stratum labels} to the returned list, in increasing size order.
    """
    print(print_me)
    found: List[Dict[int, np.ndarray]] = []
    for size in range(1, n_splits):
        members = get_solo(size, strata_)
        print(f"only {size}:", members.size)
        if members.size > 0:  # >= 1 strata with only `size` samples
            found.append({size: members})
    return found


def process_strata(strata: pd.Series, n_splits: int = N_SPLITS) -> pd.Series:
    """Merge under-populated strata until none has fewer than n_splits members.

    Each pass asks get_onlys which stratum sizes below n_splits still occur and
    relabels groups of those strata as "stratum_group_<k>". Passes repeat until
    get_onlys returns an empty list.

    Args:
        strata: one stratum label per sample.
        n_splits: the smallest member count a stratum may end up with.

    Returns:
        The stratum labels with merged strata relabelled "stratum_group_<k>".

    Raises:
        ValueError: if a single leftover stratum must be merged but no stratum
            has at least n_splits members to absorb it.
    """

    count = get_onlys_calls = 0

    while onlys := get_onlys(
        strata,
        print_me=f"merge passes performed: {get_onlys_calls}",
        n_splits=n_splits,
    ):
        get_onlys_calls += 1
        if len(onlys) == 1:
            # exactly one under-populated size remains: merge those strata with each other
            only_key, strata_to_merge = next(iter(onlys[0].items()))
            # how many strata of size only_key are needed to reach n_splits (ceiling division)
            tuplet_size = n_splits // only_key + (1 if n_splits % only_key else 0)
            # perform tuplet merge
            # NOTE(review): interval divides by n_splits rather than tuplet_size, so for
            # only_key > 1 only part of strata_to_merge is merged here and the rest falls
            # through to the pairing below. Left as is to keep the resulting folds
            # unchanged — confirm whether len(...) // tuplet_size was intended.
            interval = len(strata_to_merge) // n_splits
            for strata_tuplet in zip(
                *[
                    strata_to_merge[interval * i : interval * (i + 1)]
                    for i in range(tuplet_size)
                ]
            ):
                strata = strata.replace(strata_tuplet, f"stratum_group_{count}")
                count += 1
            remainder = strata_to_merge[tuplet_size * interval :]
            if len(remainder) == 1:
                # process remainder unmatched: fold it into a randomly chosen stratum
                # among the smallest ones that already have >= n_splits members
                strata_counts = strata.value_counts()
                eligible = strata_counts.loc[strata_counts >= n_splits]
                if eligible.empty:
                    # the original upward search for a candidate size would never end here
                    raise ValueError(
                        f"cannot merge leftover stratum {remainder[0]!r}: "
                        f"no stratum has at least {n_splits} members"
                    )
                candidates = eligible.loc[eligible == eligible.min()]
                strata = strata.replace(
                    [remainder[0], candidates.sample(n=1, random_state=SEED).index[0]],
                    f"stratum_group_{count}",
                )
                count += 1
            else:
                # self-pair last; an odd leftover is picked up by a later pass
                remainder = remainder.tolist()
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
        else:

            def pop_onlys(position: int) -> list:
                """Remove the entry at position from onlys and return its strata as a list."""
                return list(onlys.pop(position).values())[0].tolist()

            while len(onlys) >= 2:
                # pop the ends: the smallest remaining size and the largest remaining size
                shortside = pop_onlys(0)
                longside = pop_onlys(-1)
                # merge until one end empty; unmatched strata are left for a later pass
                while shortside and longside:
                    strata = strata.replace(
                        (shortside.pop(), longside.pop()), f"stratum_group_{count}"
                    )
                    count += 1
            if onlys:
                # self-pair middle
                remainder = pop_onlys(0)
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
    return strata


_ = gc_collect()

time: 97.9 ms


In [9]:
STRATA = process_strata(strata, n_splits=N_SPLITS)
STRATA.value_counts()


def cross_validator():
    """Return a fresh generator of (train, test) index splits over data.

    Splits are stratified on STRATA and grouped by labels.speaker_id; a new
    seeded StratifiedGroupKFold is built on every call.
    """
    splitter = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    return splitter.split(X=data, y=STRATA, groups=labels.speaker_id)

merge passes performed: 0
only 1: 49
only 2: 34
only 3: 30
only 4: 15
only 5: 26
only 6: 12
only 7: 9
merge passes performed: 1
only 1: 40
only 2: 22
only 3: 4
only 4: 1
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 2
only 1: 39
only 2: 18
only 3: 0
only 4: 0
only 5: 5
only 6: 0
only 7: 0
merge passes performed: 3
only 1: 34
only 2: 0
only 3: 0
only 4: 9
only 5: 0
only 6: 5
only 7: 0
merge passes performed: 4
only 1: 29
only 2: 0
only 3: 0
only 4: 1
only 5: 0
only 6: 0
only 7: 5
merge passes performed: 5
only 1: 24
only 2: 0
only 3: 0
only 4: 1
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 6
only 1: 23
only 2: 0
only 3: 0
only 4: 0
only 5: 1
only 6: 0
only 7: 0
merge passes performed: 7
only 1: 22
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 1
only 7: 0
merge passes performed: 8
only 1: 21
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 1
merge passes performed: 9
only 1: 20
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 0
merge passes performed:

MELDmneu1engmedium    2905
MELDfneu1engmedium    2452
esdfneu1engmedium     1750
esdfhap2engmedium     1750
esdmneu1cmnmedium     1750
                      ... 
stratum_group_2          8
stratum_group_18         8
stratum_group_46         8
stratum_group_33         8
stratum_group_45         8
Length: 424, dtype: int64

time: 2.09 s


# Hyperparameters

In [10]:
var_smoothing = 0.1812436618470557
alpha = 100775.25983372086


def gnb_params() -> dict:
    """Return fresh keyword arguments for a BaggingClassifier of GaussianNB estimators.

    A new GaussianNB instance is created on every call.
    """
    bagging_kwargs = {
        "base_estimator": GaussianNB(var_smoothing=var_smoothing),
        "n_estimators": 50,
        "warm_start": False,
        "n_jobs": -1,
        "random_state": SEED,
    }
    return bagging_kwargs


def calibration_params() -> dict:
    """Return fresh keyword arguments for CalibratedClassifierCV.

    The cv entry is a materialized list of the splits produced by cross_validator().
    """
    calibration_kwargs = {
        "method": "isotonic",
        "cv": list(cross_validator()),
        "n_jobs": -1,
    }
    return calibration_kwargs

_ = gc_collect()

time: 108 ms


# Models

In [11]:
gnb_data = data.loc[:, gnb_features]


def _new_ridge() -> RidgeClassifier:
    """Build an unfitted RidgeClassifier with the notebook's alpha and seed."""
    return RidgeClassifier(alpha=alpha, random_state=SEED)


def _calibrated(estimator) -> CalibratedClassifierCV:
    """Wrap estimator in an unfitted CalibratedClassifierCV built from calibration_params()."""
    return CalibratedClassifierCV(base_estimator=estimator, **calibration_params())


def _gnb_pipeline(calibrate: bool) -> Pipeline:
    """Build the unfitted GNB branch: select gnb_features, then bagged GaussianNB.

    When calibrate is True the bagged classifier is wrapped by _calibrated.
    """
    bagged = BaggingClassifier(**gnb_params())
    return Pipeline(
        steps=[
            (
                "select_features",
                ColumnTransformer(
                    transformers=[("selector", "passthrough", gnb_features)],
                    n_jobs=-1,
                ),
            ),
            ("clf", _calibrated(bagged) if calibrate else bagged),
        ]
    )


def _stacking(passthrough: bool) -> StackingClassifier:
    """Fit a ridge + bagged-GNB stack under a LogisticRegressionCV final estimator.

    passthrough is forwarded to StackingClassifier; it is the only setting in
    which the two public stacked factories differ.
    """
    return StackingClassifier(
        estimators=[
            ("ridge", _new_ridge()),
            ("gnb", _gnb_pipeline(calibrate=False)),
        ],
        final_estimator=LogisticRegressionCV(
            scoring="neg_log_loss",
            solver="saga",
            max_iter=10000,
            class_weight="balanced",
            n_jobs=-1,
            random_state=SEED,
        ),
        cv=list(cross_validator()),
        n_jobs=-1,
        passthrough=passthrough,
        verbose=1,
    ).fit(data, y_true)


def bagging_gnb() -> BaggingClassifier:
    """Fit the bagged GaussianNB on the gnb_features columns only."""
    return BaggingClassifier(**gnb_params()).fit(gnb_data, y_true)


def ridge() -> CalibratedClassifierCV:
    """Fit the calibrated ridge classifier on all features."""
    return _calibrated(_new_ridge()).fit(data, y_true)


def voting() -> VotingClassifier:
    """Fit a soft-voting ensemble of the calibrated ridge and calibrated GNB branches."""
    return VotingClassifier(
        estimators=[
            ("ridge", _calibrated(_new_ridge())),
            ("gnb", _gnb_pipeline(calibrate=True)),
        ],
        voting="soft",
        n_jobs=-1,
        verbose=True,
    ).fit(data, y_true)


def stacked_pass() -> StackingClassifier:
    """Fit the stacked model with passthrough=True."""
    return _stacking(passthrough=True)


def stacked() -> StackingClassifier:
    """Fit the stacked model with passthrough=False."""
    return _stacking(passthrough=False)


def logreg() -> LogisticRegressionCV:
    """Fit a class-balanced LogisticRegressionCV on all features with the custom CV splits."""
    return LogisticRegressionCV(
        cv=list(cross_validator()),
        scoring="neg_log_loss",
        solver="saga",
        tol=1e-5,
        max_iter=100000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=SEED,
    ).fit(data, y_true)

_ = gc_collect()

time: 133 ms


In [12]:
# Each factory call fits its model on the training data, so building this dict
# is the expensive step of the notebook (the recorded run took 2h 46min).
# The keys become the file names of the saved prototypes.
models = {
    "bagging_GNB": bagging_gnb(),
    "logreg": logreg(),
    "ridge": ridge(),
    "voting_gnb_ridge": voting(),
    "stacked_gnb_ridge": stacked(),
    "stacked_gnb_ridge_passthrough": stacked_pass(),
}

time: 2h 46min 24s


In [13]:
# persist every fitted prototype as <DATA_OUT_FOLDER>/prototypes/<name>.joblib
from os import makedirs

# joblib's dump does not create missing folders, so make sure the target exists
makedirs(f"{DATA_OUT_FOLDER}/prototypes", exist_ok=True)
for model, estimator in tqdm(models.items()):
    dump(estimator, f"{DATA_OUT_FOLDER}/prototypes/{model}.joblib")

  0%|          | 0/6 [00:00<?, ?it/s]

['./prototypes/bagging_GNB.joblib']

['./prototypes/logreg.joblib']

['./prototypes/ridge.joblib']

['./prototypes/voting_gnb_ridge.joblib']

['./prototypes/stacked_gnb_ridge.joblib']

['./prototypes/stacked_gnb_ridge_passthrough.joblib']

time: 650 ms


# Discussion

We are ready to try again on the holdout data.

In [14]:
# report the wall-clock duration of the whole run, measured from the first cell
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 9996.845714569092 s
time: 382 ms


[^top](#Contents)