# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Load data](#Load-data)
* [Strata](#Strata)
* [Hyperparameters](#Hyperparameters)
* [Models](#Models)
* [Discussion](#Discussion)

# Introduction

After rounds of feature engineering, visualization & exploration, and tuning various aspects of the classification pipeline, this notebook fits a new set of prototype classifiers on the prepared features and saves them to disk.

# Imports and configuration

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number generator this notebook imports.

    Exports the seed as PYTHONHASHSEED and hands it to the seeding functions
    of `random`, NumPy and TensorFlow.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # assignment presumably only reaches child processes -- confirm.
    environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random_seed, np_seed, set_seed):
        apply_seed(seed)


# Seed once with the notebook-wide constant, then drop the one-shot helpers
# so they do not linger in the notebook namespace
SEED = 2022
reset_seeds(SEED)
del environ, random_seed, np_seed, set_seed, reset_seeds

In [2]:
# extensions
# NOTE(review): lab_black and nb_black look like the JupyterLab and classic
# Notebook variants of the same formatter -- presumably one suffices; confirm.
%load_ext autotime
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import numpy as np
import pandas as pd

# utility
from gc import collect as gc_collect
from pathlib import Path

from joblib import dump
from tqdm.notebook import tqdm

# faster
import swifter
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# typing
from typing import List, Dict, Union

# other sklearn
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import (
    LogisticRegression,
    LogisticRegressionCV,
    RidgeClassifier,
)
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

time: 3.61 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of FRILL .feather files
FRILL_FEATHERS_FOLDER = "../1.0-mic-extract_FRILL_embeddings"

# Location of pre-final features
FEATURES_FOLDER = "."

# Location where this notebook will output
DATA_OUT_FOLDER = "."

_ = gc_collect()

time: 116 ms


# Load data

In [5]:
def load_labels() -> pd.DataFrame:
    """Read the dev and nondev label tables and stack them, indexed by id.

    Only a fixed subset of columns is read from each feather file.
    """
    wanted = [
        "id",
        "source",
        "speaker_id",
        "speaker_gender",
        "emo",
        "valence",
        "lang1",
        "length",
    ]
    dev = pd.read_feather(
        f"{FRILL_FEATHERS_FOLDER}/dev_labels.feather", columns=wanted
    )
    nondev = pd.read_feather(
        f"{FRILL_FEATHERS_FOLDER}/nondev_labels.feather", columns=wanted
    )
    # dev rows come first, exactly as the files are listed above
    return pd.concat([dev, nondev]).set_index("id")


def load_data() -> pd.DataFrame:
    """Read the scaled feature table, index it by id and stringify its column names."""
    features = pd.read_feather(
        f"{FEATURES_FOLDER}/scaled_features_ready_for_selection.feather"
    )
    features = features.set_index("id")
    features.columns = features.columns.astype(str)
    return features


# Features first; the labels are then selected and ordered by the same ids
data = load_data()
labels = load_labels().loc[data.index]
y_true = labels["valence"]
# NOTE(review): gnb_features is not referenced again in this notebook, and
# neither name appears among the columns listed by data.info() -- confirm
# whether the GNB models were meant to be restricted to these features.
gnb_features = ["spherical-LDA1", "spherical-LDA2"]
# every feature row must line up with its label row
assert (data.index == labels.index).all()
_ = gc_collect()

time: 185 ms


In [6]:
data.info()
labels.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 85740 entries, 0 to 87363
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   LDA1                     85740 non-null  float64
 1   LDA2                     85740 non-null  float64
 2   ocSVM_sgdlinear_neg      85740 non-null  float64
 3   ocSVM_sgdlinear_neu      85740 non-null  float64
 4   ocSVM_sgdlinear_pos      85740 non-null  float64
 5   LDA-LOF_neg_20           85740 non-null  float64
 6   LDA-LOF_neu_20           85740 non-null  float64
 7   LDA-LOF_pos_20           85740 non-null  float64
 8   LDA-ocSVM_rbf_neg        85740 non-null  float64
 9   LDA-ocSVM_rbf_neu        85740 non-null  float64
 10  LDA-ocSVM_rbf_pos        85740 non-null  float64
 11  LDA-ocSVM_sgdlinear_neg  85740 non-null  float64
 12  LDA-ocSVM_sgdlinear_neu  85740 non-null  float64
 13  LDA-ocSVM_sgdlinear_pos  85740 non-null  float64
dtypes: float64(14)
memory

# Strata

In [7]:
N_SPLITS = 8

# A sample's stratum is the concatenation of its metadata fields, so only
# combinations that actually occur in the data become strata
strata = labels[
    ["source", "speaker_gender", "emo", "valence", "lang1", "length"]
].copy()
# valence has to be a string before the row-wise join
strata["valence"] = strata["valence"].astype(str)
strata = strata.swifter.apply("".join, axis=1)

Dask Apply: 100%|██████████| 16/16 [00:02<00:00,  5.86it/s]


time: 6.61 s


In [8]:
# utility function for identifying strata with exactly i occurrences
def get_solo(i: int, strata_: pd.Series) -> np.ndarray:
    """Return the labels of all strata that have exactly i members.

    Parameters
    ----------
    i : int
        Stratum size to look for.
    strata_ : pd.Series
        One stratum label per sample.

    Returns
    -------
    np.ndarray
        Sorted, de-duplicated stratum labels that occur exactly `i` times in
        `strata_`; empty when there are none.
    """
    strata_counts = strata_.value_counts()
    # value_counts() already holds one row per label, so the labels can be read
    # straight off its index. The previous version shuffled the matching samples
    # first, but np.unique returns sorted output, so that shuffle had no effect.
    return np.unique(strata_counts.index[(strata_counts == i).to_numpy()].to_numpy())


# tally undersized strata per size, reporting the counts along the way
def get_onlys(
    strata_: pd.Series, print_me: str = "", n_splits: int = N_SPLITS
) -> List[Dict[int, np.ndarray]]:
    """Print a header, then collect the strata that have fewer than n_splits members.

    For every size i in 1..n_splits-1 the number of strata with exactly i
    members is printed. The return value holds one single-entry dict
    {i: labels} for each size that has at least one stratum, in increasing
    order of i.
    """
    print(print_me)
    found = []
    for size in range(1, n_splits):
        labels_of_size: np.ndarray = get_solo(size, strata_)
        n_found = labels_of_size.size
        print(f"only {size}:", n_found)
        if n_found:  # at least one stratum has exactly `size` samples
            found.append({size: labels_of_size})
    return found


def process_strata(strata: pd.Series, n_splits: int = N_SPLITS) -> pd.Series:
    """Merge undersized strata until none has fewer than n_splits members.

    Each pass asks get_onlys for the strata with 1..n_splits-1 members and
    relabels groups of them to a shared "stratum_group_<k>" label. A merged
    group can still be undersized, so passes repeat until get_onlys returns an
    empty list.

    Parameters
    ----------
    strata : pd.Series
        One stratum label per sample.
    n_splits : int
        Smallest number of members a stratum may end up with.

    Returns
    -------
    pd.Series
        The relabelled strata. Every change goes through Series.replace with
        its default (non-inplace) behaviour and is rebound to the local name.
    """

    # count numbers the merged groups; get_onlys_calls counts the passes made
    count = get_onlys_calls = 0

    while onlys := get_onlys(
        strata,
        print_me=f"merge passes performed: {get_onlys_calls}",
        n_splits=n_splits,
    ):
        get_onlys_calls += 1
        if len(onlys) == 1:
            # every undersized stratum has the same size (only_key)
            last = onlys[0]
            strata_to_merge: np.ndarray = list(last.values())[0]
            only_key = list(last.keys())[0]
            # ceil(n_splits / only_key): strata of this size needed to reach n_splits
            tuplet_size = n_splits // only_key + (1 if n_splits % only_key else 0)
            # perform tuplet merge
            # NOTE(review): interval divides by n_splits, not tuplet_size, so for
            # only_key > 1 a pass can form fewer tuplets than the labels allow;
            # the leftovers fall through to the remainder handling below and to
            # later passes -- confirm this is intended.
            interval = len(strata_to_merge) // n_splits
            # zip takes one label from each of tuplet_size consecutive slices
            for strata_tuplet in zip(
                *[
                    strata_to_merge[interval * i : interval * (i + 1)]
                    for i in range(tuplet_size)
                ]
            ):
                strata = strata.replace(strata_tuplet, f"stratum_group_{count}")
                count += 1
            # labels that did not fit into a full tuplet
            remainder = strata_to_merge[tuplet_size * interval :]
            if len(remainder) == 1:
                # process remainder unmatched
                # search upwards from n_splits for a size that some stratum has,
                # then merge the lone label into one such stratum picked with SEED
                # NOTE(review): this search never ends if no stratum has at least
                # n_splits members.
                n = n_splits
                strata_counts = strata.value_counts()
                while not (candidates := strata_counts.loc[strata_counts == n]).size:
                    n += 1
                strata = strata.replace(
                    [remainder[0], candidates.sample(n=1, random_state=SEED).index[0]],
                    f"stratum_group_{count}",
                )
                count += 1
            else:
                # self-pair last
                # pairs are taken from the end; an odd label is left for a later pass
                remainder = remainder.tolist()
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
        else:
            # several sizes are undersized; onlys is ordered by increasing size
            # helper: remove the entry at a position and return its labels as a list
            pop_onlys = lambda _: list(onlys.pop(_).values())[0].tolist()
            while len(onlys) >= 2:
                # pop the ends
                # shortside holds the smallest remaining size, longside the largest
                shortside = pop_onlys(0)
                longside = pop_onlys(-1)
                # merge until one end empty
                # labels left on the other side stay unmerged in this pass
                while shortside and longside:
                    strata = strata.replace(
                        (shortside.pop(), longside.pop()), f"stratum_group_{count}"
                    )
                    count += 1
            if onlys:
                # self-pair middle
                remainder = pop_onlys(0)
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
    return strata


_ = gc_collect()

time: 120 ms


In [9]:
STRATA = process_strata(strata, n_splits=N_SPLITS)
STRATA.value_counts()


def cross_validator():
    """Return a fresh iterator of splits over `data`.

    Splits are stratified on STRATA and grouped by speaker_id, using a newly
    built, shuffled StratifiedGroupKFold seeded with SEED.
    """
    splitter = StratifiedGroupKFold(
        n_splits=N_SPLITS, shuffle=True, random_state=SEED
    )
    return splitter.split(X=data, y=STRATA, groups=labels.speaker_id)

merge passes performed: 0
only 1: 52
only 2: 37
only 3: 31
only 4: 17
only 5: 27
only 6: 13
only 7: 9
merge passes performed: 1
only 1: 43
only 2: 24
only 3: 4
only 4: 1
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 2
only 1: 42
only 2: 20
only 3: 0
only 4: 0
only 5: 5
only 6: 0
only 7: 0
merge passes performed: 3
only 1: 37
only 2: 0
only 3: 0
only 4: 10
only 5: 0
only 6: 5
only 7: 0
merge passes performed: 4
only 1: 32
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 5
merge passes performed: 5
only 1: 27
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 6
only 1: 1
only 2: 1
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 7
only 1: 0
only 2: 0
only 3: 1
only 4: 0
only 5: 0
only 6: 0
only 7: 0
merge passes performed: 8
only 1: 0
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0
only 7: 0


MELDmneu1engmedium     2905
MELDfneu1engmedium     2452
esdmang0cmnmedium      1750
esdmneu1cmnmedium      1750
esdmhap2cmnmedium      1750
                       ... 
stratum_group_84          8
stratum_group_38          8
stratum_group_94          8
BAUM2fcon0engmedium       8
stratum_group_90          8
Length: 449, dtype: int64

time: 1.87 s


# Hyperparameters

In [10]:
# Hyperparameter values consumed by the parameter factories
var_smoothing = 0.026743984632167556
alpha = 97874.43620365807
logreg_C = 0.011514411442955409


def ridge_params() -> Dict[str, Union[float, int]]:
    """Return keyword arguments for RidgeClassifier."""
    return {"alpha": alpha, "random_state": SEED}


def logreg_params(
    logreg_C: Union[None, float] = 10.0,
) -> Dict[str, Union[int, float, str]]:
    """Return keyword arguments for a logistic regression estimator.

    Parameters
    ----------
    logreg_C : float or None
        Value stored under "C". Pass None to leave "C" out of the result.

    Returns
    -------
    dict
        Solver, tolerance, iteration cap, class weighting, parallelism and
        random state (SEED + 1), plus "C" unless `logreg_C` is None.
    """
    params = {
        "solver": "lbfgs",
        "tol": 1e-5,
        "max_iter": 100000,
        "class_weight": "balanced",
        "n_jobs": -1,
        "random_state": SEED + 1,
    }
    # Compare against None explicitly: a truthiness test would also drop 0.0
    # and silently fall back to the estimator's own default for C.
    if logreg_C is not None:
        params["C"] = logreg_C
    return params


def gnb_params():
    """Return keyword arguments for a BaggingClassifier of GaussianNB models.

    A new GaussianNB instance is created on every call.
    """
    return {
        "base_estimator": GaussianNB(var_smoothing=var_smoothing),
        "n_estimators": 50,
        "warm_start": False,
        "n_jobs": -1,
        "random_state": SEED + 2,
    }


def calibration_params():
    """Return keyword arguments for CalibratedClassifierCV.

    The "cv" entry is a materialised list of the splits from cross_validator().
    """
    return {
        "method": "isotonic",
        "cv": list(cross_validator()),
        "n_jobs": -1,
    }

_ = gc_collect()

time: 97.2 ms


# Models

In [11]:
# Private factories for the unfitted base learners. Every call builds a fresh
# estimator, so no instance is shared between two ensembles.
def _bagged_gnb() -> BaggingClassifier:
    """Unfitted BaggingClassifier configured by gnb_params()."""
    return BaggingClassifier(**gnb_params())


def _plain_ridge() -> RidgeClassifier:
    """Unfitted RidgeClassifier configured by ridge_params()."""
    return RidgeClassifier(**ridge_params())


def _plain_logreg() -> LogisticRegression:
    """Unfitted LogisticRegression configured by logreg_params(logreg_C)."""
    return LogisticRegression(**logreg_params(logreg_C))


# Ensemble member name -> factory of the matching unfitted base learner
_BASE_LEARNERS = {
    "gnb": _bagged_gnb,
    "logreg": _plain_logreg,
    "ridge": _plain_ridge,
}


def _calibrated(name: str) -> CalibratedClassifierCV:
    """Unfitted base learner `name` wrapped in CalibratedClassifierCV.

    The wrapper is configured by calibration_params().
    """
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator`; this is now the only place to change -- confirm the
    # installed version before renaming.
    return CalibratedClassifierCV(
        base_estimator=_BASE_LEARNERS[name](), **calibration_params()
    )


def _soft_voting(*names: str) -> VotingClassifier:
    """Fit a soft-voting ensemble of calibrated base learners.

    `names` are keys of _BASE_LEARNERS; they become the estimator names and
    fix the order of the ensemble members.
    """
    return VotingClassifier(
        estimators=[(name, _calibrated(name)) for name in names],
        voting="soft",
        n_jobs=-1,
        verbose=True,
    ).fit(data, y_true)


def _stacking(passthrough: bool) -> StackingClassifier:
    """Fit a ridge + bagged-GNB stack with a LogisticRegressionCV final estimator.

    `passthrough` is forwarded to StackingClassifier unchanged.
    """
    return StackingClassifier(
        estimators=[
            ("ridge", _plain_ridge()),
            ("gnb", _bagged_gnb()),
        ],
        final_estimator=LogisticRegressionCV(
            scoring="neg_log_loss",
            **logreg_params(None),
        ),
        cv=list(cross_validator()),
        n_jobs=-1,
        passthrough=passthrough,
        verbose=1,
    ).fit(data, y_true)


# Public prototype builders: zero-argument callables that return an estimator
# fitted on (data, y_true).
def bagging_gnb() -> BaggingClassifier:
    """Fit the bagged Gaussian naive Bayes model (not calibrated)."""
    return _bagged_gnb().fit(data, y_true)


def ridge() -> CalibratedClassifierCV:
    """Fit the calibrated ridge classifier."""
    return _calibrated("ridge").fit(data, y_true)


def voting_gnb_ridge() -> VotingClassifier:
    """Fit the soft vote over calibrated gnb and ridge."""
    return _soft_voting("gnb", "ridge")


def voting_gnb_logreg() -> VotingClassifier:
    """Fit the soft vote over calibrated gnb and logreg."""
    return _soft_voting("gnb", "logreg")


def voting_logreg_ridge() -> VotingClassifier:
    """Fit the soft vote over calibrated logreg and ridge."""
    return _soft_voting("logreg", "ridge")


def voting_gnb_logreg_ridge() -> VotingClassifier:
    """Fit the soft vote over calibrated gnb, logreg and ridge."""
    return _soft_voting("gnb", "logreg", "ridge")


def stacked_pass() -> StackingClassifier:
    """Fit the stacking model with passthrough=True."""
    return _stacking(passthrough=True)


def stacked() -> StackingClassifier:
    """Fit the stacking model with passthrough=False."""
    return _stacking(passthrough=False)


def logreg() -> LogisticRegression:
    """Fit the plain (uncalibrated) logistic regression."""
    return _plain_logreg().fit(data, y_true)

_ = gc_collect()

time: 110 ms


In [12]:
models = {
    "bagging_gnb": bagging_gnb(),
    "ridge": ridge(),
}

time: 9.11 s


In [13]:
models["voting_gnb_logreg"] = voting_gnb_logreg()

time: 17.2 s


In [14]:
models["stacked"] = stacked()

time: 21.1 s


In [15]:
models["stacked_pass"] = stacked_pass()

time: 1min 34s


In [16]:
models["logreg"] = logreg()

time: 5.45 s


In [17]:
models["voting_gnb_logreg_ridge"] = voting_gnb_logreg_ridge()

time: 19.8 s


In [18]:
models["voting_gnb_ridge"] = voting_gnb_ridge()

time: 12 s


In [19]:
models["voting_logreg_ridge"] = voting_logreg_ridge()

time: 10.5 s


In [20]:
# Create the output folder first so the dumps also succeed on a fresh checkout
# where ./prototypes does not exist yet
Path(f"{DATA_OUT_FOLDER}/prototypes").mkdir(parents=True, exist_ok=True)
for model, estimator in tqdm(models.items()):
    print(model)
    dump(estimator, f"{DATA_OUT_FOLDER}/prototypes/{model}.joblib")

  0%|          | 0/9 [00:00<?, ?it/s]

bagging_gnb


['./prototypes/bagging_gnb.joblib']

ridge


['./prototypes/ridge.joblib']

voting_gnb_logreg


['./prototypes/voting_gnb_logreg.joblib']

stacked


['./prototypes/stacked.joblib']

stacked_pass


['./prototypes/stacked_pass.joblib']

logreg


['./prototypes/logreg.joblib']

voting_gnb_logreg_ridge


['./prototypes/voting_gnb_logreg_ridge.joblib']

voting_gnb_ridge


['./prototypes/voting_gnb_ridge.joblib']

voting_logreg_ridge


['./prototypes/voting_logreg_ridge.joblib']

time: 2 s


# Discussion

We are ready to try again on the holdout data.

In [21]:
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 211.12521982192993 s
time: 123 ms


[^top](#Contents)