# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Load data](#Load-data)
* [Strata](#Strata)
* [Hyperparameters](#Hyperparameters)
* [Models](#Models)
* [Evaluate](#Evaluate)
* [Discussion](#Discussion)

# Introduction

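After rounds of feature engineering, visualization & exploration, and tuning various aspects of the classification pipeline, we are about to create a benchmark prototype classifier using `RandomForestClassifier`. In this notebook, we perform a grid search over `n_estimators` using out-of-bag accuracy instead of cross-validation. Other hyperparameters are based on a previous 5-fold cross-validation using a related set of features.

**Note:** in the current version of this notebook the random forest is commented out and no grid search is run. The cells below fit and save two prototypes: a post hoc calibrated ridge classifier, and a soft-voting ensemble of that classifier with a calibrated, bagged `GaussianNB`.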

# Imports and configuration

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number generator used by this notebook.

    Exports ``PYTHONHASHSEED`` and then seeds the ``random``, NumPy and
    TensorFlow generators with the same value.

    Args:
        seed: integer seed applied to all generators.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random_seed, np_seed, set_seed):
        seeder(seed)


# Fix the global seed once; SEED is reused as `random_state` further down.
SEED = 2021
reset_seeds(SEED)

# The seeding helpers are single-use: drop them from the notebook namespace.
del environ, random_seed, np_seed, set_seed, reset_seeds

In [2]:
# extensions
%load_ext autotime
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import numpy as np
import pandas as pd

# utility
from joblib import dump
from gc import collect as gc_collect
from tqdm.notebook import tqdm

# faster
import swifter
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# typing
from typing import List, Dict

# metrics
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# other sklearn
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_validate, StratifiedGroupKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

time: 3.35 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of FRILL .feather files
# (load_labels reads dev_labels.feather and nondev_labels.feather from here)
FRILL_FEATHERS_FOLDER = "../1.0-mic-extract_FRILL_embeddings"

# Location of pre-final features
# (load_data reads the scaled or unscaled *_ready_for_selection.feather from here)
FEATURES_FOLDER = "../19.0-mic-extract_FRILL-based_features_from_full_data"

# Location where this notebook will output
# (fitted models are written to a "prototypes" subfolder of this path)
DATA_OUT_FOLDER = "."

# free memory left over from the imports above
_ = gc_collect()

time: 102 ms


# Load data

In [5]:
def load_labels() -> pd.DataFrame:
    """Read the dev and nondev label tables and stack them into one frame.

    Only the columns listed below are read from each feather file.

    Returns:
        The concatenated labels (dev rows first), indexed by "id".
    """
    wanted = [
        "id",
        "source",
        "speaker_id",
        "speaker_gender",
        "emo",
        "valence",
        "lang1",
        "length",
    ]
    dev = pd.read_feather(
        f"{FRILL_FEATHERS_FOLDER}/dev_labels.feather", columns=wanted
    )
    nondev = pd.read_feather(
        f"{FRILL_FEATHERS_FOLDER}/nondev_labels.feather", columns=wanted
    )
    stacked = pd.concat([dev, nondev])
    return stacked.set_index("id")


def load_data(unscaled=False) -> pd.DataFrame:
    """Read the FRILL-based feature table from FEATURES_FOLDER.

    Args:
        unscaled: when truthy, read the unscaled feature file; otherwise read
            the default (scaled) one.

    Returns:
        The features indexed by "id", with all column labels cast to str.
    """
    if unscaled:
        source_file = (
            f"{FEATURES_FOLDER}/unscaled_features_ready_for_selection.feather"
        )
    else:
        source_file = f"{FEATURES_FOLDER}/features_ready_for_selection.feather"

    features = pd.read_feather(source_file).set_index("id")
    features.columns = features.columns.astype(str)
    return features


# Unscaled features and their labels; both frames are indexed by "id".
data = load_data(unscaled=True)
labels = load_labels()

# Features and labels must describe the same samples in the same order.
# Index.equals is a single vectorized check that also copes with indexes of
# different length, where an element-wise `==` would raise instead of failing
# the assertion.
assert data.index.equals(labels.index), "features and labels are not aligned"

# Target column for every model fitted below.
y_true = labels.valence

# The only feature columns passed to the GaussianNB member of the ensemble.
gnb_features = ["spherical-LDA1", "spherical-LDA2"]

_ = gc_collect()

time: 353 ms


In [6]:
# Summarise both frames (entries, dtypes, memory usage) as a quick sanity check
data.info()
labels.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 86752 entries, 0 to 87363
Columns: 118 entries, theta_LDA1+LDA2 to LDA-ocSVM_poly6_pos
dtypes: float64(118)
memory usage: 78.8 MB
<class 'pandas.core.frame.DataFrame'>
UInt64Index: 86752 entries, 0 to 87363
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   source          86752 non-null  category
 1   speaker_id      86752 non-null  category
 2   speaker_gender  86752 non-null  category
 3   emo             86752 non-null  category
 4   valence         86752 non-null  int8    
 5   lang1           86752 non-null  category
 6   length          86752 non-null  category
dtypes: category(6), int8(1)
memory usage: 1.4 MB
time: 31 ms


# Strata

In [7]:
# Number of cross-validation folds; also the minimum size a stratum must reach.
N_SPLITS = 7

# Fields are concatenated into one stratum key per sample, so only the
# combinations that actually occur in the data become strata.
strata_fields = ["source", "speaker_gender", "emo", "valence", "lang1", "length"]

# Cast every field to str on a fresh frame (valence is numeric, the others are
# categorical) instead of assigning a column by attribute on a `.loc` slice.
strata_frame = labels.loc[:, strata_fields].astype(str)

# Vectorized concatenation replaces the row-wise `"".join` apply. The other
# columns are passed as arrays so no index alignment takes place, and the
# result is left unnamed like the output of the former apply.
strata = (
    strata_frame[strata_fields[0]]
    .str.cat([strata_frame[field].to_numpy() for field in strata_fields[1:]])
    .rename(None)
)

Dask Apply: 100%|██████████| 16/16 [00:03<00:00,  5.09it/s]


time: 6.39 s


In [8]:
# utility function for identifying strata with only i occurrences
def get_solo(i: int, strata_: pd.Series) -> np.ndarray:
    """Return the distinct strata that have exactly ``i`` members, sorted.

    The previous implementation shuffled the matching rows before passing them
    to ``np.unique``; since ``np.unique`` returns sorted values, the shuffle had
    no effect on the result. The same sorted array is now built directly from
    the value counts.

    Args:
        i: the exact stratum size to look for.
        strata_: stratum membership of every sample.

    Returns:
        Sorted array of the stratum labels that occur exactly ``i`` times in
        ``strata_`` (empty when there are none).
    """
    strata_counts = strata_.value_counts()
    matching = strata_counts.loc[strata_counts == i].index.to_numpy()
    return np.sort(matching)


# collect the undersized strata for every size below n_splits, printing counts
def get_onlys(
    strata_: pd.Series, print_me: str = "", n_splits: int = N_SPLITS
) -> List[Dict[int, np.ndarray]]:
    """Report and collect the strata of each size from 1 to ``n_splits - 1``.

    ``print_me`` is always printed first (an empty line by default), followed
    by one "only i: <count>" line per size.

    Args:
        strata_: stratum membership of every sample.
        print_me: header text printed before the per-size counts.
        n_splits: sizes 1 .. n_splits - 1 are inspected.

    Returns:
        One ``{i: strata_of_size_i}`` dict per size that has at least one
        stratum, in increasing order of ``i``; empty when there are none.
    """
    print(print_me)
    found: List[Dict[int, np.ndarray]] = []
    for i in range(1, n_splits):
        members: np.ndarray = get_solo(i, strata_)
        n_members = members.size
        print(f"only {i}:", n_members)
        if n_members:  # >= 1 strata with only i samples
            found.append({i: members})
    return found


def process_strata(strata: pd.Series, n_splits: int = N_SPLITS) -> pd.Series:
    """Merge undersized strata until none has fewer than ``n_splits`` members.

    Each pass asks ``get_onlys`` for the strata whose size is below
    ``n_splits`` and relabels groups of them as ``stratum_group_<k>``. Passes
    repeat until ``get_onlys`` returns nothing.

    * If only one undersized size remains, its strata are merged in tuplets,
      and whatever is left over is either paired with itself or, when a single
      stratum is left, attached to one of the smallest strata that already has
      at least ``n_splits`` members.
    * Otherwise the smallest remaining size is paired with the largest
      remaining size, working inwards; a leftover middle size is paired with
      itself.

    Args:
        strata: stratum membership of every sample.
        n_splits: minimum number of members every stratum must end up with.

    Returns:
        A series like ``strata`` in which merged strata carry their new
        ``stratum_group_<k>`` label.

    Raises:
        ValueError: if a single leftover stratum needs a partner but no
            stratum has at least ``n_splits`` members. (The previous
            implementation looped forever in this situation.)
    """

    # count numbers the merged groups; get_onlys_calls numbers the passes
    count = get_onlys_calls = 0

    while onlys := get_onlys(
        strata,
        print_me=f"merge passes performed: {get_onlys_calls}",
        n_splits=n_splits,
    ):
        get_onlys_calls += 1
        if len(onlys) == 1:
            # only one undersized size is left
            only_key, strata_to_merge = next(iter(onlys[0].items()))
            # number of strata of size only_key needed to reach n_splits members
            tuplet_size = n_splits // only_key + (1 if n_splits % only_key else 0)
            # perform tuplet merge
            # NOTE(review): dividing by n_splits rather than tuplet_size leaves
            # most strata to the remainder handling below -- looks unintended,
            # but changing it would alter the resulting groups; confirm.
            interval = len(strata_to_merge) // n_splits
            for strata_tuplet in zip(
                *[
                    strata_to_merge[interval * i : interval * (i + 1)]
                    for i in range(tuplet_size)
                ]
            ):
                strata = strata.replace(strata_tuplet, f"stratum_group_{count}")
                count += 1
            remainder = strata_to_merge[tuplet_size * interval :]
            if len(remainder) == 1:
                # process remainder unmatched: attach it to one of the smallest
                # strata that already has at least n_splits members
                strata_counts = strata.value_counts()
                large_enough = strata_counts.loc[strata_counts >= n_splits]
                if large_enough.empty:
                    raise ValueError(
                        f"cannot merge leftover stratum {remainder[0]!r}: "
                        f"no stratum has at least {n_splits} members"
                    )
                candidates = large_enough.loc[large_enough == large_enough.min()]
                strata = strata.replace(
                    [remainder[0], candidates.sample(n=1, random_state=SEED).index[0]],
                    f"stratum_group_{count}",
                )
                count += 1
            else:
                # self-pair last; an odd one out is picked up by the next pass
                remainder = remainder.tolist()
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
        else:

            def pop_onlys(position: int) -> list:
                """Remove one entry of onlys and return its strata as a list."""
                return list(onlys.pop(position).values())[0].tolist()

            while len(onlys) >= 2:
                # pop the ends: smallest size first, largest size last
                shortside = pop_onlys(0)
                longside = pop_onlys(-1)
                # merge until one end empty; leftovers wait for the next pass
                while shortside and longside:
                    strata = strata.replace(
                        (shortside.pop(), longside.pop()), f"stratum_group_{count}"
                    )
                    count += 1
            if onlys:
                # self-pair middle
                remainder = pop_onlys(0)
                while len(remainder) >= 2:
                    strata = strata.replace(
                        (remainder.pop(), remainder.pop()), f"stratum_group_{count}"
                    )
                    count += 1
    return strata


_ = gc_collect()

time: 101 ms


In [9]:
# Merge undersized strata, then show the resulting stratum sizes.
STRATA = process_strata(strata, n_splits=N_SPLITS)
STRATA.value_counts()


def cross_validator():
    """Return a fresh iterator over the stratified, speaker-grouped CV splits.

    Samples are stratified on STRATA and grouped by speaker_id; the splitter
    shuffles with a fixed random_state, and a new one is built on every call.
    """
    splitter = StratifiedGroupKFold(
        n_splits=N_SPLITS, shuffle=True, random_state=SEED
    )
    return splitter.split(X=data, y=STRATA, groups=labels.speaker_id)

merge passes performed: 0
only 1: 53
only 2: 39
only 3: 31
only 4: 17
only 5: 27
only 6: 13
merge passes performed: 1
only 1: 40
only 2: 12
only 3: 14
only 4: 0
only 5: 0
only 6: 0
merge passes performed: 2
only 1: 26
only 2: 0
only 3: 0
only 4: 20
only 5: 0
only 6: 0
merge passes performed: 3
only 1: 6
only 2: 0
only 3: 0
only 4: 0
only 5: 20
only 6: 0
merge passes performed: 4
only 1: 0
only 2: 0
only 3: 0
only 4: 0
only 5: 14
only 6: 6
merge passes performed: 5
only 1: 0
only 2: 0
only 3: 0
only 4: 0
only 5: 8
only 6: 0
merge passes performed: 6
only 1: 0
only 2: 0
only 3: 0
only 4: 0
only 5: 0
only 6: 0


MELDmneu1engmedium    2905
MELDfneu1engmedium    2452
esdfsur0engmedium     1750
esdfhap2engmedium     1750
esdfneu1engmedium     1750
                      ... 
stratum_group_33         7
stratum_group_38         7
stratum_group_1          7
stratum_group_35         7
stratum_group_6          7
Length: 455, dtype: int64

time: 2.05 s


# Hyperparameters

In [10]:
def rf_params():
    """Return a fresh dict of RandomForestClassifier keyword arguments.

    Only referenced by the commented-out random forest in the Models section.
    """
    return {
        "n_estimators": 342,
        "criterion": "entropy",
        "max_depth": 17,
        "min_samples_split": 9,
        "min_samples_leaf": 4,
        "max_features": "sqrt",
        "max_leaf_nodes": 2207,
        "bootstrap": True,
        "oob_score": True,
        "n_jobs": -1,
        "random_state": SEED,
        "warm_start": False,
        "class_weight": "balanced_subsample",
    }


# smoothing applied by the GaussianNB base estimator
var_smoothing = 0.10556503264086932


def gnb_params():
    """Return a fresh dict of BaggingClassifier keyword arguments.

    A new GaussianNB base estimator is created on every call.
    """
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator` -- confirm against the version this notebook is pinned to.
    return {
        "base_estimator": GaussianNB(var_smoothing=var_smoothing),
        "n_estimators": 23,
        "oob_score": True,
        "warm_start": False,
        "n_jobs": -1,
        "random_state": SEED,
    }


def calibration_params():
    """Return a fresh dict of CalibratedClassifierCV keyword arguments.

    The CV splits are materialised into a list on every call.
    """
    return {
        "method": "isotonic",
        "cv": list(cross_validator()),
        "n_jobs": -1,
    }


# regularisation strength of the RidgeClassifier
alpha = 0.960072

_ = gc_collect()

time: 140 ms


# Models

In [11]:
# Feature subset for the GaussianNB variants (only used by the disabled models)
gnb_data = data.loc[:, gnb_features]
# plain_gnb = lambda: GaussianNB(var_smoothing=var_smoothing).fit(gnb_data, y_true)
# bagging_gnb = lambda: BaggingClassifier(**gnb_params()).fit(gnb_data, y_true)
# rf = lambda: RandomForestClassifier(**rf_params()).fit(data, y_true)


def ridge():
    """Fit and return an isotonic-calibrated RidgeClassifier on all features."""
    calibrated_ridge = CalibratedClassifierCV(
        base_estimator=RidgeClassifier(alpha=alpha, random_state=SEED),
        **calibration_params(),
    )
    return calibrated_ridge.fit(data, y_true)


def voting():
    """Fit and return the soft-voting ensemble of ridge and bagged GaussianNB.

    Both members are wrapped in CalibratedClassifierCV; the GaussianNB member
    sits behind a ColumnTransformer that passes through only `gnb_features`.
    """
    ridge_member = CalibratedClassifierCV(
        base_estimator=RidgeClassifier(alpha=alpha, random_state=SEED),
        **calibration_params(),
    )

    gnb_selector = ColumnTransformer(
        transformers=[("selector", "passthrough", gnb_features)],
        n_jobs=-1,
    )
    gnb_classifier = CalibratedClassifierCV(
        base_estimator=BaggingClassifier(**gnb_params()),
        **calibration_params(),
    )
    gnb_member = Pipeline(
        steps=[
            ("select_features", gnb_selector),
            ("clf", gnb_classifier),
        ]
    )

    ensemble = VotingClassifier(
        estimators=[("ridge", ridge_member), ("gnb", gnb_member)],
        voting="soft",
        n_jobs=-1,
    )
    return ensemble.fit(data, y_true)


_ = gc_collect()

time: 134 ms


In [12]:
# Each factory call below fits its estimator immediately, so building this dict
# is where training happens. The keys are reused as file names when the fitted
# models are saved.
models = {
    # "plain_GNB": plain_gnb(),
    # "bagging_GNB": bagging_gnb(),
    # "randomforest": rf(),
    "voting_ensemble_gnb_ridge": voting(),
    "ridge": ridge(),
}

time: 13 s


In [13]:
from os import makedirs

# dump() fails if the target folder is missing, so create it first; this keeps
# the cell runnable on a fresh checkout where ./prototypes does not exist yet.
makedirs(f"{DATA_OUT_FOLDER}/prototypes", exist_ok=True)

# Persist every fitted model under its key from `models`.
for model, estimator in tqdm(models.items()):
    dump(estimator, f"{DATA_OUT_FOLDER}/prototypes/{model}.joblib")

  0%|          | 0/2 [00:00<?, ?it/s]

['./prototypes/voting_ensemble_gnb_ridge.joblib']

['./prototypes/ridge.joblib']

time: 276 ms


# Discussion

The post hoc calibrated ridge classifier is quick to train: fitting both it and the voting ensemble took about 13 s in total.

In [14]:
# Report the wall-clock time since notebook_begin_time was set in the first cell
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 32.32737708091736 s
time: 172 ms


[^top](#Contents)