# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Resampling](#Resampling)
* [Results](#Results)

# Introduction

To speed up development, the 5-fold train-test splits were preprocessed and saved as separate .feather files. This notebook loads those splits and resamples the training embeddings directly, without first scaling the features or reducing their dimensionality. Resampling directly assumes that the FRILL extraction process already produces scaled output that captures the relevant dimensions.

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number source this notebook imports.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable, then seeds
    the stdlib ``random`` module, NumPy's global generator and TensorFlow's
    global generator, in that order.

    Args:
        seed: integer used for all of the generators above.
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment presumably only affects processes launched afterwards, not
    # the hash seed of the running kernel -- confirm.
    environ["PYTHONHASHSEED"] = str(seed)
    for seed_generator in (random_seed, np_seed, set_seed):
        seed_generator(seed)


SEED = 2021
reset_seeds(SEED)

# the seeding helpers are single-use; remove them to keep the namespace clean
del environ, random_seed, np_seed, set_seed, reset_seeds

In [2]:
# extensions
%load_ext autotime
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import numpy as np
import pandas as pd

# utility
from gc import collect as gc_collect
from tqdm.notebook import tqdm

# typing
from typing import Tuple

# faster sklearn
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# resampling
from imblearn.over_sampling import ADASYN, SVMSMOTE
from imblearn.under_sampling import OneSidedSelection
from sklearn.svm import SVC

# display outputs w/o print calls
# "all" renders every top-level expression statement of a cell rather than
# only the last one; this is also why throwaway results elsewhere in the
# notebook are assigned to `_` (to keep them from being displayed).
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

# hide warnings
# NOTE(review): no category is given, so this silences every warning for the
# rest of the session, including deprecation notices from the libraries above.
import warnings

warnings.filterwarnings("ignore")
del warnings

time: 2.43 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of CV .feather files
# Inputs are read from
# {CV_FEATHERS_FOLDER}/cv_<fold>/{X,y}_train_untransformed.feather
CV_FEATHERS_FOLDER = "."

# Location where this notebook will output
# Resampled data is written to
# {DATA_OUT_FOLDER}/cv_<fold>/{X,y}_train_resampled_<case>.feather
DATA_OUT_FOLDER = "."

# bind the collector's return value to `_` so the cell shows no output
_ = gc_collect()

time: 122 ms


# Resampling

In [5]:
def relabel(y_train: pd.DataFrame, case: str) -> np.ndarray:
    """Map ternary labels onto the label set required by ``case``.

    The incoming labels are encoded as 0: negative, 1: neutral, 2: positive.

    Args:
        y_train: frame holding the labels; its values are squeezed and cast
            to ``np.int8`` before relabeling.
        case: a string starting with "neg" yields negative-vs-rest labels
            (0, 1, 2 -> 1, 0, 0); a string starting with "neu" yields
            neutral-vs-rest labels (0, 1, 2 -> 0, 1, 0); any other value
            leaves the ternary labels unchanged.

    Returns:
        The relabeled values as an ``np.int8`` array.
    """
    labels = np.squeeze(y_train.values).astype(np.int8)

    if case.startswith("neg"):
        # shift: 0, 1, 2 -> -1, 0, 1; floor-halve: -> -1, 0, 0; negate: -> 1, 0, 0
        centered = labels - 1
        return -(centered // 2)

    if case.startswith("neu"):
        # parity singles out the middle class: 0, 1, 2 -> 0, 1, 0
        return labels % 2

    return labels


def resample(
    X_train: pd.DataFrame, y_train: pd.DataFrame, case: str
) -> Tuple[Tuple[pd.DataFrame, np.ndarray], str]:
    """Oversample with ADASYN (SVMSMOTE backup), undersample with OSS

    The labels are first relabeled for ``case``. Oversampling is then tried
    with each sampling strategy in priority order ("not majority",
    "minority", "all"): ADASYN first and, if it raises ``ValueError``,
    SVMSMOTE with a class-balanced SVC. The first sampler that succeeds wins.
    If every attempt fails, the data is left un-oversampled. In all cases the
    result is finally undersampled with OneSidedSelection.

    Args:
        X_train: training features.
        y_train: ternary training labels (see ``relabel``).
        case: label scheme; forwarded to ``relabel`` and returned unchanged.

    Returns:
        ``((X_resampled, y_resampled), case)``.
    """
    # relabel if ternary, return np.int8 numpy array
    labels: np.ndarray = relabel(y_train, case)

    # resample with ADASYN if possible with priority on sampling strategy
    for strategy in (
        "not majority",
        "minority",
        "all",
    ):  # majority class is not oversampled
        shared_params = {
            "sampling_strategy": strategy,
            "random_state": SEED,
            "n_jobs": -1,
        }
        try:
            train = ADASYN(**shared_params).fit_resample(X_train, labels)
            sampler = "ADASYN"
        except ValueError:
            try:
                train = SVMSMOTE(
                    **shared_params,
                    svm_estimator=SVC(class_weight="balanced", random_state=SEED),
                ).fit_resample(X_train, labels)
                sampler = "SVMSMOTE"
            except Exception:
                # Best effort: move on to the next strategy. Catching
                # Exception rather than using a bare except lets
                # KeyboardInterrupt/SystemExit stop a long-running fit
                # instead of being swallowed.
                continue
        print(f"oversampled with {sampler} | strategy: {strategy}")
        break
    else:
        # no strategy/sampler combination worked; only undersample below
        print("Failed to upsample.")
        train = (X_train, labels)
    return (OneSidedSelection(random_state=SEED, n_jobs=-1).fit_resample(*train), case)


def save_data(
    train_data: Tuple[pd.DataFrame, np.ndarray], case: str, fold_num: int
) -> None:
    """Save X_train and y_train as .feather pd.DataFrame

    Files are written to
    ``{DATA_OUT_FOLDER}/cv_{fold_num}/{X|y}_train_resampled_{case}.feather``.
    The fold directory is created when it does not exist yet, so
    ``DATA_OUT_FOLDER`` may point somewhere other than the input folder.

    Args:
        train_data: ``(X_train, y_train)`` pair to persist.
        case: label scheme name embedded in the file names.
        fold_num: cross-validation fold index embedded in the directory name.
    """
    from os import makedirs

    fold_dir = f"{DATA_OUT_FOLDER}/cv_{fold_num}"
    # without this, a fresh output location would fail only after the
    # expensive resampling step had already finished
    makedirs(fold_dir, exist_ok=True)

    def path(xy: str) -> str:
        """Build the output file path for the "X" or "y" part."""
        return f"{fold_dir}/{xy}_train_resampled_{case}.feather"

    X_train, y_train = train_data
    X_train.reset_index(drop=True).to_feather(path("X"))
    # drop the local reference to the feature frame before handling labels
    del X_train
    _ = gc_collect()
    y_frame = pd.DataFrame(y_train).reset_index(drop=True)
    # wrapping the array yields integer column labels; the original code
    # converts them to strings before writing feather, so keep doing so
    y_frame.columns = y_frame.columns.astype(str)
    y_frame.to_feather(path("y"))


_ = gc_collect()

time: 98.9 ms


In [6]:
# Resample every available fold: folds are numbered from 0 and processing
# stops at the first fold whose training files cannot be found.
fold_num = 0
while True:
    fold_begin = time()

    # load training data
    try:
        X_train: pd.DataFrame = pd.read_feather(
            f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/X_train_untransformed.feather"
        )
        y_train: pd.DataFrame = pd.read_feather(
            f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/y_train_untransformed.feather"
        )
    except FileNotFoundError:
        break

    # A tuple (not a set) fixes the processing order: string hashing is
    # randomized per interpreter process, so a set's iteration order can change
    # between kernel restarts. With a fixed order, the three
    # "oversampled with ..." lines per fold map to neg, neu, ter in turn.
    for case in tqdm(("neg", "neu", "ter")):
        # save resampled
        save_data(*resample(X_train, y_train, case=case), fold_num=fold_num)
    del X_train
    del y_train
    _ = gc_collect()

    print(f"fold {fold_num + 1} completed in {time() - fold_begin:.2f} s")
    del fold_begin

    fold_num += 1
    _ = gc_collect()

  0%|          | 0/3 [00:00<?, ?it/s]

oversampled with ADASYN | strategy: not majority
oversampled with SVMSMOTE | strategy: not majority
oversampled with ADASYN | strategy: not majority
fold 1 completed in 1363.18 s


  0%|          | 0/3 [00:00<?, ?it/s]

oversampled with ADASYN | strategy: not majority
oversampled with SVMSMOTE | strategy: not majority
oversampled with ADASYN | strategy: not majority
fold 2 completed in 2300.97 s


  0%|          | 0/3 [00:00<?, ?it/s]

oversampled with ADASYN | strategy: not majority
oversampled with ADASYN | strategy: not majority
oversampled with ADASYN | strategy: not majority
fold 3 completed in 910.40 s


  0%|          | 0/3 [00:00<?, ?it/s]

oversampled with ADASYN | strategy: not majority
oversampled with SVMSMOTE | strategy: not majority
oversampled with ADASYN | strategy: not majority
fold 4 completed in 939.63 s


  0%|          | 0/3 [00:00<?, ?it/s]

oversampled with ADASYN | strategy: not majority
oversampled with ADASYN | strategy: not majority
oversampled with ADASYN | strategy: not majority
fold 5 completed in 1034.90 s
time: 1h 49min 9s


# Results

Let's peek at the results.

In [7]:
# Spot-check one resampled file per (fold, part, case) combination below.
# Paths are built from DATA_OUT_FOLDER so the preview always reads what this
# notebook wrote, even when the output location is reconfigured.
from IPython.display import display

for preview_fold, preview_part, preview_case in (
    (0, "X", "neg"),
    (1, "X", "neu"),
    (2, "X", "ter"),
    (3, "y", "neg"),
    (4, "y", "neu"),
    (0, "y", "ter"),
):
    preview = pd.read_feather(
        f"{DATA_OUT_FOLDER}/cv_{preview_fold}/"
        f"{preview_part}_train_resampled_{preview_case}.feather"
    )
    # expressions inside a loop body are not auto-displayed, so show explicitly
    display(preview.head(5))
    preview.info()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.173381,-0.000885,-0.028134,0.063979,-0.040914,0.089768,0.00942,-0.009658,-0.059898,-0.058973,...,-0.088983,0.095415,0.054035,-0.017342,0.0055,0.004635,-0.043357,-0.098072,-0.16029,0.09108
1,0.143889,-0.114034,-0.04634,0.015698,0.026511,0.055782,0.011183,-0.035858,-0.093313,0.030949,...,-0.146586,-0.033626,0.063095,-0.07228,-0.001062,0.021578,0.041127,0.090077,-0.024593,0.01682
2,0.043429,-0.024921,0.044928,-0.040725,-0.01998,0.042618,0.021618,-0.110268,-0.080457,0.013304,...,-0.046219,0.090248,-0.031559,-0.054432,-0.084632,-6.4e-05,0.09424,0.131143,-0.048901,-0.05246
3,0.011617,0.006525,-0.078876,0.002558,-0.020666,0.051838,-0.026137,0.043226,-0.025522,-0.033791,...,-0.162452,0.186429,0.065395,-0.030865,-0.010493,-0.064914,-0.030822,0.007724,0.013289,-0.083308
4,-0.00366,0.150072,-0.005546,0.095793,-0.02379,0.023972,-0.026603,0.002959,0.03838,0.025611,...,-0.095275,-0.055991,0.01178,-0.00296,-0.180334,0.014035,0.049368,0.100761,0.053365,0.145571


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38353 entries, 0 to 38352
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 299.6 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.226529,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233
1,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,-0.169826,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756
2,-0.028015,-0.041617,0.012599,-0.03146,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.03209,...,0.059661,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.05707,-0.029945
3,-0.008712,-0.003798,-0.05217,0.030298,0.049662,0.039806,0.056743,-0.186714,-0.15111,0.021191,...,0.107442,0.053569,-0.067552,-0.017051,-0.109469,-0.082999,-0.049325,0.031638,-0.027172,0.006479
4,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.03298,...,-0.128375,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.04661


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111138 entries, 0 to 111137
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 868.3 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.226529,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233
1,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,-0.169826,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756
2,-0.028015,-0.041617,0.012599,-0.03146,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.03209,...,0.059661,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.05707,-0.029945
3,-0.008712,-0.003798,-0.05217,0.030298,0.049662,0.039806,0.056743,-0.186714,-0.15111,0.021191,...,0.107442,0.053569,-0.067552,-0.017051,-0.109469,-0.082999,-0.049325,0.031638,-0.027172,0.006479
4,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.03298,...,-0.128375,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.04661


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109736 entries, 0 to 109735
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 857.3 MB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32170 entries, 0 to 32169
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       32170 non-null  int8 
dtypes: int8(1)
memory usage: 31.5 KB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103978 entries, 0 to 103977
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   0       103978 non-null  int8 
dtypes: int8(1)
memory usage: 101.7 KB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117594 entries, 0 to 117593
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   0       117594 non-null  int8 
dtypes: int8(1)
memory usage: 115.0 KB
time: 4.41 s


In [8]:
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 6564.617095708847 s
time: 105 ms


[^top](#Contents)