# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Load FRILL module](#Load-FRILL-module)
* [Load data](#Load-data)
* [Process data](#Process-data)
* [Preview](#Preview)

## Introduction

This notebook extracts and stores FRILL embeddings.

## Imports and configuration

In [1]:
# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Utility function for resetting random seeds"""
    environ["PYTHONHASHSEED"] = str(seed)
    random_seed(seed)
    np_seed(seed)
    set_seed(seed)


reset_seeds(SEED := 2021)

del environ
del random_seed
del np_seed
del set_seed
del reset_seeds
del SEED



In [2]:
# Extensions
%load_ext lab_black
%load_ext nb_black
%load_ext autotime

In [3]:
# Core
import pandas as pd
import numpy as np

# tensorflow & tensorflow_hub
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub

# Python <3.9 type hints
from typing import List

# util
import swifter
from gc import collect as gc_collect

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

# hide warnings
import warnings

warnings.filterwarnings("ignore")
del warnings

time: 3.81 s


In [4]:
# Location of pickled dataframes
PICKLED_DF_FOLDER = "D:/interim_data"

# Location where this notebook will output
DATA_OUT_FOLDER = "D:/interim_data"

# Location where the FRILL module is stored locally
LOCAL_FRILL = "../../../FRILL/"

_ = gc_collect()

time: 108 ms


## Load FRILL module

In [5]:
# Load FRILL
tf.enable_v2_behavior()
# module = hub.load("https://tfhub.dev/google/nonsemantic-speech-benchmark/frill/1")
module = hub.load(LOCAL_FRILL)
del LOCAL_FRILL

time: 30.8 s


In [6]:
_ = gc_collect()

time: 719 ms


## Process data

In [7]:
def process_dfs(
    # last, start_from, and skip parameters are for resuming interrupted notebook sessions
    dev: bool,
    last: int = None,
    start_from: int = 0,
    skip: List[int] = [],
) -> None:
    """For each dataframe in a list, extracts FRILL embeddings from a prepared pd.Series, expands them into a dataframe, and saves that dataframe."""
    i = start_from
    while True:
        if i in skip:
            if last and i >= last:
                break
            i += 1
            continue
        _ = gc_collect()
        try:
            df = pd.read_pickle(
                f"{PICKLED_DF_FOLDER}/{'dev' if dev else 'nondev_prefrill/nondev'}_prefrill_{i}.pkl"
            )
        except FileNotFoundError:
            break
        _ = gc_collect()
        df["frill"] = df.ragged.swifter.apply(lambda _: module(_)["embedding"][0])
        _ = gc_collect()
        frill = pd.DataFrame(df.frill.tolist())
        _ = gc_collect()
        frill = frill.astype(np.float32)
        frill.columns = frill.columns.astype(str)
        frill["id"] = df.id
        del df
        _ = gc_collect()
        frill.to_feather(
            f"{DATA_OUT_FOLDER}/{'dev' if dev else 'nondev_feathers/nondev'}_FRILL_{i}.feather"
        )
        del frill
        i += 1
        _ = gc_collect()
        del _

time: 32 ms


In [None]:
process_dfs(dev=True)

In [8]:
_ = gc_collect()

time: 763 ms


In [9]:
# executed over several notebook sessions
process_dfs(dev=False, last=137, start_from=134)

Pandas Apply: 100%|██████████| 500/500 [05:01<00:00,  1.66it/s]
Pandas Apply: 100%|██████████| 500/500 [02:44<00:00,  3.03it/s]
Pandas Apply: 100%|██████████| 500/500 [06:23<00:00,  1.30it/s]
Pandas Apply: 100%|██████████| 500/500 [06:06<00:00,  1.36it/s]
Pandas Apply: 100%|██████████| 49/49 [00:41<00:00,  1.17it/s]


time: 31min 51s


## Preview

In [10]:
_ = gc_collect()

time: 1.93 s


In [None]:
pd.read_feather(f"{DATA_OUT_FOLDER}/dev_FRILL_3.feather").head(1)

In [12]:
pd.read_feather(f"{DATA_OUT_FOLDER}/nondev_feathers/nondev_FRILL_130.feather").head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,id
0,0.113702,0.003415,-0.009764,0.02254,0.099817,0.101802,0.010931,-0.150367,0.028713,-0.014945,...,-0.065913,-0.08238,0.039501,0.009846,0.028661,-0.05709,-0.038373,0.052529,-0.089197,83315


time: 4.96 s


[^top](#Contents)