# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Process data](#Process-data)
* [Preview](#Preview)

## Introduction

This notebook concatenates the stored FRILL embedding shards and saves the aggregated feather files to the output folder configured below (`DATA_OUT_FOLDER`, by default the notebook's own directory).

## Imports and configuration

In [1]:
# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number source this notebook uses.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and then
    applies it to Python's ``random`` module, NumPy's global generator and
    TensorFlow's global seed, in that order.

    Args:
        seed: Integer seed shared by all generators.
    """
    # NOTE(review): the variable is set after the interpreter has started, so
    # it presumably only reaches processes launched from here on - confirm.
    environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random_seed, np_seed, set_seed):
        apply_seed(seed)


# Seed everything once; SEED stays available as a notebook-level constant.
SEED = 2021
reset_seeds(SEED)

In [2]:
# Extensions
# lab_black / nb_black: presumably the Black auto-formatting hooks for
# JupyterLab and the classic Notebook respectively - TODO confirm both are needed
%load_ext lab_black
%load_ext nb_black
# autotime: source of the "time: ..." line reported after each later cell
%load_ext autotime

In [3]:
# Core
import pandas as pd

# tensorflow & tensorflow_hub
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub

# util
from gc import collect as gc_collect

# display outputs w/o print calls
# ("all" renders every bare expression in a cell, not only the last one;
# the preview cells rely on this to show several views of a frame at once)
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# hide warnings
# NOTE(review): no category is given, so every warning is suppressed from
# here on, including deprecation notices from the libraries imported above
import warnings

warnings.filterwarnings("ignore")

time: 1.71 s


In [4]:
# Location of data in .feather format (the numbered shards to aggregate).
# The default is the original machine-specific path; set the
# FEATHER_DF_FOLDER environment variable to point at another copy of the
# data without editing the notebook.
FEATHER_DF_FOLDER = environ.get("FEATHER_DF_FOLDER", "D:/interim_data")

# Location where this notebook will output the aggregated files
# (override with the DATA_OUT_FOLDER environment variable).
DATA_OUT_FOLDER = environ.get("DATA_OUT_FOLDER", ".")

time: 10 ms


In [5]:
def process_data(dev: bool) -> None:
    """Concatenate the numbered FRILL feather shards and save one aggregate file.

    Shards are read from ``FEATHER_DF_FOLDER`` as ``<prefix>_0.feather``,
    ``<prefix>_1.feather``, ... until the first index that does not exist.
    The prefix is ``dev_FRILL`` when ``dev`` is true and
    ``nondev_feathers/nondev_FRILL`` otherwise. The combined frame gets a
    fresh 0..n-1 index and is written to ``DATA_OUT_FOLDER`` as
    ``dev_FRILL.feather`` or ``nondev_FRILL.feather``.

    Args:
        dev: True to aggregate the dev shards, False for the nondev shards.

    Raises:
        ValueError: If not even shard 0 exists; the message names the path
            that was looked for.
    """
    gc_collect()  # run a collection pass before loading the shards
    prefix = f"{'dev' if dev else 'nondev_feathers/nondev'}_FRILL"
    dfs = []
    i = 0
    while True:
        try:
            dfs.append(pd.read_feather(f"{FEATHER_DF_FOLDER}/{prefix}_{i}.feather"))
        except FileNotFoundError:
            # the first gap in the numbering marks the end of the sequence
            break
        i += 1
    if not dfs:
        # pd.concat would fail with an opaque "No objects to concatenate"
        raise ValueError(
            f"No feather shards found; expected {FEATHER_DF_FOLDER}/{prefix}_0.feather"
        )
    # ignore_index=True yields the same fresh RangeIndex as a separate
    # reset_index(drop=True), without the extra intermediate frame
    combined = pd.concat(dfs, ignore_index=True)
    # release the individual shards before serialising so that only the
    # combined frame is held while the file is written
    del dfs
    combined.to_feather(f"{DATA_OUT_FOLDER}/{'' if dev else 'non'}dev_FRILL.feather")

time: 24 ms


In [6]:
# Aggregate the dev shards into {DATA_OUT_FOLDER}/dev_FRILL.feather
process_data(dev=True)

time: 2.81 s


In [7]:
# Aggregate the nondev shards into {DATA_OUT_FOLDER}/nondev_FRILL.feather
process_data(dev=False)

time: 55.6 s


In [8]:
# Extra collection pass after the last aggregation (process_data itself only
# collects on entry); assigned to `_` so the return value is not echoed
_ = gc_collect()

time: 130 ms


## Preview

Let's preview the aggregated feather files.

In [9]:
# Reload the dev aggregate from disk to check what was actually written.
# The frame gets an explicit name: `_` is IPython's last-output variable and
# should not double as a holder for a large DataFrame.
dev_preview = pd.read_feather(f"{DATA_OUT_FOLDER}/dev_FRILL.feather")
dev_preview.head(1)
dev_preview.info()
dev_preview
del dev_preview
# assigned so the collector's return value is not displayed
# (every bare expression in a cell is rendered)
_ = gc_collect()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,id
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17703 entries, 0 to 17702
Columns: 2049 entries, 0 to id
dtypes: float32(2048), uint32(1)
memory usage: 138.4 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,id
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.062330,0
1,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756,2
2,-0.028015,-0.041617,0.012599,-0.031460,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.032090,...,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.057070,-0.029945,4
3,-0.008712,-0.003798,-0.052170,0.030298,0.049662,0.039806,0.056743,-0.186714,-0.151110,0.021191,...,0.053569,-0.067552,-0.017051,-0.109469,-0.082999,-0.049325,0.031638,-0.027172,0.006479,6
4,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.032980,...,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.046610,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17698,0.180011,0.120602,0.042780,0.039607,-0.064516,0.037701,0.064375,0.006766,-0.067956,0.040594,...,-0.039278,-0.079617,-0.181004,0.034883,-0.065171,-0.092994,0.238404,-0.038211,-0.051745,39243
17699,-0.055477,-0.037197,0.098652,-0.055857,-0.085353,0.107524,0.007907,-0.066685,-0.005000,0.029096,...,0.095036,0.016983,-0.037770,0.062996,-0.017238,0.093098,0.053679,0.052871,-0.021866,39244
17700,-0.053369,0.019776,0.073188,0.076743,0.072065,0.134633,-0.040998,-0.052178,-0.031286,-0.028797,...,0.006242,0.092614,0.000562,0.006797,0.022133,-0.021554,0.164321,0.076711,-0.101797,39245
17701,0.069159,0.073854,-0.023457,-0.000237,0.014641,-0.042507,-0.038138,0.101064,-0.025217,0.040861,...,-0.028208,0.002690,-0.046130,0.073901,-0.077858,0.015327,0.048157,-0.007762,-0.101641,39246


time: 623 ms


In [10]:
# Reload the nondev aggregate from disk to check what was actually written.
# The frame gets an explicit name: `_` is IPython's last-output variable and
# should not double as a holder for a large DataFrame.
nondev_preview = pd.read_feather(f"{DATA_OUT_FOLDER}/nondev_FRILL.feather")
nondev_preview.head(1)
nondev_preview.info()
nondev_preview
del nondev_preview
# assigned so the collector's return value is not displayed
# (every bare expression in a cell is rendered)
_ = gc_collect()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,id
0,0.074631,-0.017184,0.013033,-0.028294,0.017073,0.011132,-0.052272,0.077469,-0.127637,0.056547,...,-0.0106,-0.038199,-0.130908,-0.096763,-0.009632,-0.028987,0.061459,-0.064077,0.069334,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69049 entries, 0 to 69048
Columns: 2049 entries, 0 to id
dtypes: float32(2048), uint32(1)
memory usage: 539.7 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,id
0,0.074631,-0.017184,0.013033,-0.028294,0.017073,0.011132,-0.052272,0.077469,-0.127637,0.056547,...,-0.010600,-0.038199,-0.130908,-0.096763,-0.009632,-0.028987,0.061459,-0.064077,0.069334,1
1,0.004854,-0.027279,-0.094140,-0.126076,-0.140316,-0.058383,0.015511,0.046667,-0.024869,-0.060984,...,-0.015468,0.068417,-0.081816,-0.038204,0.033283,-0.055570,-0.028610,0.005424,-0.042908,3
2,0.064436,0.001500,-0.035291,0.011047,0.063933,0.088772,0.032510,-0.018022,0.042950,0.026433,...,-0.060911,0.089016,0.000265,-0.055298,0.070568,0.052941,0.057260,0.069311,-0.046155,5
3,-0.038411,-0.006024,0.047844,0.034920,0.029144,-0.007714,-0.069259,-0.102116,-0.036844,-0.007443,...,0.025900,-0.076937,-0.046671,-0.031731,0.028090,0.038855,-0.044481,0.097526,0.095964,7
4,0.017658,0.006395,-0.112567,0.027114,-0.128276,0.007152,-0.097692,-0.032338,-0.032595,-0.043031,...,0.017274,0.043451,0.003549,-0.200501,0.040738,0.007812,-0.024630,-0.034282,-0.007596,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69044,0.032284,0.079231,0.012629,0.037832,0.030645,0.037029,0.001485,-0.060416,0.014532,-0.015883,...,0.033543,0.040000,0.001393,-0.049468,0.043380,-0.019887,-0.000053,-0.009738,-0.057786,87359
69045,0.102379,-0.040497,-0.090511,-0.079831,-0.023239,0.007808,-0.029889,0.067105,0.003831,-0.039205,...,0.164508,-0.002019,-0.062618,0.055444,0.043677,-0.083234,-0.013649,0.155912,-0.011944,87360
69046,-0.076627,-0.035694,-0.008981,-0.110329,0.001256,-0.040823,-0.054973,0.068552,-0.041609,-0.040817,...,-0.002083,0.040230,0.012301,-0.072455,0.093254,0.103900,0.206039,0.059684,-0.027905,87361
69047,-0.032972,-0.017472,-0.036656,-0.109592,-0.059677,-0.006132,-0.058374,0.096132,0.047664,-0.054570,...,0.166648,0.002944,-0.093481,-0.041031,-0.016075,-0.059306,0.102830,0.092163,-0.065293,87362


time: 1.06 s


[^top](#Contents)