# Purpose

2021-07-28.
Test whether inference speeds are faster when using GPUs (it **should** be much faster).

---
2021-06-28.
Test different parallel frameworks to speed up getting embeddings from USE-multilingual.

test:
- list
- tf.apply
- dask
    - had a weird time trying to compute these...

Had a hard time installing these:
- ~modin~
- ~pandarallel~


# Notebook setup

In [2]:
%load_ext autoreload
%autoreload 2

In [35]:
from datetime import datetime
import gc
import os
import logging
from pprint import pprint

from tqdm.auto import tqdm
# import fse
# from fse.models import uSIF
# import gensim
# from gensim.models.fasttext import FastText, load_facebook_vectors
import joblib

import math
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

# modin df
# import modin
# import modin.pandas as mpd

import subclu
# from subclu.data.fasttext_utils import (
#     download_ft_pretrained_model,
#     get_df_for_most_similar,
#     get_project_subfolder,
# )
from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)
from subclu.models.vectorize_text import get_embeddings_as_df
from subclu.data.data_loaders import LoadPosts, LoadSubreddits, create_sub_level_aggregates

print_lib_versions([
#     fse, gensim, joblib, modin, 
    np, pd, plotly, subclu
])

python		v 3.7.10
===
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
subclu		v: 0.3.2


In [36]:
# USE & TF-focused imports
import tensorflow
import tensorflow as tf
# import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
import tensorflow_text
from tensorflow.python.client import device_lib

print_lib_versions([tensorflow, tensorflow_text])

python		v 3.7.10
===
tensorflow	v: 2.3.3
tensorflow_text	v: 2.3.0


In [5]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Check whether we have access to a GPU

In [6]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
# %%time

# # don't set debug logging to True in this notebook because we'll be overloaded with TF debug statements
# tf.debugging.set_log_device_placement(True)

# # Create some tensors
# a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
# b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
# c = tf.matmul(a, b)

# print(c)

## List devices

In [8]:
l_phys_gpus = (
    tf.config.list_physical_devices('GPU') +
    tf.config.list_physical_devices('XLA_GPU')
)

print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}"
    f"\n\nGPUs\n==="
    f"\nNum GPUs Available: {len(l_phys_gpus)}"
    f"\nGPU details:\n{l_phys_gpus}"
)


Built with CUDA? True

GPUs
===
Num GPUs Available: 2
GPU details:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]


In [9]:
l_all_local_devices = device_lib.list_local_devices()
print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}"
    f"\n\nAll devices:\n==="
    f"\nNum devices: {len(l_all_local_devices)}"
    f"\nDetails:"
)
pprint(l_all_local_devices, indent=4,)


Built with CUDA? True

All devices:
===
Num devices: 4
Details:
[   name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12578137409028812590
,
    name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12482155329608343755
physical_device_desc: "device: XLA_CPU device"
,
    name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 12344900954379005170
physical_device_desc: "device: XLA_GPU device"
,
    name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14676252416
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5314190549655588020
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
]


# Check NVIDIA CLI

First, do we even see the GPUs?

In [10]:
!lspci | grep 3D

00:04.0 3D controller: NVIDIA Corporation TU104GL [Tesla T4] (rev a1)


Then, are they recognized by the nvidia-smi tool?

In [11]:
!nvidia-smi

Thu Jul 29 05:55:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    33W /  70W |    222MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Load model from hub

The first time we try a model might take a while because we might need to download it from the URL.

TF should cache it after that.

```
# time from CPU (after already downloaded)
module_url_large = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3' 
model = hub.load(module_url_large)
> CPU times: user 8.98 s, sys: 1.4 s, total: 10.4 s
> Wall time: 10.2 s


module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' 
model_normal = hub.load(module_url)
> CPU times: user 8.18 s, sys: 1.79 s, total: 9.98 s
> Wall time: 12 s
```

In [12]:
%%time
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
#@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']

module_url_large = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3' 
model = hub.load(module_url_large)

CPU times: user 9.4 s, sys: 1.68 s, total: 11.1 s
Wall time: 10.9 s


In [15]:
%%time
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
#@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']

module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' 
model_normal = hub.load(module_url)

CPU times: user 4.48 s, sys: 884 ms, total: 5.37 s
Wall time: 5.58 s


## Example from tutorial

Large-model times:
```
# CPU
CPU times: user 4.89 s, sys: 296 ms, total: 5.19 s
Wall time: 4.94 s

# GPU
CPU times: user 3.01 s, sys: 191 ms, total: 3.2 s
Wall time: 2.91 s
``` 

Normal-model times:
```
# CPU
CPU times: user 1.6 s, sys: 110 ms, total: 1.71 s
Wall time: 2.02 s

# GPU
CPU times: user 986 ms, sys: 6.29 ms, total: 992 ms
Wall time: 942 ms
```

In [13]:
# Some texts of different lengths.
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]

In [14]:
%%time

# Compute embeddings.
en_result = model(english_sentences)
it_result = model(italian_sentences)
ja_result = model(japanese_sentences)

CPU times: user 3.01 s, sys: 191 ms, total: 3.2 s
Wall time: 2.91 s


In [16]:
%%time

# Compute embeddings.
en_result_ = model_normal(english_sentences)
it_result_ = model_normal(italian_sentences)
ja_result_ = model_normal(japanese_sentences)

CPU times: user 986 ms, sys: 6.29 ms, total: 992 ms
Wall time: 942 ms


In [17]:
pd.DataFrame(en_result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,"tf.Tensor(-0.0052521327, shape=(), dtype=float32)","tf.Tensor(-0.038312, shape=(), dtype=float32)","tf.Tensor(-0.009220169, shape=(), dtype=float32)","tf.Tensor(-0.04609629, shape=(), dtype=float32)","tf.Tensor(-0.057657726, shape=(), dtype=float32)","tf.Tensor(-0.04768434, shape=(), dtype=float32)","tf.Tensor(-0.016511653, shape=(), dtype=float32)","tf.Tensor(0.004728578, shape=(), dtype=float32)","tf.Tensor(-0.013010713, shape=(), dtype=float32)","tf.Tensor(-0.07138159, shape=(), dtype=float32)","tf.Tensor(-0.030422537, shape=(), dtype=float32)","tf.Tensor(0.023165466, shape=(), dtype=float32)","tf.Tensor(0.017145865, shape=(), dtype=float32)","tf.Tensor(0.045331288, shape=(), dtype=float32)","tf.Tensor(0.01566995, shape=(), dtype=float32)","tf.Tensor(-0.013959975, shape=(), dtype=float32)","tf.Tensor(-0.053601336, shape=(), dtype=float32)","tf.Tensor(-0.011878502, shape=(), dtype=float32)","tf.Tensor(-0.11063093, shape=(), dtype=float32)","tf.Tensor(0.004849982, shape=(), dtype=float32)","tf.Tensor(0.0067631933, shape=(), dtype=float32)","tf.Tensor(0.039811607, shape=(), dtype=float32)","tf.Tensor(0.066015504, shape=(), dtype=float32)","tf.Tensor(-0.012721933, shape=(), dtype=float32)","tf.Tensor(0.0374415, shape=(), dtype=float32)","tf.Tensor(0.030250398, shape=(), dtype=float32)","tf.Tensor(-0.0152029535, shape=(), dtype=float32)","tf.Tensor(0.011007941, shape=(), dtype=float32)","tf.Tensor(0.016856829, shape=(), dtype=float32)","tf.Tensor(-0.019257626, shape=(), dtype=float32)",...,"tf.Tensor(0.08616641, shape=(), dtype=float32)","tf.Tensor(-0.024433242, shape=(), dtype=float32)","tf.Tensor(-0.00091490924, shape=(), dtype=float32)","tf.Tensor(0.038853932, shape=(), dtype=float32)","tf.Tensor(-0.031945094, shape=(), dtype=float32)","tf.Tensor(0.043825816, shape=(), dtype=float32)","tf.Tensor(0.058404706, shape=(), dtype=float32)","tf.Tensor(0.03934805, shape=(), dtype=float32)","tf.Tensor(0.08239186, shape=(), dtype=float32)","tf.Tensor(-0.028973889, shape=(), dtype=float32)","tf.Tensor(0.012636834, shape=(), dtype=float32)","tf.Tensor(-0.008682323, shape=(), dtype=float32)","tf.Tensor(0.0005825149, shape=(), dtype=float32)","tf.Tensor(0.075179294, shape=(), dtype=float32)","tf.Tensor(0.01735615, shape=(), dtype=float32)","tf.Tensor(0.012234447, shape=(), dtype=float32)","tf.Tensor(-0.019842023, shape=(), dtype=float32)","tf.Tensor(-0.07749192, shape=(), dtype=float32)","tf.Tensor(-0.024406087, shape=(), dtype=float32)","tf.Tensor(-0.032542042, shape=(), dtype=float32)","tf.Tensor(0.012990009, shape=(), dtype=float32)","tf.Tensor(-0.05957966, shape=(), dtype=float32)","tf.Tensor(0.026064062, shape=(), dtype=float32)","tf.Tensor(-0.052391514, shape=(), dtype=float32)","tf.Tensor(-0.05916792, shape=(), dtype=float32)","tf.Tensor(-0.025977194, shape=(), dtype=float32)","tf.Tensor(-0.0319761, shape=(), dtype=float32)","tf.Tensor(0.021790506, shape=(), dtype=float32)","tf.Tensor(0.06280179, shape=(), dtype=float32)","tf.Tensor(-0.016225366, shape=(), dtype=float32)"
1,"tf.Tensor(-0.022675816, shape=(), dtype=float32)","tf.Tensor(-0.06907185, shape=(), dtype=float32)","tf.Tensor(0.015508699, shape=(), dtype=float32)","tf.Tensor(-0.029039731, shape=(), dtype=float32)","tf.Tensor(-0.08988534, shape=(), dtype=float32)","tf.Tensor(-0.0010676654, shape=(), dtype=float32)","tf.Tensor(-0.014085721, shape=(), dtype=float32)","tf.Tensor(0.0072551165, shape=(), dtype=float32)","tf.Tensor(-0.019989628, shape=(), dtype=float32)","tf.Tensor(0.0669519, shape=(), dtype=float32)","tf.Tensor(0.01702949, shape=(), dtype=float32)","tf.Tensor(0.022945886, shape=(), dtype=float32)","tf.Tensor(-0.0061085713, shape=(), dtype=float32)","tf.Tensor(0.05118299, shape=(), dtype=float32)","tf.Tensor(-0.043797843, shape=(), dtype=float32)","tf.Tensor(-0.041236263, shape=(), dtype=float32)","tf.Tensor(0.029429307, shape=(), dtype=float32)","tf.Tensor(0.04888875, shape=(), dtype=float32)","tf.Tensor(-0.11650517, shape=(), dtype=float32)","tf.Tensor(-0.0024727243, shape=(), dtype=float32)","tf.Tensor(0.028916731, shape=(), dtype=float32)","tf.Tensor(9.1215996e-05, shape=(), dtype=float32)","tf.Tensor(0.024404425, shape=(), dtype=float32)","tf.Tensor(0.012593301, shape=(), dtype=float32)","tf.Tensor(0.020244472, shape=(), dtype=float32)","tf.Tensor(0.0207117, shape=(), dtype=float32)","tf.Tensor(-0.031881034, shape=(), dtype=float32)","tf.Tensor(0.053425193, shape=(), dtype=float32)","tf.Tensor(0.0026995572, shape=(), dtype=float32)","tf.Tensor(-0.08084271, shape=(), dtype=float32)",...,"tf.Tensor(0.10710527, shape=(), dtype=float32)","tf.Tensor(0.00011628864, shape=(), dtype=float32)","tf.Tensor(0.008712975, shape=(), dtype=float32)","tf.Tensor(0.07481751, shape=(), dtype=float32)","tf.Tensor(-0.04523135, shape=(), dtype=float32)","tf.Tensor(0.052447546, shape=(), dtype=float32)","tf.Tensor(0.039153565, shape=(), dtype=float32)","tf.Tensor(-0.001876064, shape=(), dtype=float32)","tf.Tensor(0.017901925, shape=(), dtype=float32)","tf.Tensor(-0.014710548, shape=(), dtype=float32)","tf.Tensor(0.04312735, shape=(), dtype=float32)","tf.Tensor(-0.030878648, shape=(), dtype=float32)","tf.Tensor(-0.05691044, shape=(), dtype=float32)","tf.Tensor(0.058894195, shape=(), dtype=float32)","tf.Tensor(-0.006595898, shape=(), dtype=float32)","tf.Tensor(-0.019973187, shape=(), dtype=float32)","tf.Tensor(-0.03776713, shape=(), dtype=float32)","tf.Tensor(0.021722123, shape=(), dtype=float32)","tf.Tensor(-0.032639634, shape=(), dtype=float32)","tf.Tensor(-0.0055215736, shape=(), dtype=float32)","tf.Tensor(-0.03981592, shape=(), dtype=float32)","tf.Tensor(-0.004657408, shape=(), dtype=float32)","tf.Tensor(0.024200808, shape=(), dtype=float32)","tf.Tensor(-0.026657946, shape=(), dtype=float32)","tf.Tensor(-0.0063680657, shape=(), dtype=float32)","tf.Tensor(-0.06910574, shape=(), dtype=float32)","tf.Tensor(0.029129563, shape=(), dtype=float32)","tf.Tensor(0.08762301, shape=(), dtype=float32)","tf.Tensor(-0.00076922943, shape=(), dtype=float32)","tf.Tensor(-0.05410821, shape=(), dtype=float32)"
2,"tf.Tensor(0.019088328, shape=(), dtype=float32)","tf.Tensor(0.009552198, shape=(), dtype=float32)","tf.Tensor(-0.047412932, shape=(), dtype=float32)","tf.Tensor(0.010493757, shape=(), dtype=float32)","tf.Tensor(-0.042908456, shape=(), dtype=float32)","tf.Tensor(-0.06331453, shape=(), dtype=float32)","tf.Tensor(-0.0031612448, shape=(), dtype=float32)","tf.Tensor(0.05149613, shape=(), dtype=float32)","tf.Tensor(0.027147714, shape=(), dtype=float32)","tf.Tensor(-0.010961107, shape=(), dtype=float32)","tf.Tensor(0.034988597, shape=(), dtype=float32)","tf.Tensor(0.05735229, shape=(), dtype=float32)","tf.Tensor(0.046823528, shape=(), dtype=float32)","tf.Tensor(0.057418335, shape=(), dtype=float32)","tf.Tensor(-0.003669362, shape=(), dtype=float32)","tf.Tensor(0.006604575, shape=(), dtype=float32)","tf.Tensor(-0.0013654017, shape=(), dtype=float32)","tf.Tensor(0.0038320303, shape=(), dtype=float32)","tf.Tensor(-0.09058422, shape=(), dtype=float32)","tf.Tensor(0.048744865, shape=(), dtype=float32)","tf.Tensor(0.028695427, shape=(), dtype=float32)","tf.Tensor(-0.021483552, shape=(), dtype=float32)","tf.Tensor(0.052940547, shape=(), dtype=float32)","tf.Tensor(-0.017772537, shape=(), dtype=float32)","tf.Tensor(-0.048549958, shape=(), dtype=float32)","tf.Tensor(-0.075713105, shape=(), dtype=float32)","tf.Tensor(-0.061066046, shape=(), dtype=float32)","tf.Tensor(0.06376975, shape=(), dtype=float32)","tf.Tensor(-0.030032236, shape=(), dtype=float32)","tf.Tensor(0.026240462, shape=(), dtype=float32)",...,"tf.Tensor(-0.021820951, shape=(), dtype=float32)","tf.Tensor(-0.05420484, shape=(), dtype=float32)","tf.Tensor(0.078226514, shape=(), dtype=float32)","tf.Tensor(-0.013889626, shape=(), dtype=float32)","tf.Tensor(-0.054223288, shape=(), dtype=float32)","tf.Tensor(-0.05325499, shape=(), dtype=float32)","tf.Tensor(0.00722771, shape=(), dtype=float32)","tf.Tensor(-0.02969856, shape=(), dtype=float32)","tf.Tensor(0.025481395, shape=(), dtype=float32)","tf.Tensor(0.012124312, shape=(), dtype=float32)","tf.Tensor(0.0003245068, shape=(), dtype=float32)","tf.Tensor(0.008369918, shape=(), dtype=float32)","tf.Tensor(-0.041196987, shape=(), dtype=float32)","tf.Tensor(0.0804329, shape=(), dtype=float32)","tf.Tensor(0.0070090145, shape=(), dtype=float32)","tf.Tensor(-0.05093788, shape=(), dtype=float32)","tf.Tensor(0.013866419, shape=(), dtype=float32)","tf.Tensor(0.03464066, shape=(), dtype=float32)","tf.Tensor(0.008104311, shape=(), dtype=float32)","tf.Tensor(0.021492796, shape=(), dtype=float32)","tf.Tensor(0.11410709, shape=(), dtype=float32)","tf.Tensor(-0.07061692, shape=(), dtype=float32)","tf.Tensor(0.06558145, shape=(), dtype=float32)","tf.Tensor(-0.039064348, shape=(), dtype=float32)","tf.Tensor(0.0058091856, shape=(), dtype=float32)","tf.Tensor(-0.07895439, shape=(), dtype=float32)","tf.Tensor(0.059527934, shape=(), dtype=float32)","tf.Tensor(0.03206479, shape=(), dtype=float32)","tf.Tensor(0.036616657, shape=(), dtype=float32)","tf.Tensor(0.0033105926, shape=(), dtype=float32)"


In [20]:
# pd.DataFrame(en_result_)

In [21]:
# tf.nn.l2_normalize(it_result)

In [22]:
# Compute similarity matrix. Higher score indicates greater similarity.
similarity_matrix_it = np.inner(en_result, it_result)
similarity_matrix_it

array([[0.9696458 , 0.27347845, 0.25536817],
       [0.3772605 , 0.8931674 , 0.2880668 ],
       [0.2461136 , 0.23792973, 0.93521506]], dtype=float32)

In [23]:
similarity_matrix_ja = np.inner(en_result, ja_result)
similarity_matrix_ja

array([[0.9799423 , 0.5341731 , 0.33749646],
       [0.41874057, 0.77861166, 0.3792975 ],
       [0.25998718, 0.29997382, 0.8583672 ]], dtype=float32)

# Cosine Similarities Example
This is annoying because we get a different result when using this `get_similarities` function compared to `np.inner()`... maybe it's because the raw inputs to `np.inner` weren't normalized, right?
Also, I'm not sure why we have to force clipping...

---

Using example form Colab

https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb#scrollTo=W-q2r7jyZGb7

In [24]:
def get_similarities(sent_1, sent_2):
    sts_encode1 = tf.nn.l2_normalize(model(sent_1), axis=1)
    sts_encode2 = tf.nn.l2_normalize(model(sent_2), axis=1)
    
    cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
    scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
    """Returns the similarity scores"""
    return scores

In [25]:
get_similarities(english_sentences[0], italian_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9213719], dtype=float32)>

In [26]:
get_similarities(english_sentences[0], japanese_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.93613935], dtype=float32)>

In [27]:
model("hello").numpy().shape

(1, 512)

# Test new function on subreddit descriptions

Time with my function (apply model to list).


In [28]:
col_manual_labels = 'manual_topic_and_rating'

df_subs = LoadSubreddits(
    bucket_name='i18n-subreddit-clustering',
    folder_path='subreddits/de/2021-06-16',
    columns=None,
    col_new_manual_topic=col_manual_labels,
).read_and_apply_transformations()
df_subs.shape

06:05:36 | INFO | "Reading raw data..."
06:05:37 | INFO | "  Applying transformations..."


(629, 35)

In [29]:
# df_subs.head()

### Large model
```
# CPU, 100 chars
CPU times: user 28.3 s, sys: 3.06 s, total: 31.4 s
Wall time: 3.11 s
(629, 512)


# GPU, 100 chars
CPU times: user 337 ms, sys: 34.5 ms, total: 371 ms
Wall time: 518 ms
(629, 512)

# GPU, 500 chars
CPU times: user 427 ms, sys: 116 ms, total: 544 ms
Wall time: 1.5 s
(629, 512)


# GPU, 1000 chars (!!) <- This takes as long as 100 chars in a CPU!
CPU times: user 535 ms, sys: 303 ms, total: 837 ms
Wall time: 3.11 s
(629, 512)
```

In [40]:
%%time
df_vec_subs_large = get_embeddings_as_df(
    model=model,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index='subreddit_default_',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=100,
)
df_vec_subs_large.shape

CPU times: user 263 ms, sys: 41.1 ms, total: 304 ms
Wall time: 477 ms


(629, 512)

In [49]:
%%time
df_vec_subs_large = get_embeddings_as_df(
    model=model,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index='subreddit_default_',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=500,
)
df_vec_subs_large.shape

CPU times: user 427 ms, sys: 116 ms, total: 544 ms
Wall time: 1.5 s


(629, 512)

In [41]:
%%time
df_vec_subs_large = get_embeddings_as_df(
    model=model,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index='subreddit_default_',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=1000,
)
df_vec_subs_large.shape

CPU times: user 520 ms, sys: 181 ms, total: 701 ms
Wall time: 3.05 s


(629, 512)

### Small/normal/reduced model
```
# CPU, 1000 chars
CPU times: user 20.5 s, sys: 3.47 s, total: 24 s
Wall time: 1.82 s
(629, 512)


# GPU, 1000 chars - NO INDEX
CPU times: user 359 ms, sys: 53 ms, total: 412 ms
Wall time: 336 ms
(629, 512)

# GPU, 1000 chars - ATTACH index cols (e.g., subreddit name)
CPU times: user 386 ms, sys: 21.4 ms, total: 408 ms
Wall time: 333 ms


# GPU, 2000 chars
CPU times: user 400 ms, sys: 73.4 ms, total: 473 ms
Wall time: 378 ms
(629, 512)

```

In [47]:
%%time

# no index *might* save some time, but not worth it b/c it could be hard to map embeddings to input
df_vec_subs = get_embeddings_as_df(
    model=model_normal,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index=None,
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=1000,
)
df_vec_subs.shape

CPU times: user 359 ms, sys: 57.1 ms, total: 416 ms
Wall time: 321 ms


(629, 512)

In [45]:
%%time

df_vec_subs = get_embeddings_as_df(
    model=model_normal,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index='subreddit_default_',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=1000,
)
df_vec_subs.shape

CPU times: user 386 ms, sys: 21.4 ms, total: 408 ms
Wall time: 333 ms


(629, 512)

In [48]:
%%time

df_vec_subs = get_embeddings_as_df(
    model=model_normal,
    df=df_subs,
    col_text='subreddit_name_title_and_clean_descriptions',
    cols_index='subreddit_default_',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=2000,
)
df_vec_subs.shape

CPU times: user 400 ms, sys: 73.4 ms, total: 473 ms
Wall time: 378 ms


(629, 512)

In [50]:
df_subs['subreddit_name_title_and_clean_descriptions'].str.len().describe()

count     629.000000
mean      509.092210
std       739.441529
min        17.000000
25%        93.000000
50%       191.000000
75%       626.000000
max      5142.000000
Name: subreddit_name_title_and_clean_descriptions, dtype: float64

In [59]:
for i in np.arange(0.8, 0.96, 0.025):
    print(f"{i:.3f}",
          f"{df_subs['subreddit_name_title_and_clean_descriptions'].str.len().quantile(i):.2f}")

0.800 769.40
0.825 874.00
0.850 1032.00
0.875 1129.50
0.900 1355.80
0.925 1611.30
0.950 1861.20


In [51]:
# df_vec_subs.head()

In [None]:
LEGACY

# Create a dataframe for broader testing

This might not be a great text because these sentences are really short (sometimes just a word...). But keep them as a benchmark because I also used them for CPU inference.

In [60]:
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]

In [61]:
df_test = pd.DataFrame({'text': (english_sentences + italian_sentences + japanese_sentences) * 600})
df_test.shape

(5400, 1)

In [62]:
df_test.head(10)

Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,cane
4,I cuccioli sono carini.
5,Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.
6,犬
7,子犬はいいです
8,私は犬と一緒にビーチを散歩するのが好きです
9,dog


# Get embeddings in different ways

After trying as a list, there's no reason to try other methods... it seems like `TF` is already optimized to run in parallel when reading a list and the bottleneck was using dataframes/pandas.

Might need to split up whole text DF into chunks to prevent memory errors, but applying to list should be good to go -- it takes less than 2 seconds on 5,400 sentences(!).

---
### CPU only
```
# get model results as list & convert to pd.DataFrame:
CPU times: user 1.21 s, sys: 285 ms, total: 1.49 s
Wall time: 1.95 s

CPU times: user 59.7 s, sys: 5.37 s, total: 1min 5s
Wall time: 5.06 s


# use pd.apply():
CPU times: user 1min 34s, sys: 12.4 s, total: 1min 47s
Wall time: 58.2 s
```

In [None]:
### GPU
```
# get model results as list & convert to pd.DataFrame:
CPU times: user 1.21 s, sys: 285 ms, total: 1.49 s
Wall time: 1.95 s

CPU times: user 59.7 s, sys: 5.37 s, total: 1min 5s
Wall time: 5.06 s


# use pd.apply():
CPU times: user 1min 34s, sys: 12.4 s, total: 1min 47s
Wall time: 58.2 s
```

## As a list
The example gets embeddings on a list, maybe the model can parallelize that somehow better than a df?

In [25]:
%%time

# model will return a list of tensors, if input is a list of text
emb_list = model(df_test['text'].to_list())

# So we can convert to np array using a list comprehension
# And convert back to a df with a known index
df_embeddings1 = pd.DataFrame(np.array([emb.numpy() for emb in emb_list]), 
                              index=df_test.index)

CPU times: user 1min, sys: 5.58 s, total: 1min 6s
Wall time: 5.37 s


In [30]:
df_embeddings1.shape

(5400, 512)

# As a batched list

In [31]:
tf_batch_inference_rows = 1000
len(df_test)

len(df_test) // tf_batch_inference_rows

5

In [32]:
%%time
# prototype the process
l_df_embeddings = list()

for i in tqdm(range(1 + len(df_test) // tf_batch_inference_rows)):
    slice_start = i * tf_batch_inference_rows
    slice_end = (i + 1) * tf_batch_inference_rows
    # print(slice_start, slice_end)
    l_df_embeddings.append(
        get_embeddings_as_df(
            model=model,
            df=df_test.iloc[slice_start:slice_end],
            col_text='text',
            lowercase_text=False,
        )
    )
    
df_embeddings2 = pd.concat(l_df_embeddings, axis=0, ignore_index=False)
del l_df_embeddings

  0%|          | 0/6 [00:00<?, ?it/s]

CPU times: user 1min 10s, sys: 5.27 s, total: 1min 16s
Wall time: 6.37 s


Added a new argument to function so batching is part of the function itself. Batching creates some overhead (by creating lists of dataframes).

However, it should limit the active memory needed by only loading a part of the text at any moment. Using `tqdm` should also help to make it easier to see progress.

In [33]:
%%time
# add new argument to do the batching as part of the function itself

df_embeddings3_ = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=None,
)

CPU times: user 1min 5s, sys: 6.43 s, total: 1min 12s
Wall time: 5.75 s


In [114]:
%%time

df_embeddings3_ = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=None,
    limit_first_n_chars=5,
)

CPU times: user 33.4 s, sys: 2.68 s, total: 36 s
Wall time: 3.03 s


In [65]:
%%time
# add new argument to do the batching as part of the function itself

df_embeddings3_ = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=5500,
)

CPU times: user 58.5 s, sys: 5.16 s, total: 1min 3s
Wall time: 4.93 s


In [67]:
%%time
# add new argument to do the batching as part of the function itself

df_embeddings3_ = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=2000,
)

16:20:59 | INFO | "Getting embeddings in batches of size: 2000"


  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 1min, sys: 5.38 s, total: 1min 5s
Wall time: 6.07 s


In [64]:
%%time
# add new argument to do the batching as part of the function itself

df_embeddings3_ = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=2500,
)

16:19:06 | INFO | "Getting embeddings in batches of size: 2500"


  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 60 s, sys: 4.57 s, total: 1min 4s
Wall time: 5.99 s


In [60]:
%%time
# add new argument to do the batching as part of the function itself

df_embeddings3 = get_embeddings_as_df(
    model=model,
    df=df_test,
    col_text='text',
    lowercase_text=False,
    batch_size=tf_batch_inference_rows
)

16:06:57 | INFO | "Getting embeddings in batches of size: 1000"


  0%|          | 0/6 [00:00<?, ?it/s]

CPU times: user 1min 2s, sys: 2.46 s, total: 1min 4s
Wall time: 7.2 s


### Check that outputs are equal/similar (within tolerance limits)

In [49]:
# n_check = len(df_embeddings1)
n_check = 5
np.allclose(
    df_embeddings1.head(n_check), 
    df_embeddings2.head(n_check), 
    rtol=1e-05,
)

True

In [48]:
# n_check = len(df_embeddings1)
np.allclose(
    df_embeddings1,
    df_embeddings2,
    rtol=1e-03,
)

True

In [70]:
# n_check = len(df_embeddings1)
np.allclose(
    df_embeddings2,
    df_embeddings3,
    rtol=1e-03,
)

True

In [71]:
# n_check = len(df_embeddings1)
np.allclose(
    df_embeddings3,
    df_embeddings3_,
    rtol=1e-03,
)

True

# Pandas .apply()

This appears to be serial and it's also a pain to get an array back out... not worth it.

In [22]:
%%time

s_output = df_test['text'].apply(model)

CPU times: user 8min 9s, sys: 56.7 s, total: 9min 6s
Wall time: 1min 42s


In [23]:
s_output.shape

(5400,)

In [117]:
# s_output.head()

## Compare/check output from pd.apply v. run model on list

The default tolerance of `rtol=1e-05` fails, so we need to move to `1e-04` or even `1e-02` for things to be equal.

---

Unclear why we might need to move it up to `1e-02` for all to be equal... am I sure things are in the same order? Might be better to use absolute tolerance instead of relative?

In [66]:
# n_check = len(df_embeddings1)
n_check = 5
np.allclose(
    df_embeddings1.head(n_check), 
    np.array(s_output.head(n_check).apply(lambda x: x.numpy()[0]).to_list()),
    rtol=1e-02,
)

True

In [96]:
# df_embeddings1.head(n_check)

In [97]:
# pd.DataFrame(s_output.head(n_check).apply(lambda x: x.numpy()[0]).to_list())

In [98]:
# df_embeddings1.tail(n_check)

In [99]:
# pd.DataFrame(s_output.tail(n_check).apply(lambda x: x.numpy()[0]).to_list())

In [73]:
n_check_small = 5
np.allclose(
    df_embeddings1.head(n_check_small),
    pd.DataFrame(s_output.head(n_check_small).apply(lambda x: x.numpy()[0]).to_list()),
    rtol=1e-04,
)

False

In [61]:
np.equal(
    df_embeddings1.head(1),
    pd.DataFrame(s_output.head(1).apply(lambda x: x.numpy()[0]).to_list())
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [80]:
np.equal(
    df_embeddings1.head(1),
    model(df_test.head(1)).numpy()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Equality fails even on the same input
Looks like the output is not deterministic

In [87]:
np.equal(
    model(df_test.head(1)).numpy()[:, :10],
    model(df_test.head(1)).numpy()[:, :10],
)

array([[False, False, False, False,  True,  True,  True,  True, False,
         True]])

In [90]:
np.allclose(
    model(df_test.head(10)).numpy(),
    model(df_test.head(10)).numpy(),
)

False

In [91]:
np.allclose(
    model(df_test.head(10)).numpy(),
    model(df_test.head(10)).numpy(),
    rtol=1e-04,
)

True

In [93]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-04,
)

CPU times: user 488 ms, sys: 77.4 ms, total: 566 ms
Wall time: 2.12 s


False

In [94]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-03,
)

CPU times: user 470 ms, sys: 117 ms, total: 587 ms
Wall time: 2.15 s


True

In [95]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-03,
)

CPU times: user 502 ms, sys: 69.2 ms, total: 571 ms
Wall time: 2.14 s


True

# Test checks on slicing strings

In [109]:
df_test['text'].str[:None]

0                                                                       dog
1                                                         Puppies are nice.
2                    I enjoy taking long walks along the beach with my dog.
3                                                                      cane
4                                                   I cuccioli sono carini.
                                       ...                                 
5395                                                I cuccioli sono carini.
5396    Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.
5397                                                                      犬
5398                                                                子犬はいいです
5399                                                  私は犬と一緒にビーチを散歩するのが好きです
Name: text, Length: 5400, dtype: object

In [110]:
df_test['text'].str[:4]

0        dog
1       Pupp
2       I en
3       cane
4       I cu
        ... 
5395    I cu
5396    Mi p
5397       犬
5398    子犬はい
5399    私は犬と
Name: text, Length: 5400, dtype: object

In [111]:
df_test['text'].str[:14]

0                  dog
1       Puppies are ni
2       I enjoy taking
3                 cane
4       I cuccioli son
             ...      
5395    I cuccioli son
5396    Mi piace fare 
5397                 犬
5398           子犬はいいです
5399    私は犬と一緒にビーチを散歩す
Name: text, Length: 5400, dtype: object

# Test limiting string length
USE will run out of memory with really long text.


So maybe let's cap things at around `7,000` characters because we'll start seeing diminishing returns and we don't want a batch held up by one long post/comment. 7k limit should keep inference for each post at less than 1 second.

Rough time checks:
Reps, Chars -> Large | "normal"
```
-    10,   770 characters -> ` 0.0224 seconds` | 0.0105 seconds
-    13,  1000 characters -> ` 0.0685 seconds` | 0.0121 s
-   100,  7.7k characters -> ` 0.860  seconds` | 0.0351 s
-   650,  ~15k characters -> ` 1.55   seconds` | 0.379  s
- 1,000,  ~76k characters -> `22.0    seconds` | 
- 2,000, ~153k characters -> `1 min 44 seconds` | 
- 2,400, ~200k characters -> `OOM` errors (out of memory)
```


In [80]:
len_check = [10, 13, 16, 20, 100, 200, 650, 1000, 1500, 2000, 3000, 5000]
for i in len_check:
    print(f"{i:6,.0f} -> {len(' '.join(english_sentences * i)):7,.0f}")

    10 ->     769
    13 ->   1,000
    16 ->   1,231
    20 ->   1,539
   100 ->   7,699
   200 ->  15,399
   650 ->  50,049
 1,000 ->  76,999
 1,500 -> 115,499
 2,000 -> 153,999
 3,000 -> 230,999
 5,000 -> 384,999


In [82]:
# %%time
# _ = model(' '.join(english_sentences))

In [81]:
# %%time
# _ = model_normal(' '.join(english_sentences))

In [78]:
%%time
_ = model(' '.join(english_sentences * 10))

CPU times: user 357 ms, sys: 34.3 ms, total: 392 ms
Wall time: 55.1 ms


In [63]:
%%time
_ = model_normal(' '.join(english_sentences * 10))

CPU times: user 66 ms, sys: 4.06 ms, total: 70.1 ms
Wall time: 10.5 ms


In [83]:
%%time
_ = model(' '.join(english_sentences * 13))

CPU times: user 512 ms, sys: 19 ms, total: 531 ms
Wall time: 68.5 ms


In [84]:
%%time
_ = model_normal(' '.join(english_sentences * 13))

CPU times: user 80 ms, sys: 2.83 ms, total: 82.9 ms
Wall time: 12.1 ms


In [66]:
%%time
_ = model(' '.join(english_sentences * 100))

CPU times: user 9.48 s, sys: 1.75 s, total: 11.2 s
Wall time: 995 ms


In [67]:
%%time
_ = model_normal(' '.join(english_sentences * 100))

CPU times: user 354 ms, sys: 0 ns, total: 354 ms
Wall time: 35.1 ms


In [69]:
%%time
_ = model(' '.join(english_sentences * 200))

CPU times: user 21.5 s, sys: 2.54 s, total: 24 s
Wall time: 1.73 s


In [68]:
%%time
_ = model_normal(' '.join(english_sentences * 200))

CPU times: user 700 ms, sys: 0 ns, total: 700 ms
Wall time: 66.8 ms


In [70]:
%%time
_ = model(' '.join(english_sentences * 650))

CPU times: user 3min 1s, sys: 21.8 s, total: 3min 22s
Wall time: 16.5 s


In [86]:
%%timeit
_ = model_normal(' '.join(english_sentences * 650))

379 ms ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
%%time
_ = model(' '.join(english_sentences * 1000))

CPU times: user 7min 3s, sys: 46.6 s, total: 7min 49s
Wall time: 34.7 s


In [72]:
%%time
_ = model_normal(' '.join(english_sentences * 1000))

CPU times: user 3.46 s, sys: 237 ms, total: 3.69 s
Wall time: 353 ms


In [75]:
%%time
_ = model(' '.join(english_sentences * 2000))

CPU times: user 26min 15s, sys: 2min 20s, total: 28min 35s
Wall time: 2min 4s


In [74]:
%%time
_ = model_normal(' '.join(english_sentences * 2000))

CPU times: user 13.6 s, sys: 2.24 s, total: 15.8 s
Wall time: 1.54 s


In [116]:
# %%time
# # This is expected to fail
# _ = model(' '.join(english_sentences * 5000))

In [76]:
%%time
# This is expected to fail for LARGE, but manages to complete for normal/default
_ = model_normal(' '.join(english_sentences * 5000))

CPU times: user 30.3 s, sys: 4.32 s, total: 34.6 s
Wall time: 3.78 s


Example error output:
```
---------------------------------------------------------------------------
ResourceExhaustedError                    Traceback (most recent call last)
<timed exec> in <module>

ResourceExhaustedError:  OOM when allocating tensor with shape[1,8,190002,190002] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node StatefulPartitionedCall/StatefulPartitionedCall/EncoderTransformer/Transformer/SparseTransformerEncode/Layer_0/SelfAttention/SparseMultiheadAttention/DotProductAttention/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_restored_function_body_51014]

Function call stack:
restored_function_body

```

In [None]:
LEGACY

### Try in parallel

In [17]:
import dask.dataframe as dd

In [18]:
ddf_text = dd.from_pandas(df_test, npartitions=6)

In [19]:
ddf_text.head()

Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,dog
4,Puppies are nice.


In [20]:
ddf_text[['text']].tail(10)

Unnamed: 0,text
5990,I enjoy taking long walks along the beach with my dog.
5991,dog
5992,Puppies are nice.
5993,I enjoy taking long walks along the beach with my dog.
5994,dog
5995,Puppies are nice.
5996,I enjoy taking long walks along the beach with my dog.
5997,dog
5998,Puppies are nice.
5999,I enjoy taking long walks along the beach with my dog.


## Define functions to get numpy array

In [21]:
def get_embeddings_as_numpy_df(
    df: pd.DataFrame,
    text_col: str = 'text',
    model_fxn: callable = model,
    apply_fxn: str = 'apply',
) -> pd.Series:
    """Apply fxn to get embeddings to a df
    Use it so that we can pair it with dask and apply it in parallel
    """
    if apply_fxn == 'apply':
        return df[text_col].apply(lambda x: model(x).numpy())
    else:
        # original example used map
        return df[text_col].map(model_fxn).apply(lambda x: x.numpy())
    

In [22]:
def get_embeddings_as_numpy(
    text_string: str,
#     tf_model: callable = model,
) -> np.ndarray:
    """Wrapper to convert Tensor type to numpy to make downstream transformations faster"""
    return model(text_string).numpy()

## Get embeddings with `dask.map_partition`

In [23]:
%%time

ddf_text['embeddings'] = (
    ddf_text[['text']]
    .map_partitions(model,
                    meta=pd.Series(name='embeddings', dtype=str)
                    )
    .compute()
)

CPU times: user 2.69 s, sys: 953 ms, total: 3.64 s
Wall time: 3.5 s


In [25]:
ddf_text[['embeddings']].compute().tail()

Unnamed: 0,embeddings
5995,
5996,
5997,
5998,
5999,


In [96]:
embeddings_size = 512
# tf_embeddings = np.empty([len(ddf_text), embeddings_size])
# tf_embeddings.shape

In [98]:
type(ddf_text)

dask.dataframe.core.DataFrame

In [102]:
%%time

ddf_text[['embeddings']].compute().head()

CPU times: user 39 ms, sys: 6.32 ms, total: 45.4 ms
Wall time: 34.9 ms


Unnamed: 0,embeddings
0,"((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt..."
1,"((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty..."
2,"((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d..."
3,"((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype..."
4,"((tf.Tensor(-0.0052521243, shape=(), dtype=float32), tf.Tensor(-0.03831199, shape=(), dtype=float32), tf.Tensor(-0.009220154, shape=(), dtype=float32), tf.Tensor(-0.0460963, shape=(), dtype=float32), tf.Tensor(-0.05765769, shape=(), dty..."


In [103]:
# %%time

# # This doesn't seem to work because dask expects the output to collapse to a single dimension (instead of 512)
# # tf_embeddings = 
# (
#     ddf_text[['text']]
#     .map_partitions(get_embeddings_as_numpy,
#                     meta=pd.Series(name='embeddings', dtype=str)
#                     )
#     .compute()
# )

In [107]:
%%time

ddf_text['embeddings_np1'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='map',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 40s, sys: 52.2 s, total: 3min 32s
Wall time: 46.8 s


In [106]:
%%time

ddf_text['embeddings_np2'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='apply',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 43s, sys: 52.5 s, total: 3min 35s
Wall time: 49.1 s


## What is the dtype? / how do we get embeddings out of this output?

By default it looks like the dtype is a `tensorflow.Tensor` object, which could be a pain to manipulate.

In [108]:
ddf_text['embeddings'].head()

0    ((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt...
1    ((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty...
2    ((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d...
3    ((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype...
4    ((tf.Tensor(-0.

In [109]:
ddf_text['embeddings'].tail()

5995    NaN
5996    NaN
5997    NaN
5998    NaN
5999    NaN
Name: embeddings, dtype: object

In [49]:
ddf_text['embeddings_np'].head()

0    [[-0.005252127, -0.03831199, -0.00922016, -0.046096295, -0.057657722, -0.04768436, -0.016511641, 0.0047285636, -0.013010718, -0.0713816, -0.030422542, 0.023165481, 0.017145848, 0.045331325, 0.015669955, -0.01395997, -0.05360133, -0.0118...
1    [[-0.02267581, -0.069071844, 0.015508692, -0.029039733, -0.08988534, -0.0010676696, -0.014085712, 0.0072551174, -0.019989632, 0.0669519, 0.017029503, 0.022945907, -0.006108564, 0.05118298, -0.043797858, -0.041236266, 0.029429343, 0.0488...
2    [[0.019088332, 0.009552218, -0.04741294, 0.0104937535, -0.042908445, -0.06331453, -0.0031612532, 0.051496144, 0.027147723, -0.010961102, 0.034988593, 0.057352275, 0.046823528, 0.057418354, -0.00366939, 0.0066046086, -0.0013654096, 0.003...
3    [[-0.0052521275, -0.03831199, -0.009220155, -0.046096295, -0.05765772, -0.04768436, -0.016511641, 0.0047285655, -0.013010721, -0.0713816, -0.030422544, 0.023165483, 0.017145844, 0.04533133, 0.015669957, -0.013959968, -0.05360133, -0.011...
4    [[-0.02267581, 

In [None]:
ddf_text['embeddings_np'].tail()

In [55]:
%%time
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

CPU times: user 12.6 ms, sys: 0 ns, total: 12.6 ms
Wall time: 10.2 ms


0    [[-0.0052521224, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511647, 0.004728538, -0.01301072, -0.07138159, -0.03042251, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959975, -0.05360135, -0.0118...
1    [[-0.022675814, -0.069071874, 0.015508659, -0.02903972, -0.08988533, -0.0010676887, -0.014085756, 0.007255134, -0.019989597, 0.06695186, 0.01702949, 0.02294587, -0.006108526, 0.05118297, -0.043797873, -0.04123629, 0.029429296, 0.0488887...
2    [[0.019088339, 0.009552221, -0.04741294, 0.010493785, -0.04290844, -0.06331449, -0.0031612637, 0.051496122, 0.027147735, -0.010961098, 0.034988593, 0.057352237, 0.046823513, 0.057418343, -0.0036693846, 0.006604615, -0.0013653795, 0.0038...
3    [[-0.0052521233, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511645, 0.0047285357, -0.013010718, -0.07138159, -0.030422507, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959976, -0.05360135, -0.0...
4    [[-0.022675822,

In [108]:
# ddf_text['embeddings'].compute()

In [110]:
# %%time
# ddf_text['embeddings'].apply(lambda x: x.numpy())

In [112]:
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

0    [[-0.005252141, -0.038312018, -0.009220189, -0.04609629, -0.05765771, -0.047684345, -0.01651165, 0.0047285794, -0.013010711, -0.071381606, -0.030422565, 0.02316548, 0.017145867, 0.045331288, 0.015669933, -0.013959985, -0.053601343, -0.0...
1    [[0.019088347, 0.009552213, -0.047412947, 0.0104937935, -0.04290844, -0.06331449, -0.0031612592, 0.051496133, 0.027147729, -0.010961093, 0.034988593, 0.05735223, 0.046823505, 0.057418343, -0.0036693835, 0.0066046203, -0.0013653715, 0.00...
2    [[-0.022675823, -0.069071874, 0.0155086545, -0.029039716, -0.08988534, -0.0010676958, -0.014085754, 0.0072551346, -0.019989599, 0.06695186, 0.01702949, 0.022945872, -0.0061085364, 0.051182974, -0.04379786, -0.04123629, 0.029429302, 0.04...
3    [[0.01908833, 0.009552218, -0.047412947, 0.010493787, -0.04290844, -0.063314475, -0.0031612662, 0.051496133, 0.027147733, -0.010961089, 0.0349886, 0.05735224, 0.046823505, 0.057418346, -0.003669383, 0.0066046147, -0.0013653776, 0.003832...
4    [[-0.0052521243

In [113]:
# ddf_text['embeddings'].apply(lambda x: x.numpy())