# Purpose

2021-06-28.
Test different parallel frameworks to speed up getting embeddings from USE-multilingual.

test:
- dask
- modin
- pandarallel

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os

import fse
from fse.models import uSIF
import gensim
from gensim.models.fasttext import FastText, load_facebook_vectors
import joblib

import math
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

# modin df
import modin
import modin.pandas as mpd

from subclu.data.fasttext_utils import (
    download_ft_pretrained_model,
    get_df_for_most_similar,
    get_project_subfolder,
)
from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


print_lib_versions([fse, gensim, joblib, modin, np, pd, plotly])

python		v 3.7.10
===
fse		v: 0.1.15
gensim		v: 3.8.3
joblib		v: 1.0.1
modin		v: 0.10.0
numpy		v: 1.18.5
pandas		v: 1.2.4
plotly		v: 4.14.3


In [3]:
# USE & TF-focused imports
import tensorflow
import tensorflow as tf
# import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
import tensorflow_text

print_lib_versions([tensorflow])

python		v 3.7.10
===
tensorflow	v: 2.3.2


In [4]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Check whether we have access to a GPU

In [21]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


# Load model from hub

In [5]:
%%time
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
#@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3' 


model = hub.load(module_url)

CPU times: user 5.73 s, sys: 963 ms, total: 6.69 s
Wall time: 6.73 s


## Example from tutorial

In [6]:
# Some texts of different lengths.
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]


In [7]:
%%time

# Compute embeddings.
en_result = model(english_sentences)
it_result = model(italian_sentences)
ja_result = model(japanese_sentences)


CPU times: user 2.63 s, sys: 224 ms, total: 2.85 s
Wall time: 2.46 s


In [8]:
pd.DataFrame(en_result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,"tf.Tensor(-0.0052521275, shape=(), dtype=float32)","tf.Tensor(-0.038312003, shape=(), dtype=float32)","tf.Tensor(-0.009220148, shape=(), dtype=float32)","tf.Tensor(-0.046096273, shape=(), dtype=float32)","tf.Tensor(-0.0576577, shape=(), dtype=float32)","tf.Tensor(-0.047684357, shape=(), dtype=float32)","tf.Tensor(-0.016511641, shape=(), dtype=float32)","tf.Tensor(0.004728587, shape=(), dtype=float32)","tf.Tensor(-0.013010709, shape=(), dtype=float32)","tf.Tensor(-0.071381606, shape=(), dtype=float32)","tf.Tensor(-0.030422544, shape=(), dtype=float32)","tf.Tensor(0.023165472, shape=(), dtype=float32)","tf.Tensor(0.017145852, shape=(), dtype=float32)","tf.Tensor(0.045331307, shape=(), dtype=float32)","tf.Tensor(0.015669959, shape=(), dtype=float32)","tf.Tensor(-0.013959983, shape=(), dtype=float32)","tf.Tensor(-0.053601325, shape=(), dtype=float32)","tf.Tensor(-0.011878517, shape=(), dtype=float32)","tf.Tensor(-0.11063095, shape=(), dtype=float32)","tf.Tensor(0.0048499736, shape=(), dtype=float32)","tf.Tensor(0.0067632, shape=(), dtype=float32)","tf.Tensor(0.03981162, shape=(), dtype=float32)","tf.Tensor(0.0660155, shape=(), dtype=float32)","tf.Tensor(-0.012721933, shape=(), dtype=float32)","tf.Tensor(0.037441507, shape=(), dtype=float32)","tf.Tensor(0.030250402, shape=(), dtype=float32)","tf.Tensor(-0.0152029265, shape=(), dtype=float32)","tf.Tensor(0.011007952, shape=(), dtype=float32)","tf.Tensor(0.016856804, shape=(), dtype=float32)","tf.Tensor(-0.019257635, shape=(), dtype=float32)",...,"tf.Tensor(0.08616641, shape=(), dtype=float32)","tf.Tensor(-0.024433237, shape=(), dtype=float32)","tf.Tensor(-0.00091490534, shape=(), dtype=float32)","tf.Tensor(0.038853906, shape=(), dtype=float32)","tf.Tensor(-0.031945076, shape=(), dtype=float32)","tf.Tensor(0.043825857, shape=(), dtype=float32)","tf.Tensor(0.058404725, shape=(), dtype=float32)","tf.Tensor(0.039348044, shape=(), dtype=float32)","tf.Tensor(0.08239184, shape=(), dtype=float32)","tf.Tensor(-0.028973904, shape=(), dtype=float32)","tf.Tensor(0.012636847, shape=(), dtype=float32)","tf.Tensor(-0.00868233, shape=(), dtype=float32)","tf.Tensor(0.00058252766, shape=(), dtype=float32)","tf.Tensor(0.07517928, shape=(), dtype=float32)","tf.Tensor(0.01735613, shape=(), dtype=float32)","tf.Tensor(0.012234436, shape=(), dtype=float32)","tf.Tensor(-0.019842023, shape=(), dtype=float32)","tf.Tensor(-0.07749191, shape=(), dtype=float32)","tf.Tensor(-0.02440608, shape=(), dtype=float32)","tf.Tensor(-0.032542024, shape=(), dtype=float32)","tf.Tensor(0.012990012, shape=(), dtype=float32)","tf.Tensor(-0.059579674, shape=(), dtype=float32)","tf.Tensor(0.026064057, shape=(), dtype=float32)","tf.Tensor(-0.05239151, shape=(), dtype=float32)","tf.Tensor(-0.059167933, shape=(), dtype=float32)","tf.Tensor(-0.025977189, shape=(), dtype=float32)","tf.Tensor(-0.031976104, shape=(), dtype=float32)","tf.Tensor(0.021790493, shape=(), dtype=float32)","tf.Tensor(0.06280179, shape=(), dtype=float32)","tf.Tensor(-0.016225362, shape=(), dtype=float32)"
1,"tf.Tensor(-0.022675825, shape=(), dtype=float32)","tf.Tensor(-0.06907185, shape=(), dtype=float32)","tf.Tensor(0.015508708, shape=(), dtype=float32)","tf.Tensor(-0.029039735, shape=(), dtype=float32)","tf.Tensor(-0.089885354, shape=(), dtype=float32)","tf.Tensor(-0.0010676576, shape=(), dtype=float32)","tf.Tensor(-0.014085728, shape=(), dtype=float32)","tf.Tensor(0.007255115, shape=(), dtype=float32)","tf.Tensor(-0.01998962, shape=(), dtype=float32)","tf.Tensor(0.06695188, shape=(), dtype=float32)","tf.Tensor(0.0170295, shape=(), dtype=float32)","tf.Tensor(0.022945879, shape=(), dtype=float32)","tf.Tensor(-0.006108559, shape=(), dtype=float32)","tf.Tensor(0.051182978, shape=(), dtype=float32)","tf.Tensor(-0.043797843, shape=(), dtype=float32)","tf.Tensor(-0.041236266, shape=(), dtype=float32)","tf.Tensor(0.029429324, shape=(), dtype=float32)","tf.Tensor(0.04888875, shape=(), dtype=float32)","tf.Tensor(-0.11650519, shape=(), dtype=float32)","tf.Tensor(-0.0024727122, shape=(), dtype=float32)","tf.Tensor(0.02891673, shape=(), dtype=float32)","tf.Tensor(9.120283e-05, shape=(), dtype=float32)","tf.Tensor(0.024404427, shape=(), dtype=float32)","tf.Tensor(0.012593308, shape=(), dtype=float32)","tf.Tensor(0.020244474, shape=(), dtype=float32)","tf.Tensor(0.020711716, shape=(), dtype=float32)","tf.Tensor(-0.031881053, shape=(), dtype=float32)","tf.Tensor(0.0534252, shape=(), dtype=float32)","tf.Tensor(0.002699542, shape=(), dtype=float32)","tf.Tensor(-0.080842726, shape=(), dtype=float32)",...,"tf.Tensor(0.10710527, shape=(), dtype=float32)","tf.Tensor(0.00011630716, shape=(), dtype=float32)","tf.Tensor(0.008712958, shape=(), dtype=float32)","tf.Tensor(0.07481754, shape=(), dtype=float32)","tf.Tensor(-0.04523135, shape=(), dtype=float32)","tf.Tensor(0.052447543, shape=(), dtype=float32)","tf.Tensor(0.03915357, shape=(), dtype=float32)","tf.Tensor(-0.0018760569, shape=(), dtype=float32)","tf.Tensor(0.017901925, shape=(), dtype=float32)","tf.Tensor(-0.014710553, shape=(), dtype=float32)","tf.Tensor(0.04312735, shape=(), dtype=float32)","tf.Tensor(-0.030878643, shape=(), dtype=float32)","tf.Tensor(-0.056910418, shape=(), dtype=float32)","tf.Tensor(0.058894217, shape=(), dtype=float32)","tf.Tensor(-0.0065958924, shape=(), dtype=float32)","tf.Tensor(-0.019973183, shape=(), dtype=float32)","tf.Tensor(-0.037767127, shape=(), dtype=float32)","tf.Tensor(0.021722129, shape=(), dtype=float32)","tf.Tensor(-0.032639638, shape=(), dtype=float32)","tf.Tensor(-0.005521565, shape=(), dtype=float32)","tf.Tensor(-0.039815918, shape=(), dtype=float32)","tf.Tensor(-0.0046574036, shape=(), dtype=float32)","tf.Tensor(0.024200814, shape=(), dtype=float32)","tf.Tensor(-0.026657939, shape=(), dtype=float32)","tf.Tensor(-0.006368075, shape=(), dtype=float32)","tf.Tensor(-0.069105715, shape=(), dtype=float32)","tf.Tensor(0.029129585, shape=(), dtype=float32)","tf.Tensor(0.08762303, shape=(), dtype=float32)","tf.Tensor(-0.0007692515, shape=(), dtype=float32)","tf.Tensor(-0.054108225, shape=(), dtype=float32)"
2,"tf.Tensor(0.019088324, shape=(), dtype=float32)","tf.Tensor(0.009552208, shape=(), dtype=float32)","tf.Tensor(-0.047412906, shape=(), dtype=float32)","tf.Tensor(0.010493745, shape=(), dtype=float32)","tf.Tensor(-0.042908456, shape=(), dtype=float32)","tf.Tensor(-0.06331452, shape=(), dtype=float32)","tf.Tensor(-0.003161247, shape=(), dtype=float32)","tf.Tensor(0.05149613, shape=(), dtype=float32)","tf.Tensor(0.027147705, shape=(), dtype=float32)","tf.Tensor(-0.010961123, shape=(), dtype=float32)","tf.Tensor(0.034988582, shape=(), dtype=float32)","tf.Tensor(0.057352267, shape=(), dtype=float32)","tf.Tensor(0.046823528, shape=(), dtype=float32)","tf.Tensor(0.05741834, shape=(), dtype=float32)","tf.Tensor(-0.0036693863, shape=(), dtype=float32)","tf.Tensor(0.006604586, shape=(), dtype=float32)","tf.Tensor(-0.0013654105, shape=(), dtype=float32)","tf.Tensor(0.0038320248, shape=(), dtype=float32)","tf.Tensor(-0.090584226, shape=(), dtype=float32)","tf.Tensor(0.048744842, shape=(), dtype=float32)","tf.Tensor(0.028695406, shape=(), dtype=float32)","tf.Tensor(-0.02148353, shape=(), dtype=float32)","tf.Tensor(0.052940547, shape=(), dtype=float32)","tf.Tensor(-0.017772527, shape=(), dtype=float32)","tf.Tensor(-0.04854997, shape=(), dtype=float32)","tf.Tensor(-0.07571308, shape=(), dtype=float32)","tf.Tensor(-0.06106605, shape=(), dtype=float32)","tf.Tensor(0.06376975, shape=(), dtype=float32)","tf.Tensor(-0.030032251, shape=(), dtype=float32)","tf.Tensor(0.02624046, shape=(), dtype=float32)",...,"tf.Tensor(-0.021820953, shape=(), dtype=float32)","tf.Tensor(-0.05420484, shape=(), dtype=float32)","tf.Tensor(0.07822652, shape=(), dtype=float32)","tf.Tensor(-0.01388963, shape=(), dtype=float32)","tf.Tensor(-0.054223277, shape=(), dtype=float32)","tf.Tensor(-0.05325497, shape=(), dtype=float32)","tf.Tensor(0.0072277193, shape=(), dtype=float32)","tf.Tensor(-0.02969859, shape=(), dtype=float32)","tf.Tensor(0.02548138, shape=(), dtype=float32)","tf.Tensor(0.012124341, shape=(), dtype=float32)","tf.Tensor(0.00032450375, shape=(), dtype=float32)","tf.Tensor(0.0083699105, shape=(), dtype=float32)","tf.Tensor(-0.041196983, shape=(), dtype=float32)","tf.Tensor(0.080432944, shape=(), dtype=float32)","tf.Tensor(0.0070090313, shape=(), dtype=float32)","tf.Tensor(-0.050937887, shape=(), dtype=float32)","tf.Tensor(0.013866394, shape=(), dtype=float32)","tf.Tensor(0.034640677, shape=(), dtype=float32)","tf.Tensor(0.008104301, shape=(), dtype=float32)","tf.Tensor(0.0214928, shape=(), dtype=float32)","tf.Tensor(0.11410709, shape=(), dtype=float32)","tf.Tensor(-0.07061693, shape=(), dtype=float32)","tf.Tensor(0.06558143, shape=(), dtype=float32)","tf.Tensor(-0.03906433, shape=(), dtype=float32)","tf.Tensor(0.005809185, shape=(), dtype=float32)","tf.Tensor(-0.078954406, shape=(), dtype=float32)","tf.Tensor(0.05952792, shape=(), dtype=float32)","tf.Tensor(0.032064784, shape=(), dtype=float32)","tf.Tensor(0.03661668, shape=(), dtype=float32)","tf.Tensor(0.0033105945, shape=(), dtype=float32)"


In [9]:
tf.nn.l2_normalize(it_result)

<tf.Tensor: shape=(3, 512), dtype=float32, numpy=
array([[ 0.00720229, -0.02716359, -0.00670168, ...,  0.00478419,
         0.03265055, -0.01112302],
       [-0.00031301, -0.03090753, -0.00294115, ...,  0.04377454,
        -0.01193869, -0.01058716],
       [ 0.013335  ,  0.01904837, -0.02599624, ...,  0.0150301 ,
         0.01788162, -0.00163474]], dtype=float32)>

In [10]:
# Compute similarity matrix. Higher score indicates greater similarity.
similarity_matrix_it = np.inner(en_result, it_result)
similarity_matrix_it

array([[0.96964586, 0.2734785 , 0.25536823],
       [0.37726057, 0.8931675 , 0.28806683],
       [0.24611366, 0.2379298 , 0.9352151 ]], dtype=float32)

In [11]:
similarity_matrix_ja = np.inner(en_result, ja_result)
similarity_matrix_ja

array([[0.97994244, 0.5341731 , 0.3374965 ],
       [0.4187405 , 0.7786118 , 0.37929755],
       [0.25998724, 0.29997385, 0.8583672 ]], dtype=float32)

# Cosine Similarities Example
This is annoying because we get a different result when using this `get_similarities` function compared to `np.inner()`... maybe it's because the raw inputs to `np.inner` weren't normalized, right?
Also, I'm not sure why we have to force clipping...

---

Using example form Colab

https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb#scrollTo=W-q2r7jyZGb7

In [12]:
def get_similarities(sent_1, sent_2):
    sts_encode1 = tf.nn.l2_normalize(model(sent_1), axis=1)
    sts_encode2 = tf.nn.l2_normalize(model(sent_2), axis=1)
    
    cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
    scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
    """Returns the similarity scores"""
    return scores

In [13]:
get_similarities(english_sentences[0], italian_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9213722], dtype=float32)>

In [14]:
get_similarities(english_sentences[0], japanese_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9361395], dtype=float32)>

In [15]:
model("hello").numpy().shape

(1, 512)

# Create a dataframe for large scale testing

In [16]:
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]

In [17]:
df_test = pd.DataFrame({'text': (english_sentences + italian_sentences + japanese_sentences) * 600})
df_test.shape

(5400, 1)

In [18]:
df_test.head(10)

Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,cane
4,I cuccioli sono carini.
5,Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.
6,犬
7,子犬はいいです
8,私は犬と一緒にビーチを散歩するのが好きです
9,dog


# Get embeddings in different ways

After trying as a list, there's no reason to try other methods... it seems like `TF` is already optimized to run in parallel when reading a list and the bottleneck was using dataframes/pandas.

Might need to split up whole text DF into chunks to prevent memory errors, but applying to list should be good to go -- it takes less than 2 seconds on 5,400 sentences(!).

---
```
# get model results as list & convert to pd.DataFrame:
CPU times: user 1.21 s, sys: 285 ms, total: 1.49 s
Wall time: 1.95 s

# use pd.apply():
CPU times: user 1min 34s, sys: 12.4 s, total: 1min 47s
Wall time: 58.2 s
```

## As a list
The example gets embeddings on a list, maybe the model can parallelize that somehow better than a df?

In [19]:
%%time

# model will return a list of tensors, if input is a list of text
emb_list = model(df_test['text'].to_list())

# So we can convert to np array using a list comprehension
# And convert back to a df with a known index
df_embeddings1 = pd.DataFrame(np.array([emb.numpy() for emb in emb_list]), 
                              index=df_test.index)

CPU times: user 55.4 s, sys: 5.22 s, total: 1min
Wall time: 4.46 s


In [20]:
df_embeddings1.shape

(5400, 512)

# Pandas .apply()

This appears to be serial and it's also a pain to get an array back out... not worth it.

In [37]:
%%time

s_output = df_test['text'].apply(model)

CPU times: user 1min 34s, sys: 12.4 s, total: 1min 47s
Wall time: 58.2 s


In [38]:
s_output.shape

(5400,)

In [39]:
s_output.head()

0    ((tf.Tensor(-0.005252126, shape=(), dtype=float32), tf.Tensor(-0.038311988, shape=(), dtype=float32), tf.Tensor(-0.009220155, shape=(), dtype=float32), tf.Tensor(-0.046096295, shape=(), dtype=float32), tf.Tensor(-0.05765772, shape=(), d...
1    ((tf.Tensor(-0.022675809, shape=(), dtype=float32), tf.Tensor(-0.06907185, shape=(), dtype=float32), tf.Tensor(0.015508697, shape=(), dtype=float32), tf.Tensor(-0.029039733, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), dty...
2    ((tf.Tensor(0.019088345, shape=(), dtype=float32), tf.Tensor(0.009552222, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493755, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtyp...
3    ((tf.Tensor(0.012474758, shape=(), dtype=float32), tf.Tensor(-0.047048714, shape=(), dtype=float32), tf.Tensor(-0.011607644, shape=(), dtype=float32), tf.Tensor(-0.06017637, shape=(), dtype=float32), tf.Tensor(-0.051905125, shape=(), dt...
4    ((tf.Tensor(-0.

## Compare/check output from pd.apply v. run model on list

The default tolerance of `rtol=1e-05` fails, so we need to move to `1e-04` or even `1e-02` for things to be equal.

---

Unclear why we might need to move it up to `1e-02` for all to be equal... am I sure things are in the same order? Might be better to use absolute tolerance instead of relative?

In [66]:
# n_check = len(df_embeddings1)
n_check = 5
np.allclose(
    df_embeddings1.head(n_check), 
    np.array(s_output.head(n_check).apply(lambda x: x.numpy()[0]).to_list()),
    rtol=1e-02,
)

True

In [96]:
# df_embeddings1.head(n_check)

In [97]:
# pd.DataFrame(s_output.head(n_check).apply(lambda x: x.numpy()[0]).to_list())

In [98]:
# df_embeddings1.tail(n_check)

In [99]:
# pd.DataFrame(s_output.tail(n_check).apply(lambda x: x.numpy()[0]).to_list())

In [73]:
n_check_small = 5
np.allclose(
    df_embeddings1.head(n_check_small),
    pd.DataFrame(s_output.head(n_check_small).apply(lambda x: x.numpy()[0]).to_list()),
    rtol=1e-04,
)

False

In [61]:
np.equal(
    df_embeddings1.head(1),
    pd.DataFrame(s_output.head(1).apply(lambda x: x.numpy()[0]).to_list())
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [80]:
np.equal(
    df_embeddings1.head(1),
    model(df_test.head(1)).numpy()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Equality fails even on the same input
Looks like the output is not deterministic

In [87]:
np.equal(
    model(df_test.head(1)).numpy()[:, :10],
    model(df_test.head(1)).numpy()[:, :10],
)

array([[False, False, False, False,  True,  True,  True,  True, False,
         True]])

In [90]:
np.allclose(
    model(df_test.head(10)).numpy(),
    model(df_test.head(10)).numpy(),
)

False

In [91]:
np.allclose(
    model(df_test.head(10)).numpy(),
    model(df_test.head(10)).numpy(),
    rtol=1e-04,
)

True

In [93]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-04,
)

CPU times: user 488 ms, sys: 77.4 ms, total: 566 ms
Wall time: 2.12 s


False

In [94]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-03,
)

CPU times: user 470 ms, sys: 117 ms, total: 587 ms
Wall time: 2.15 s


True

In [95]:
%%time

np.allclose(
    model(df_test).numpy(),
    model(df_test).numpy(),
    rtol=1e-03,
)

CPU times: user 502 ms, sys: 69.2 ms, total: 571 ms
Wall time: 2.14 s


True

In [None]:
LEGACY

### Try in parallel

In [17]:
import dask.dataframe as dd

In [18]:
ddf_text = dd.from_pandas(df_test, npartitions=6)

In [19]:
ddf_text.head()

Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,dog
4,Puppies are nice.


In [20]:
ddf_text[['text']].tail(10)

Unnamed: 0,text
5990,I enjoy taking long walks along the beach with my dog.
5991,dog
5992,Puppies are nice.
5993,I enjoy taking long walks along the beach with my dog.
5994,dog
5995,Puppies are nice.
5996,I enjoy taking long walks along the beach with my dog.
5997,dog
5998,Puppies are nice.
5999,I enjoy taking long walks along the beach with my dog.


## Define functions to get numpy array

In [21]:
def get_embeddings_as_numpy_df(
    df: pd.DataFrame,
    text_col: str = 'text',
    model_fxn: callable = model,
    apply_fxn: str = 'apply',
) -> pd.Series:
    """Apply fxn to get embeddings to a df
    Use it so that we can pair it with dask and apply it in parallel
    """
    if apply_fxn == 'apply':
        return df[text_col].apply(lambda x: model(x).numpy())
    else:
        # original example used map
        return df[text_col].map(model_fxn).apply(lambda x: x.numpy())
    

In [22]:
def get_embeddings_as_numpy(
    text_string: str,
#     tf_model: callable = model,
) -> np.ndarray:
    """Wrapper to convert Tensor type to numpy to make downstream transformations faster"""
    return model(text_string).numpy()

## Get embeddings with `dask.map_partition`

In [23]:
%%time

ddf_text['embeddings'] = (
    ddf_text[['text']]
    .map_partitions(model,
                    meta=pd.Series(name='embeddings', dtype=str)
                    )
    .compute()
)

CPU times: user 2.69 s, sys: 953 ms, total: 3.64 s
Wall time: 3.5 s


In [25]:
ddf_text[['embeddings']].compute().tail()

Unnamed: 0,embeddings
5995,
5996,
5997,
5998,
5999,


In [96]:
embeddings_size = 512
# tf_embeddings = np.empty([len(ddf_text), embeddings_size])
# tf_embeddings.shape

In [98]:
type(ddf_text)

dask.dataframe.core.DataFrame

In [102]:
%%time

ddf_text[['embeddings']].compute().head()

CPU times: user 39 ms, sys: 6.32 ms, total: 45.4 ms
Wall time: 34.9 ms


Unnamed: 0,embeddings
0,"((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt..."
1,"((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty..."
2,"((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d..."
3,"((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype..."
4,"((tf.Tensor(-0.0052521243, shape=(), dtype=float32), tf.Tensor(-0.03831199, shape=(), dtype=float32), tf.Tensor(-0.009220154, shape=(), dtype=float32), tf.Tensor(-0.0460963, shape=(), dtype=float32), tf.Tensor(-0.05765769, shape=(), dty..."


In [103]:
# %%time

# # This doesn't seem to work because dask expects the output to collapse to a single dimension (instead of 512)
# # tf_embeddings = 
# (
#     ddf_text[['text']]
#     .map_partitions(get_embeddings_as_numpy,
#                     meta=pd.Series(name='embeddings', dtype=str)
#                     )
#     .compute()
# )

In [107]:
%%time

ddf_text['embeddings_np1'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='map',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 40s, sys: 52.2 s, total: 3min 32s
Wall time: 46.8 s


In [106]:
%%time

ddf_text['embeddings_np2'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='apply',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 43s, sys: 52.5 s, total: 3min 35s
Wall time: 49.1 s


## What is the dtype? / how do we get embeddings out of this output?

By default it looks like the dtype is a `tensorflow.Tensor` object, which could be a pain to manipulate.

In [108]:
ddf_text['embeddings'].head()

0    ((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt...
1    ((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty...
2    ((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d...
3    ((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype...
4    ((tf.Tensor(-0.

In [109]:
ddf_text['embeddings'].tail()

5995    NaN
5996    NaN
5997    NaN
5998    NaN
5999    NaN
Name: embeddings, dtype: object

In [49]:
ddf_text['embeddings_np'].head()

0    [[-0.005252127, -0.03831199, -0.00922016, -0.046096295, -0.057657722, -0.04768436, -0.016511641, 0.0047285636, -0.013010718, -0.0713816, -0.030422542, 0.023165481, 0.017145848, 0.045331325, 0.015669955, -0.01395997, -0.05360133, -0.0118...
1    [[-0.02267581, -0.069071844, 0.015508692, -0.029039733, -0.08988534, -0.0010676696, -0.014085712, 0.0072551174, -0.019989632, 0.0669519, 0.017029503, 0.022945907, -0.006108564, 0.05118298, -0.043797858, -0.041236266, 0.029429343, 0.0488...
2    [[0.019088332, 0.009552218, -0.04741294, 0.0104937535, -0.042908445, -0.06331453, -0.0031612532, 0.051496144, 0.027147723, -0.010961102, 0.034988593, 0.057352275, 0.046823528, 0.057418354, -0.00366939, 0.0066046086, -0.0013654096, 0.003...
3    [[-0.0052521275, -0.03831199, -0.009220155, -0.046096295, -0.05765772, -0.04768436, -0.016511641, 0.0047285655, -0.013010721, -0.0713816, -0.030422544, 0.023165483, 0.017145844, 0.04533133, 0.015669957, -0.013959968, -0.05360133, -0.011...
4    [[-0.02267581, 

In [None]:
ddf_text['embeddings_np'].tail()

In [55]:
%%time
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

CPU times: user 12.6 ms, sys: 0 ns, total: 12.6 ms
Wall time: 10.2 ms


0    [[-0.0052521224, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511647, 0.004728538, -0.01301072, -0.07138159, -0.03042251, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959975, -0.05360135, -0.0118...
1    [[-0.022675814, -0.069071874, 0.015508659, -0.02903972, -0.08988533, -0.0010676887, -0.014085756, 0.007255134, -0.019989597, 0.06695186, 0.01702949, 0.02294587, -0.006108526, 0.05118297, -0.043797873, -0.04123629, 0.029429296, 0.0488887...
2    [[0.019088339, 0.009552221, -0.04741294, 0.010493785, -0.04290844, -0.06331449, -0.0031612637, 0.051496122, 0.027147735, -0.010961098, 0.034988593, 0.057352237, 0.046823513, 0.057418343, -0.0036693846, 0.006604615, -0.0013653795, 0.0038...
3    [[-0.0052521233, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511645, 0.0047285357, -0.013010718, -0.07138159, -0.030422507, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959976, -0.05360135, -0.0...
4    [[-0.022675822,

In [111]:
ddf_text['embeddings'].compute()

0       ((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt...
1       ((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty...
2       ((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d...
3       ((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype...
4       

In [110]:
# %%time
# ddf_text['embeddings'].apply(lambda x: x.numpy())

In [112]:
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

0    [[-0.005252141, -0.038312018, -0.009220189, -0.04609629, -0.05765771, -0.047684345, -0.01651165, 0.0047285794, -0.013010711, -0.071381606, -0.030422565, 0.02316548, 0.017145867, 0.045331288, 0.015669933, -0.013959985, -0.053601343, -0.0...
1    [[0.019088347, 0.009552213, -0.047412947, 0.0104937935, -0.04290844, -0.06331449, -0.0031612592, 0.051496133, 0.027147729, -0.010961093, 0.034988593, 0.05735223, 0.046823505, 0.057418343, -0.0036693835, 0.0066046203, -0.0013653715, 0.00...
2    [[-0.022675823, -0.069071874, 0.0155086545, -0.029039716, -0.08988534, -0.0010676958, -0.014085754, 0.0072551346, -0.019989599, 0.06695186, 0.01702949, 0.022945872, -0.0061085364, 0.051182974, -0.04379786, -0.04123629, 0.029429302, 0.04...
3    [[0.01908833, 0.009552218, -0.047412947, 0.010493787, -0.04290844, -0.063314475, -0.0031612662, 0.051496133, 0.027147733, -0.010961089, 0.0349886, 0.05735224, 0.046823505, 0.057418346, -0.003669383, 0.0066046147, -0.0013653776, 0.003832...
4    [[-0.0052521243

In [113]:
# ddf_text['embeddings'].apply(lambda x: x.numpy())