# Purpose

2021-06-16.
Test loading and setting up USE-multilingual before kicking off jobs to get embeddings for all posts/notes/comments.

If time allows, might also compare with loading fastText embeddings and converting the German-language POSTS.


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os

import fse
from fse.models import uSIF
import gensim
from gensim.models.fasttext import FastText, load_facebook_vectors
import joblib

import math
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

from subclu.data.fasttext_utils import (
    download_ft_pretrained_model,
    get_df_for_most_similar,
    get_project_subfolder,
)
from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


print_lib_versions([fse, gensim, joblib, np, pd, plotly])

python		v 3.7.10
===
fse		v: 0.1.15
gensim		v: 3.8.3
joblib		v: 1.0.1
numpy		v: 1.18.5
pandas		v: 1.2.4
plotly		v: 4.14.3


In [3]:
# USE & TF-focused imports
import tensorflow
import tensorflow as tf
# import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
import tensorflow_text

print_lib_versions([tensorflow])

python		v 3.7.10
===
tensorflow	v: 2.3.2


In [4]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Load model from hub

In [5]:
%%time
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
#@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3' 


model = hub.load(module_url)

def embed_text(input):
  return model(input)

CPU times: user 10 s, sys: 2.9 s, total: 12.9 s
Wall time: 17.7 s


## Example from tutorial

In [6]:
# Some texts of different lengths.
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]


In [7]:
%%time

# Compute embeddings.
en_result = model(english_sentences)
it_result = model(italian_sentences)
ja_result = model(japanese_sentences)


CPU times: user 3.24 s, sys: 177 ms, total: 3.42 s
Wall time: 3.63 s


In [8]:
pd.DataFrame(en_result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,...,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,"tf.Tensor(-0.0052521327, shape=(), dtype=float32)","tf.Tensor(-0.038312003, shape=(), dtype=float32)","tf.Tensor(-0.00922017, shape=(), dtype=float32)","tf.Tensor(-0.04609629, shape=(), dtype=float32)","tf.Tensor(-0.057657726, shape=(), dtype=float32)","tf.Tensor(-0.04768435, shape=(), dtype=float32)","tf.Tensor(-0.016511654, shape=(), dtype=float32)","tf.Tensor(0.0047285818, shape=(), dtype=float32)","tf.Tensor(-0.013010713, shape=(), dtype=float32)","tf.Tensor(-0.071381606, shape=(), dtype=float32)","tf.Tensor(-0.030422544, shape=(), dtype=float32)","tf.Tensor(0.023165468, shape=(), dtype=float32)","tf.Tensor(0.017145867, shape=(), dtype=float32)","tf.Tensor(0.04533129, shape=(), dtype=float32)","tf.Tensor(0.015669951, shape=(), dtype=float32)","tf.Tensor(-0.013959979, shape=(), dtype=float32)","tf.Tensor(-0.053601343, shape=(), dtype=float32)","tf.Tensor(-0.0118785035, shape=(), dtype=float32)","tf.Tensor(-0.110630944, shape=(), dtype=float32)","tf.Tensor(0.004849981, shape=(), dtype=float32)","tf.Tensor(0.006763194, shape=(), dtype=float32)","tf.Tensor(0.03981161, shape=(), dtype=float32)","tf.Tensor(0.066015504, shape=(), dtype=float32)","tf.Tensor(-0.012721933, shape=(), dtype=float32)","tf.Tensor(0.037441503, shape=(), dtype=float32)","tf.Tensor(0.030250406, shape=(), dtype=float32)","tf.Tensor(-0.015202955, shape=(), dtype=float32)","tf.Tensor(0.011007942, shape=(), dtype=float32)","tf.Tensor(0.016856827, shape=(), dtype=float32)","tf.Tensor(-0.019257631, shape=(), dtype=float32)",...,"tf.Tensor(0.08616642, shape=(), dtype=float32)","tf.Tensor(-0.024433251, shape=(), dtype=float32)","tf.Tensor(-0.00091491063, shape=(), dtype=float32)","tf.Tensor(0.038853936, shape=(), dtype=float32)","tf.Tensor(-0.0319451, shape=(), dtype=float32)","tf.Tensor(0.043825824, shape=(), dtype=float32)","tf.Tensor(0.058404714, shape=(), dtype=float32)","tf.Tensor(0.039348062, shape=(), dtype=float32)","tf.Tensor(0.082391866, shape=(), dtype=float32)","tf.Tensor(-0.028973889, shape=(), dtype=float32)","tf.Tensor(0.012636836, shape=(), dtype=float32)","tf.Tensor(-0.008682325, shape=(), dtype=float32)","tf.Tensor(0.0005825156, shape=(), dtype=float32)","tf.Tensor(0.0751793, shape=(), dtype=float32)","tf.Tensor(0.017356148, shape=(), dtype=float32)","tf.Tensor(0.012234448, shape=(), dtype=float32)","tf.Tensor(-0.019842025, shape=(), dtype=float32)","tf.Tensor(-0.07749193, shape=(), dtype=float32)","tf.Tensor(-0.024406092, shape=(), dtype=float32)","tf.Tensor(-0.032542046, shape=(), dtype=float32)","tf.Tensor(0.012990007, shape=(), dtype=float32)","tf.Tensor(-0.05957967, shape=(), dtype=float32)","tf.Tensor(0.026064066, shape=(), dtype=float32)","tf.Tensor(-0.05239152, shape=(), dtype=float32)","tf.Tensor(-0.05916793, shape=(), dtype=float32)","tf.Tensor(-0.025977192, shape=(), dtype=float32)","tf.Tensor(-0.031976104, shape=(), dtype=float32)","tf.Tensor(0.021790508, shape=(), dtype=float32)","tf.Tensor(0.06280179, shape=(), dtype=float32)","tf.Tensor(-0.016225368, shape=(), dtype=float32)"
1,"tf.Tensor(-0.022675816, shape=(), dtype=float32)","tf.Tensor(-0.069071844, shape=(), dtype=float32)","tf.Tensor(0.015508695, shape=(), dtype=float32)","tf.Tensor(-0.029039735, shape=(), dtype=float32)","tf.Tensor(-0.08988534, shape=(), dtype=float32)","tf.Tensor(-0.0010676612, shape=(), dtype=float32)","tf.Tensor(-0.014085721, shape=(), dtype=float32)","tf.Tensor(0.0072551174, shape=(), dtype=float32)","tf.Tensor(-0.01998963, shape=(), dtype=float32)","tf.Tensor(0.06695189, shape=(), dtype=float32)","tf.Tensor(0.017029494, shape=(), dtype=float32)","tf.Tensor(0.02294588, shape=(), dtype=float32)","tf.Tensor(-0.0061085685, shape=(), dtype=float32)","tf.Tensor(0.051182978, shape=(), dtype=float32)","tf.Tensor(-0.043797847, shape=(), dtype=float32)","tf.Tensor(-0.041236263, shape=(), dtype=float32)","tf.Tensor(0.02942931, shape=(), dtype=float32)","tf.Tensor(0.048888758, shape=(), dtype=float32)","tf.Tensor(-0.11650517, shape=(), dtype=float32)","tf.Tensor(-0.0024727208, shape=(), dtype=float32)","tf.Tensor(0.028916731, shape=(), dtype=float32)","tf.Tensor(9.121719e-05, shape=(), dtype=float32)","tf.Tensor(0.024404425, shape=(), dtype=float32)","tf.Tensor(0.012593304, shape=(), dtype=float32)","tf.Tensor(0.020244474, shape=(), dtype=float32)","tf.Tensor(0.0207117, shape=(), dtype=float32)","tf.Tensor(-0.031881034, shape=(), dtype=float32)","tf.Tensor(0.053425197, shape=(), dtype=float32)","tf.Tensor(0.0026995584, shape=(), dtype=float32)","tf.Tensor(-0.08084271, shape=(), dtype=float32)",...,"tf.Tensor(0.10710527, shape=(), dtype=float32)","tf.Tensor(0.00011629223, shape=(), dtype=float32)","tf.Tensor(0.008712977, shape=(), dtype=float32)","tf.Tensor(0.07481751, shape=(), dtype=float32)","tf.Tensor(-0.04523135, shape=(), dtype=float32)","tf.Tensor(0.052447546, shape=(), dtype=float32)","tf.Tensor(0.039153565, shape=(), dtype=float32)","tf.Tensor(-0.0018760605, shape=(), dtype=float32)","tf.Tensor(0.017901925, shape=(), dtype=float32)","tf.Tensor(-0.01471055, shape=(), dtype=float32)","tf.Tensor(0.04312735, shape=(), dtype=float32)","tf.Tensor(-0.030878652, shape=(), dtype=float32)","tf.Tensor(-0.056910444, shape=(), dtype=float32)","tf.Tensor(0.058894195, shape=(), dtype=float32)","tf.Tensor(-0.006595898, shape=(), dtype=float32)","tf.Tensor(-0.019973187, shape=(), dtype=float32)","tf.Tensor(-0.03776713, shape=(), dtype=float32)","tf.Tensor(0.021722123, shape=(), dtype=float32)","tf.Tensor(-0.032639634, shape=(), dtype=float32)","tf.Tensor(-0.0055215745, shape=(), dtype=float32)","tf.Tensor(-0.039815918, shape=(), dtype=float32)","tf.Tensor(-0.0046574036, shape=(), dtype=float32)","tf.Tensor(0.02420081, shape=(), dtype=float32)","tf.Tensor(-0.02665795, shape=(), dtype=float32)","tf.Tensor(-0.006368065, shape=(), dtype=float32)","tf.Tensor(-0.06910574, shape=(), dtype=float32)","tf.Tensor(0.029129563, shape=(), dtype=float32)","tf.Tensor(0.08762301, shape=(), dtype=float32)","tf.Tensor(-0.00076923, shape=(), dtype=float32)","tf.Tensor(-0.05410821, shape=(), dtype=float32)"
2,"tf.Tensor(0.019088328, shape=(), dtype=float32)","tf.Tensor(0.009552198, shape=(), dtype=float32)","tf.Tensor(-0.04741294, shape=(), dtype=float32)","tf.Tensor(0.010493756, shape=(), dtype=float32)","tf.Tensor(-0.04290846, shape=(), dtype=float32)","tf.Tensor(-0.063314535, shape=(), dtype=float32)","tf.Tensor(-0.0031612464, shape=(), dtype=float32)","tf.Tensor(0.051496133, shape=(), dtype=float32)","tf.Tensor(0.02714772, shape=(), dtype=float32)","tf.Tensor(-0.010961108, shape=(), dtype=float32)","tf.Tensor(0.0349886, shape=(), dtype=float32)","tf.Tensor(0.057352286, shape=(), dtype=float32)","tf.Tensor(0.046823524, shape=(), dtype=float32)","tf.Tensor(0.057418343, shape=(), dtype=float32)","tf.Tensor(-0.0036693674, shape=(), dtype=float32)","tf.Tensor(0.006604579, shape=(), dtype=float32)","tf.Tensor(-0.0013654018, shape=(), dtype=float32)","tf.Tensor(0.0038320322, shape=(), dtype=float32)","tf.Tensor(-0.090584226, shape=(), dtype=float32)","tf.Tensor(0.04874486, shape=(), dtype=float32)","tf.Tensor(0.02869543, shape=(), dtype=float32)","tf.Tensor(-0.02148356, shape=(), dtype=float32)","tf.Tensor(0.052940544, shape=(), dtype=float32)","tf.Tensor(-0.01777254, shape=(), dtype=float32)","tf.Tensor(-0.048549965, shape=(), dtype=float32)","tf.Tensor(-0.07571311, shape=(), dtype=float32)","tf.Tensor(-0.061066054, shape=(), dtype=float32)","tf.Tensor(0.063769765, shape=(), dtype=float32)","tf.Tensor(-0.030032238, shape=(), dtype=float32)","tf.Tensor(0.026240462, shape=(), dtype=float32)",...,"tf.Tensor(-0.021820953, shape=(), dtype=float32)","tf.Tensor(-0.054204836, shape=(), dtype=float32)","tf.Tensor(0.07822651, shape=(), dtype=float32)","tf.Tensor(-0.013889629, shape=(), dtype=float32)","tf.Tensor(-0.054223295, shape=(), dtype=float32)","tf.Tensor(-0.05325499, shape=(), dtype=float32)","tf.Tensor(0.007227717, shape=(), dtype=float32)","tf.Tensor(-0.029698567, shape=(), dtype=float32)","tf.Tensor(0.025481405, shape=(), dtype=float32)","tf.Tensor(0.0121243205, shape=(), dtype=float32)","tf.Tensor(0.00032450585, shape=(), dtype=float32)","tf.Tensor(0.00836992, shape=(), dtype=float32)","tf.Tensor(-0.041196987, shape=(), dtype=float32)","tf.Tensor(0.08043291, shape=(), dtype=float32)","tf.Tensor(0.0070090187, shape=(), dtype=float32)","tf.Tensor(-0.050937872, shape=(), dtype=float32)","tf.Tensor(0.013866421, shape=(), dtype=float32)","tf.Tensor(0.03464066, shape=(), dtype=float32)","tf.Tensor(0.008104312, shape=(), dtype=float32)","tf.Tensor(0.021492792, shape=(), dtype=float32)","tf.Tensor(0.1141071, shape=(), dtype=float32)","tf.Tensor(-0.07061693, shape=(), dtype=float32)","tf.Tensor(0.065581456, shape=(), dtype=float32)","tf.Tensor(-0.039064355, shape=(), dtype=float32)","tf.Tensor(0.005809179, shape=(), dtype=float32)","tf.Tensor(-0.0789544, shape=(), dtype=float32)","tf.Tensor(0.059527934, shape=(), dtype=float32)","tf.Tensor(0.0320648, shape=(), dtype=float32)","tf.Tensor(0.03661666, shape=(), dtype=float32)","tf.Tensor(0.0033105891, shape=(), dtype=float32)"


In [9]:
tf.nn.l2_normalize(it_result)

<tf.Tensor: shape=(3, 512), dtype=float32, numpy=
array([[ 0.00720229, -0.02716358, -0.00670169, ...,  0.0047842 ,
         0.03265055, -0.01112303],
       [-0.00031301, -0.03090754, -0.00294118, ...,  0.04377454,
        -0.01193869, -0.01058715],
       [ 0.01333501,  0.01904838, -0.02599623, ...,  0.0150301 ,
         0.01788162, -0.00163474]], dtype=float32)>

In [10]:
# it_result

In [10]:
# Compute similarity matrix. Higher score indicates greater similarity.
similarity_matrix_it = np.inner(en_result, it_result)
similarity_matrix_it

array([[0.96964586, 0.27347848, 0.25536823],
       [0.3772605 , 0.8931674 , 0.2880668 ],
       [0.24611363, 0.23792979, 0.9352153 ]], dtype=float32)

In [11]:
similarity_matrix_ja = np.inner(en_result, ja_result)
similarity_matrix_ja

array([[0.97994244, 0.53417313, 0.33749652],
       [0.4187406 , 0.77861166, 0.3792975 ],
       [0.25998732, 0.29997385, 0.8583673 ]], dtype=float32)

# Cosine Similarities
Using example form Colab

https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb#scrollTo=W-q2r7jyZGb7

In [12]:
def get_similarities(sent_1, sent_2):
    sts_encode1 = tf.nn.l2_normalize(model(sent_1), axis=1)
    sts_encode2 = tf.nn.l2_normalize(model(sent_2), axis=1)
    
    cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
    scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
    """Returns the similarity scores"""
    return scores

In [13]:
get_similarities(english_sentences[0], italian_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.921372], dtype=float32)>

In [14]:
get_similarities(english_sentences[0], japanese_sentences[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.93613935], dtype=float32)>

In [15]:
model("hello").numpy().shape

(1, 512)

## Test in df

Pandas .apply()
```
s_output = df_test['text'].apply(model)

>
CPU times: user 1min 39s, sys: 14 s, total: 1min 53s
Wall time: 1min 4s
```


```
ddf_text['embeddings'] = (
    ddf_text
    .map_partitions(model,
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

>
CPU times: user 692 ms, sys: 168 ms, total: 860 ms
Wall time: 982 ms
```

In [16]:
%%time
df_test = pd.DataFrame({'text': english_sentences*2000})
df_test

CPU times: user 1.14 ms, sys: 0 ns, total: 1.14 ms
Wall time: 1.11 ms


Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,dog
4,Puppies are nice.
...,...
5995,Puppies are nice.
5996,I enjoy taking long walks along the beach with my dog.
5997,dog
5998,Puppies are nice.


In [39]:
%%time

s_output = df_test['text'].apply(model)

CPU times: user 1min 39s, sys: 14 s, total: 1min 53s
Wall time: 1min 4s


### Try in parallel

In [17]:
import dask.dataframe as dd

In [18]:
ddf_text = dd.from_pandas(df_test, npartitions=6)

In [19]:
ddf_text.head()

Unnamed: 0,text
0,dog
1,Puppies are nice.
2,I enjoy taking long walks along the beach with my dog.
3,dog
4,Puppies are nice.


In [20]:
ddf_text[['text']].tail(10)

Unnamed: 0,text
5990,I enjoy taking long walks along the beach with my dog.
5991,dog
5992,Puppies are nice.
5993,I enjoy taking long walks along the beach with my dog.
5994,dog
5995,Puppies are nice.
5996,I enjoy taking long walks along the beach with my dog.
5997,dog
5998,Puppies are nice.
5999,I enjoy taking long walks along the beach with my dog.


## Define functions to get numpy array

In [21]:
def get_embeddings_as_numpy_df(
    df: pd.DataFrame,
    text_col: str = 'text',
    model_fxn: callable = model,
    apply_fxn: str = 'apply',
) -> pd.Series:
    """Apply fxn to get embeddings to a df
    Use it so that we can pair it with dask and apply it in parallel
    """
    if apply_fxn == 'apply':
        return df[text_col].apply(lambda x: model(x).numpy())
    else:
        # original example used map
        return df[text_col].map(model_fxn).apply(lambda x: x.numpy())
    

In [22]:
def get_embeddings_as_numpy(
    text_string: str,
#     tf_model: callable = model,
) -> np.ndarray:
    """Wrapper to convert Tensor type to numpy to make downstream transformations faster"""
    return model(text_string).numpy()

## Get embeddings with `dask.map_partition`

In [23]:
%%time

ddf_text['embeddings'] = (
    ddf_text[['text']]
    .map_partitions(model,
                    meta=pd.Series(name='embeddings', dtype=str)
                    )
    .compute()
)

CPU times: user 2.69 s, sys: 953 ms, total: 3.64 s
Wall time: 3.5 s


In [26]:
ddf_text[['embeddings']].tail(10)

Unnamed: 0,embeddings
5990,
5991,
5992,
5993,
5994,
5995,
5996,
5997,
5998,
5999,


In [28]:
ddf_text[['embeddings']].compute().head(10)

Unnamed: 0,embeddings
0,"((tf.Tensor(-0.0052521233, shape=(), dtype=float32), tf.Tensor(-0.038311988, shape=(), dtype=float32), tf.Tensor(-0.009220155, shape=(), dtype=float32), tf.Tensor(-0.0460963, shape=(), dtype=float32), tf.Tensor(-0.057657693, shape=(), d..."
1,"((tf.Tensor(-0.022675822, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.01550865, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), dty..."
2,"((tf.Tensor(0.019088339, shape=(), dtype=float32), tf.Tensor(0.00955222, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493789, shape=(), dtype=float32), tf.Tensor(-0.042908438, shape=(), dtyp..."
3,"((tf.Tensor(-0.0052521233, shape=(), dtype=float32), tf.Tensor(-0.03831199, shape=(), dtype=float32), tf.Tensor(-0.009220156, shape=(), dtype=float32), tf.Tensor(-0.046096303, shape=(), dtype=float32), tf.Tensor(-0.05765769, shape=(), d..."
4,"((tf.Tensor(-0.022675809, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.015508649, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), dt..."
5,"((tf.Tensor(0.019088333, shape=(), dtype=float32), tf.Tensor(0.009552219, shape=(), dtype=float32), tf.Tensor(-0.047412943, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.042908438, shape=(), dty..."
6,
7,
8,
9,


In [96]:
embeddings_size = 512
# tf_embeddings = np.empty([len(ddf_text), embeddings_size])
# tf_embeddings.shape

In [98]:
type(ddf_text)

dask.dataframe.core.DataFrame

In [102]:
%%time

ddf_text[['embeddings']].compute().head()

CPU times: user 39 ms, sys: 6.32 ms, total: 45.4 ms
Wall time: 34.9 ms


Unnamed: 0,embeddings
0,"((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt..."
1,"((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty..."
2,"((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d..."
3,"((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype..."
4,"((tf.Tensor(-0.0052521243, shape=(), dtype=float32), tf.Tensor(-0.03831199, shape=(), dtype=float32), tf.Tensor(-0.009220154, shape=(), dtype=float32), tf.Tensor(-0.0460963, shape=(), dtype=float32), tf.Tensor(-0.05765769, shape=(), dty..."


In [103]:
# %%time

# # This doesn't seem to work because dask expects the output to collapse to a single dimension (instead of 512)
# # tf_embeddings = 
# (
#     ddf_text[['text']]
#     .map_partitions(get_embeddings_as_numpy,
#                     meta=pd.Series(name='embeddings', dtype=str)
#                     )
#     .compute()
# )

In [107]:
%%time

ddf_text['embeddings_np1'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='map',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 40s, sys: 52.2 s, total: 3min 32s
Wall time: 46.8 s


In [106]:
%%time

ddf_text['embeddings_np2'] = (
    ddf_text
    .map_partitions(get_embeddings_as_numpy_df,
                    text_col='text',
                    apply_fxn='apply',
                    meta=pd.Series(name='text', dtype=str)
                    )
    .compute()
)

CPU times: user 2min 43s, sys: 52.5 s, total: 3min 35s
Wall time: 49.1 s


## What is the dtype? / how do we get embeddings out of this output?

By default it looks like the dtype is a `tensorflow.Tensor` object, which could be a pain to manipulate.

In [108]:
ddf_text['embeddings'].head()

0    ((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt...
1    ((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty...
2    ((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d...
3    ((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype...
4    ((tf.Tensor(-0.

In [109]:
ddf_text['embeddings'].tail()

5995    NaN
5996    NaN
5997    NaN
5998    NaN
5999    NaN
Name: embeddings, dtype: object

In [49]:
ddf_text['embeddings_np'].head()

0    [[-0.005252127, -0.03831199, -0.00922016, -0.046096295, -0.057657722, -0.04768436, -0.016511641, 0.0047285636, -0.013010718, -0.0713816, -0.030422542, 0.023165481, 0.017145848, 0.045331325, 0.015669955, -0.01395997, -0.05360133, -0.0118...
1    [[-0.02267581, -0.069071844, 0.015508692, -0.029039733, -0.08988534, -0.0010676696, -0.014085712, 0.0072551174, -0.019989632, 0.0669519, 0.017029503, 0.022945907, -0.006108564, 0.05118298, -0.043797858, -0.041236266, 0.029429343, 0.0488...
2    [[0.019088332, 0.009552218, -0.04741294, 0.0104937535, -0.042908445, -0.06331453, -0.0031612532, 0.051496144, 0.027147723, -0.010961102, 0.034988593, 0.057352275, 0.046823528, 0.057418354, -0.00366939, 0.0066046086, -0.0013654096, 0.003...
3    [[-0.0052521275, -0.03831199, -0.009220155, -0.046096295, -0.05765772, -0.04768436, -0.016511641, 0.0047285655, -0.013010721, -0.0713816, -0.030422544, 0.023165483, 0.017145844, 0.04533133, 0.015669957, -0.013959968, -0.05360133, -0.011...
4    [[-0.02267581, 

In [None]:
ddf_text['embeddings_np'].tail()

In [55]:
%%time
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

CPU times: user 12.6 ms, sys: 0 ns, total: 12.6 ms
Wall time: 10.2 ms


0    [[-0.0052521224, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511647, 0.004728538, -0.01301072, -0.07138159, -0.03042251, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959975, -0.05360135, -0.0118...
1    [[-0.022675814, -0.069071874, 0.015508659, -0.02903972, -0.08988533, -0.0010676887, -0.014085756, 0.007255134, -0.019989597, 0.06695186, 0.01702949, 0.02294587, -0.006108526, 0.05118297, -0.043797873, -0.04123629, 0.029429296, 0.0488887...
2    [[0.019088339, 0.009552221, -0.04741294, 0.010493785, -0.04290844, -0.06331449, -0.0031612637, 0.051496122, 0.027147735, -0.010961098, 0.034988593, 0.057352237, 0.046823513, 0.057418343, -0.0036693846, 0.006604615, -0.0013653795, 0.0038...
3    [[-0.0052521233, -0.03831199, -0.009220155, -0.0460963, -0.057657693, -0.047684345, -0.016511645, 0.0047285357, -0.013010718, -0.07138159, -0.030422507, 0.023165515, 0.017145874, 0.045331337, 0.015669929, -0.013959976, -0.05360135, -0.0...
4    [[-0.022675822,

In [111]:
ddf_text['embeddings'].compute()

0       ((tf.Tensor(-0.005252141, shape=(), dtype=float32), tf.Tensor(-0.038312018, shape=(), dtype=float32), tf.Tensor(-0.009220189, shape=(), dtype=float32), tf.Tensor(-0.04609629, shape=(), dtype=float32), tf.Tensor(-0.05765771, shape=(), dt...
1       ((tf.Tensor(0.019088347, shape=(), dtype=float32), tf.Tensor(0.009552213, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.0104937935, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dty...
2       ((tf.Tensor(-0.022675823, shape=(), dtype=float32), tf.Tensor(-0.069071874, shape=(), dtype=float32), tf.Tensor(0.0155086545, shape=(), dtype=float32), tf.Tensor(-0.029039716, shape=(), dtype=float32), tf.Tensor(-0.08988534, shape=(), d...
3       ((tf.Tensor(0.01908833, shape=(), dtype=float32), tf.Tensor(0.009552218, shape=(), dtype=float32), tf.Tensor(-0.047412947, shape=(), dtype=float32), tf.Tensor(0.010493787, shape=(), dtype=float32), tf.Tensor(-0.04290844, shape=(), dtype...
4       

In [110]:
# %%time
# ddf_text['embeddings'].apply(lambda x: x.numpy())

In [112]:
ddf_text['embeddings'].head().apply(lambda x: x.numpy())

0    [[-0.005252141, -0.038312018, -0.009220189, -0.04609629, -0.05765771, -0.047684345, -0.01651165, 0.0047285794, -0.013010711, -0.071381606, -0.030422565, 0.02316548, 0.017145867, 0.045331288, 0.015669933, -0.013959985, -0.053601343, -0.0...
1    [[0.019088347, 0.009552213, -0.047412947, 0.0104937935, -0.04290844, -0.06331449, -0.0031612592, 0.051496133, 0.027147729, -0.010961093, 0.034988593, 0.05735223, 0.046823505, 0.057418343, -0.0036693835, 0.0066046203, -0.0013653715, 0.00...
2    [[-0.022675823, -0.069071874, 0.0155086545, -0.029039716, -0.08988534, -0.0010676958, -0.014085754, 0.0072551346, -0.019989599, 0.06695186, 0.01702949, 0.022945872, -0.0061085364, 0.051182974, -0.04379786, -0.04123629, 0.029429302, 0.04...
3    [[0.01908833, 0.009552218, -0.047412947, 0.010493787, -0.04290844, -0.063314475, -0.0031612662, 0.051496133, 0.027147733, -0.010961089, 0.0349886, 0.05735224, 0.046823505, 0.057418346, -0.003669383, 0.0066046147, -0.0013653776, 0.003832...
4    [[-0.0052521243

In [113]:
# ddf_text['embeddings'].apply(lambda x: x.numpy())