# Purpose
This notebook runs the `vectorize_text_to_embeddings` function to:
- load the USE-multilingual model
- load post & comment text
- convert the text into embeddings (at post or comment level)


# Notebook setup

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell executes, so edits to project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# from datetime import datetime
# import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

import subclu
from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
#     D_MODELS_CPU,
#     process_text_for_fse,
#     vectorize_text_with_fse,
)
from subclu.models.preprocess_text import TextPreprocessor, transform_and_tokenize_text
from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Print the version of each library this notebook relies on, so the saved
# output documents the environment. Every library is listed exactly once.
print_lib_versions([mlflow, np, pd, tensorflow_text, tf, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.4
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.3
subclu		v: 0.1.1


In [3]:
# Plotting: bind the matplotlib submodules to their usual short aliases
from matplotlib import pyplot as plt
from matplotlib import ticker as mtick
from matplotlib import dates as mdates

plt.style.use('default')

# Project helpers imported from subclu.utils.eda in the imports cell
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [4]:
# use new class to initialize mlflow
# NOTE(review): 'sqlite' looks like a shorthand that MlflowLogger expands into
# the full sqlite tracking URI shown in this cell's output — confirm in the class.
mlf = MlflowLogger(tracking_uri='sqlite')
# Display the tracking URI mlflow is using now, as a sanity check of the setup
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

## Get list of experiments with new function

In [5]:
# Show the experiments known to the tracking store as a pandas table
# (columns in the output: experiment_id, name, artifact_location, lifecycle_stage)
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active


# Check whether we have access to a GPU

In [6]:
# Helper used below to list every local device TensorFlow reports
from tensorflow.python.client import device_lib

# Physical GPUs visible to TensorFlow (an empty list means none were found)
l_phys_gpus = tf.config.list_physical_devices('GPU')

# Emit the report pieces back-to-back (sep='') so the text matches a single
# concatenated string
print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}",
    f"\nGPUs\n===",
    f"\nNum GPUs Available: {len(l_phys_gpus)}",
    f"\nGPU details:\n{l_phys_gpus}",
    f"\n\nAll devices:\n===\n",
    f"{device_lib.list_local_devices()}",
    sep='',
)


Built with CUDA? True
GPUs
===
Num GPUs Available: 0
GPU details:
[]

All devices:
===
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13909519659929254346
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 17707750117119033111
physical_device_desc: "device: XLA_CPU device"
]


# Call function to vectorize text

- Batch of: 3000 
- Limit characters to: 1000
This combination finally leaves enough headroom: it uses only around 50% of RAM (of 60GB).

The problem is that each iteration takes around 3 minutes, which means the whole job for GERMAN only will take around 4:42 (hours:minutes)...

In [7]:
# Name of the MLflow experiment that receives the small, sampled test runs;
# also used later to look up the experiment id when listing its runs.
mlflow_experiment_test = 'use_multilingual_v0.1_test'

## Test on subreddits only to make sure entire process works first

For subreddit only, we can expand to more than 1,500 characters.

When scoring posts &/or comments, we might be better off trimming to the first ~1,000 characters to speed things up. We can increase the character limit if results aren't great... this could be a hyperparameter to tune.

In [9]:
# End whatever MLflow run is currently active and mark it KILLED —
# presumably a guard against a run left open by an interrupted execution.
mlflow.end_run(status='KILLED')

# Small end-to-end test on sampled data. The call returns the model plus three
# embedding tables: posts (`df_vect`), comments and subreddit descriptions.
model, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name='test_n_samples',
    mlflow_experiment=mlflow_experiment_test,
    
    tokenize_lowercase=True,
    # Input locations; the log shows them resolved under the
    # gs://i18n-subreddit-clustering bucket
    subreddits_path='subreddits/de/2021-06-16',
    posts_path='posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',
    # Batch size for inference (log: "Getting embeddings in batches of size: 1000")
    tf_batch_inference_rows=1000,
    # assumes only the first N characters of each text are embedded — TODO confirm
    tf_limit_first_n_chars=1000,
    # Row counts to sample down to (log: "Sampling posts down to: 1,500")
    n_sample_posts=1500,
    n_sample_comments=2100,
)

10:12:16 | INFO | "Start vectorize function"
10:12:16 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-07-01_1012"
10:12:16 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/de/2021-06-16"
10:12:38 | INFO | "  0:00:21.509302 <- df_post time elapsed"
10:12:38 | INFO | "  (262226, 6) <- df_posts.shape"
10:12:38 | INFO | "  Sampling posts down to: 1,500"
10:12:38 | INFO | "  (1500, 6) <- df_posts.shape AFTER sampling"
10:12:38 | INFO | "Load comments df..."
10:13:08 | INFO | "  (1108757, 6) <- df_comments shape"
10:13:09 | INFO | "Keep only comments that match posts IDs in df_posts..."
10:13:09 | INFO | "  (5922, 6) <- updated df_comments shape"
10:13:09 | INFO | "  Sampling posts down to: 2,100"
10:13:09 | INFO | "  (2100, 6) <- df_posts.shape AFTER sampling"
10:13:09 | INFO | "Load subreddits df..."
10:13:10 | INFO | "  (629, 4) <- df_subs shape"
10:13:10 | INFO | "MLflow tracking URI: sqlite:////home/

  0%|          | 0/2 [00:00<?, ?it/s]

10:13:23 | INFO | "  Saving to local... df_vect_posts..."
10:13:23 | INFO | "  Logging to mlflow..."
10:13:24 | INFO | "Vectorizing COMMENTS..."
10:13:24 | INFO | "Getting embeddings in batches of size: 1000"


  0%|          | 0/3 [00:00<?, ?it/s]

10:13:27 | INFO | "  Saving to local... df_vect_comments..."
10:13:27 | INFO | "  Logging to mlflow..."
10:13:29 | INFO | "  0:01:12.740057 <- Total vectorize fxn time elapsed"


In [10]:
# Peek at the first 5 post rows and the first 10 embedding columns
df_vect.head(5).iloc[:, :10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9
subreddit_name,subreddit_id,post_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
wixbros,t5_3kytod,t3_mq7a62,-0.038117,0.038844,0.038505,-0.043656,-0.068405,0.022897,0.044303,0.077128,-0.081332,0.052953
buenzli,t5_2xbtv,t3_nvehlu,-0.048097,0.063601,0.026753,0.067516,0.008386,0.061029,0.066612,-0.006206,-0.052348,0.025185
deutschetributes,t5_3nl4m8,t3_mknpjt,-0.04299,0.050231,0.032712,0.064595,-0.096226,0.055949,0.042604,-0.050687,-0.065958,-0.026212
hamburg,t5_2r4mj,t3_mwaul4,-0.008832,-0.001481,0.068156,-0.010377,-0.09299,0.017509,0.034375,-0.021503,-0.038604,0.027993
fragreddit,t5_2r6ca,t3_nmidll,-0.024095,0.034854,-0.049674,0.078292,0.027804,0.049831,0.048479,0.036318,0.000942,0.042647


In [11]:
# Rows 10-14 of the comment embeddings, restricted to the last 10 columns
df_vect_comments.iloc[10:15].iloc[:, -10:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,embeddings_502,embeddings_503,embeddings_504,embeddings_505,embeddings_506,embeddings_507,embeddings_508,embeddings_509,embeddings_510,embeddings_511
subreddit_name,subreddit_id,post_id,comment_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
de,t5_22i0,t3_mtzmuy,t1_gv2qwej,0.006573,0.02274,0.022035,0.023709,-0.032349,-0.056558,-0.032651,0.032886,0.058517,0.003763
ich_iel,t5_37k29,t3_n3w8k7,t1_gwse0h5,-0.023824,-0.022993,0.008194,0.055177,0.047211,-0.018259,0.018188,0.001428,-0.008705,-0.02743
deutschetributes,t5_3nl4m8,t3_nw4zys,t1_h19nt95,-0.030743,0.028614,0.03305,0.054707,0.024857,-0.009729,0.016434,-0.013586,0.040401,0.021953
de,t5_22i0,t3_n1db9m,t1_gwg13yu,-0.040634,0.077304,-0.035834,0.077472,0.007378,-0.014481,-0.032528,0.0319,-0.043965,0.091057
ich_iel,t5_37k29,t3_nrm7uy,t1_h0hdl7s,-0.082705,-0.020488,-0.012717,0.026758,0.044663,0.03874,0.009746,-0.033515,-0.045458,0.109007


In [12]:
# Peek at the first 5 subreddit rows and the first 10 embedding columns
df_vect_subs.head(5).iloc[:, :10]

Unnamed: 0_level_0,Unnamed: 1_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9
subreddit_name,subreddit_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
de,t5_22i0,-0.018191,-0.045794,0.035795,-0.036392,0.033076,0.013654,-0.013067,-0.031252,0.001283,0.0247
ich_iel,t5_37k29,-0.019543,-0.00259,-0.002255,0.00964,-0.080638,0.053921,0.068653,-0.051635,0.038154,0.016369
nicoledobrikov1,t5_3oioc0,0.00024,0.0437,-0.030162,-0.023365,0.051887,0.050446,0.013388,-0.049501,-0.059686,-0.068271
germany,t5_2qi4z,0.030575,-0.057457,0.007206,0.029543,-0.003699,0.064915,-0.033345,-0.066493,-0.01916,0.014145
germansgonewild,t5_37g5b,0.022604,-0.032705,-0.016022,0.06629,0.052799,0.029996,0.008364,-0.049809,-0.004913,-0.056319


# Check mlflow experiment & Read artifact

In [13]:
# Keep the experiment listing in a DataFrame so a later cell can look up an
# experiment id by name; the bare name on the last line displays the table.
df_mlf_exp = mlf.list_experiment_meta(output_format='pandas')
df_mlf_exp

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active


## Check runs in experiment

In [14]:
# Resolve the test experiment's id from its name (first matching row)
exp_id = (
    df_mlf_exp
    .loc[df_mlf_exp['name'].eq(mlflow_experiment_test), 'experiment_id']
    .values[0]
)

# Display the first 8 rows of the runs table for that experiment
mlf.search_all_runs(experiment_ids=[exp_id]).head(8)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.df_vect_subreddits_description_rows,metrics.df_vect_subreddits_description_cols,metrics.df_vect_comments_cols,metrics.df_vect_posts_cols,metrics.vectorizing_time_minutes,metrics.df_vect_comments_rows,metrics.df_vect_posts_rows,metrics.df_subs_len,params.tf_limit_first_n_chars,params.tokenize_function,params.model_name,params.posts_path,params.col_text_post_word_count,params.col_subreddit_id,params.n_sample_comments,params.model_location,params.col_text_comment_word_count,params.col_text_post,params.host_name,params.col_text_subreddit_word_count,params.comments_path,params.n_sample_posts,params.bucket_name,params.col_comment_id,params.col_text_subreddit_description,params.col_text_post_url,params.col_text_comment,params.tf_batch_inference_rows,params.subreddits_path,params.col_post_id,params.tokenize_lowercase,params.preprocess_text_folder,tags.mlflow.source.type,tags.host_name,tags.mlflow.runName,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.source.git.commit
0,45201072143a4d7fbb86a2f2b7d85520,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/45201072143a4d7fbb86a2f2b7d85520/artifacts,2021-07-01 10:13:10.544000+00:00,2021-07-01 10:13:29.348000+00:00,629.0,512.0,512.0,512.0,1.212334,2100.0,1500.0,,1000.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,2100.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,tensorflow-2-3-20210617-fix,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1500.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,1000.0,subreddits/de/2021-06-16,post_id,True,,LOCAL,tensorflow-2-3-20210617-fix,test_n_samples,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
1,b0569cb9a7fa4820a940cb6eee6f2045,5,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/b0569cb9a7fa4820a940cb6eee6f2045/artifacts,2021-07-01 10:05:26.367000+00:00,2021-07-01 10:12:16.530000+00:00,629.0,512.0,,512.0,,,1500.0,,1000.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,2100.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,tensorflow-2-3-20210617-fix,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1500.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,1000.0,subreddits/de/2021-06-16,post_id,True,,LOCAL,tensorflow-2-3-20210617-fix,test_n_samples,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
2,19cc9e3673b24b10bc56b96ccf3fefb7,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/19cc9e3673b24b10bc56b96ccf3fefb7/artifacts,2021-07-01 10:00:28.094000+00:00,2021-07-01 10:00:45.939000+00:00,629.0,512.0,512.0,512.0,1.219346,2100.0,1500.0,,1000.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,2100.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,tensorflow-2-3-20210617-fix,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1500.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,1000.0,subreddits/de/2021-06-16,post_id,True,,LOCAL,tensorflow-2-3-20210617-fix,test_n_samples,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
3,e48b0170c7ec4b3a9f4712676de6115e,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/e48b0170c7ec4b3a9f4712676de6115e/artifacts,2021-07-01 09:50:34.758000+00:00,2021-07-01 09:50:53.902000+00:00,629.0,512.0,512.0,512.0,1.366118,2200.0,1500.0,,1200.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,2200.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,tensorflow-2-3-20210617-fix,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1500.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,2000.0,subreddits/de/2021-06-16,post_id,False,,LOCAL,tensorflow-2-3-20210617-fix,test_n_samples,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
4,f4146aeea6f740ceadcf53c3da0c55cc,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/f4146aeea6f740ceadcf53c3da0c55cc/artifacts,2021-07-01 09:26:54.619000+00:00,2021-07-01 09:27:14.869000+00:00,629.0,512.0,512.0,512.0,1.346785,3500.0,2988.0,,1200.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,3500.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,tensorflow-2-3-20210617-fix,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1500.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,3000.0,subreddits/de/2021-06-16,post_id,False,,LOCAL,tensorflow-2-3-20210617-fix,,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
5,228dc2d325af46aa8b77fcf9b48fb4f9,5,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/228dc2d325af46aa8b77fcf9b48fb4f9/artifacts,2021-07-01 09:18:34.401000+00:00,2021-07-01 09:24:45.508000+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,LOCAL,,,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
6,86a7fd9c16924966a46daecf71f32597,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/86a7fd9c16924966a46daecf71f32597/artifacts,2021-07-01 09:06:22.773000+00:00,2021-07-01 09:06:43.136000+00:00,629.0,512.0,512.0,512.0,1.422616,2500.0,1997.0,,1200.0,sklearn,use_multilingual,posts/de/2021-06-16,text_word_count,subreddit_id,2500.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,comment_text_word_count,text,,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1000.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,2000.0,subreddits/de/2021-06-16,post_id,False,,LOCAL,tensorflow-2-3-20210617-fix,,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d
7,49934e85170b44bf8da96949b28b9edd,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/49934e85170b44bf8da96949b28b9edd/artifacts,2021-07-01 07:59:05.396000+00:00,2021-07-01 09:04:33.319000+00:00,629.0,512.0,,,,,,,1200.0,sklearn,use_multilingual_large,posts/de/2021-06-16,text_word_count,subreddit_id,1500.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3,comment_text_word_count,text,,subreddit_name_title_and_clean_descriptions_word_count,comments/de/2021-06-16,1000.0,i18n-subreddit-clustering,comment_id,subreddit_name_title_and_clean_descriptions,post_url_for_embeddings,comment_body_text,2000.0,subreddits/de/2021-06-16,post_id,False,,LOCAL,tensorflow-2-3-20210617-fix,,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d


In [15]:
%%time

# Run to read back: the FINISHED `test_n_samples` run shown as row 0 of the
# runs table above.
run_id = '45201072143a4d7fbb86a2f2b7d85520'

# Load the subreddit-description embeddings stored under that run's artifacts,
# using pandas' parquet reader.
df_v_subs = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='df_vect_subreddits_description',
    read_function=pd.read_parquet,
)
# Shape check: the recorded output is (629, 512)
print(df_v_subs.shape)

(629, 512)
CPU times: user 169 ms, sys: 0 ns, total: 169 ms
Wall time: 1.75 s


In [23]:
# In-memory subreddit embeddings vs. the copy read back from the MLflow artifact
np.allclose(df_vect_subs.to_numpy(), df_v_subs.to_numpy())

True

In [16]:
# First 5 rows / first 10 embedding columns of the re-loaded subreddit table
df_v_subs.head(5).iloc[:, :10]

Unnamed: 0_level_0,Unnamed: 1_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9
subreddit_name,subreddit_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
de,t5_22i0,-0.018191,-0.045794,0.035795,-0.036392,0.033076,0.013654,-0.013067,-0.031252,0.001283,0.0247
ich_iel,t5_37k29,-0.019543,-0.00259,-0.002255,0.00964,-0.080638,0.053921,0.068653,-0.051635,0.038154,0.016369
nicoledobrikov1,t5_3oioc0,0.00024,0.0437,-0.030162,-0.023365,0.051887,0.050446,0.013388,-0.049501,-0.059686,-0.068271
germany,t5_2qi4z,0.030575,-0.057457,0.007206,0.029543,-0.003699,0.064915,-0.033345,-0.066493,-0.01916,0.014145
germansgonewild,t5_37g5b,0.022604,-0.032705,-0.016022,0.06629,0.052799,0.029996,0.008364,-0.049809,-0.004913,-0.056319


In [17]:
%%time

# Load the post embeddings stored under the same run's artifacts (`run_id` is
# assigned in the subreddit read-back cell above), using pandas' parquet reader.
df_v_posts = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='df_vect_posts',
    read_function=pd.read_parquet,
)
# Shape check: the recorded output is (1500, 512), matching n_sample_posts=1500
print(df_v_posts.shape)

(1500, 512)
CPU times: user 99.4 ms, sys: 92 ms, total: 191 ms
Wall time: 1.77 s


In [24]:
# In-memory post embeddings vs. the copy read back from the MLflow artifact
np.allclose(df_vect.to_numpy(), df_v_posts.to_numpy())

True

In [27]:
# Rows 14-19 of the re-loaded post embeddings, first 10 embedding columns
df_v_posts.iloc[14:20].iloc[:, :10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9
subreddit_name,subreddit_id,post_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pcbaumeister,t5_4c1x98,t3_nlj0xz,-0.028007,0.018425,0.025313,-0.084277,-0.044653,0.036714,-0.084001,-0.060739,0.022273,-0.055197
dagibeehot,t5_wv7c1,t3_mzpji1,0.079956,0.062191,0.042096,0.028683,-0.014719,0.002513,-0.016711,0.036436,-0.033169,-0.016168
germansgonewild,t5_37g5b,t3_nkhuwl,-0.04588,0.025566,0.004287,0.02,-0.086096,0.016448,-0.003725,0.049456,-0.073738,-0.021704
de,t5_22i0,t3_nkm3hr,-0.05499,0.009562,0.011608,0.017721,0.01471,0.058977,0.061449,0.020423,-0.010647,0.038405
de,t5_22i0,t3_mpc8ai,-0.056767,-0.07399,0.057309,0.051738,0.019686,0.081643,-0.010165,0.045042,-0.045683,-0.015345
huebi,t5_29zucx,t3_mubs9j,0.15478,0.00766,0.06628,-0.002162,-0.080812,0.075854,0.000574,0.07894,-0.122165,-0.002068


In [21]:
%%time

# Load the comment embeddings stored under the same run's artifacts (`run_id`
# is assigned in the subreddit read-back cell above), using pandas' parquet reader.
df_v_comments = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='df_vect_comments',
    read_function=pd.read_parquet,
)
# Shape check: the recorded output is (2100, 512), matching n_sample_comments=2100
print(df_v_comments.shape)

(2100, 512)
CPU times: user 441 ms, sys: 64.8 ms, total: 506 ms
Wall time: 2.05 s


In [25]:
# In-memory comment embeddings vs. the copy read back from the MLflow artifact
np.allclose(df_vect_comments.to_numpy(), df_v_comments.to_numpy())

True

In [26]:
# First 5 rows / first 10 embedding columns of the re-loaded comment table
df_v_comments.head(5).iloc[:, :10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9
subreddit_name,subreddit_id,post_id,comment_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
de,t5_22i0,t3_n1db9m,t1_gwgumip,-0.050319,0.019366,0.008127,-0.035954,0.058837,0.018773,-0.077046,-0.080273,-0.007283,0.020304
ich_iel,t5_37k29,t3_muosjc,t1_gv7rlal,-0.020838,0.016757,-0.027872,0.005312,0.046645,0.074642,0.02286,-0.041156,0.009235,-0.068941
buenzli,t5_2xbtv,t3_ngs8bj,t1_gysjscj,-0.038592,-0.034569,-0.045555,0.006089,-0.044613,0.008128,0.023125,-0.062052,-0.024423,-0.032473
nicoledobrikovof,t5_3k1wb9,t3_noa9fo,t1_h0dfyag,0.020388,-0.063959,0.013214,-0.057574,0.054215,0.06014,-0.015974,-0.032665,-0.087324,0.022982
de,t5_22i0,t3_ngydq1,t1_gyu585z,-0.052664,0.04226,0.013913,0.053029,0.043332,0.046601,-0.062652,-0.046233,-0.016664,0.081627


# Run full with lower_case=True

In [28]:
# Name of the MLflow experiment that receives the full (non-test) runs
mlflow_experiment_full = 'use_multilingual_v1'

In [None]:
# End whatever MLflow run is currently active (marking it KILLED) so the full
# run below starts as a fresh run.
mlflow.end_run(status='KILLED')

# Full run with lowercasing, logged to the "full" experiment. The sampling
# arguments and the `test_n_samples` run name used for the test runs are not
# reused here, so this run covers all rows and is identifiable on its own.
model, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name='full_data-lowercase_true',
    mlflow_experiment=mlflow_experiment_full,

    tokenize_lowercase=True,
    subreddits_path='subreddits/de/2021-06-16',
    posts_path='posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',
    tf_batch_inference_rows=1000,
    tf_limit_first_n_chars=1000,
    # NOTE(review): None is assumed to mean "no sampling, use every row" —
    # confirm against the signature of vectorize_text_to_embeddings.
    n_sample_posts=None,
    n_sample_comments=None,
)

# Run full with lower_case=False

In [None]:
# NOTE(review): `BREAK` is not defined anywhere in this notebook, so this cell
# raises NameError — presumably on purpose, to halt "Run All" before the cells
# below are executed. Confirm before removing.
BREAK

In [12]:
# End whatever MLflow run is currently active (marking it KILLED) before
# starting a new one.
mlflow.end_run(status='KILLED')

# Run with the large multilingual model and no lowercasing.
# - Logs to `mlflow_experiment_full`: the bare name `mlflow_experiment` is never
#   assigned in this notebook and would raise NameError on a fresh kernel.
# - Unpacks four values, matching what the function returns in the test call
#   earlier in this notebook (model, posts, comments, subreddit embeddings).
model, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual_large',
    mlflow_experiment=mlflow_experiment_full,

    tokenize_lowercase=False,
    subreddits_path='subreddits/de/2021-06-16',
    posts_path=None,  # 'posts/de/2021-06-16',
    comments_path=None,  # 'comments/de/2021-06-16',
    tf_batch_inference_rows=2000,
    tf_limit_first_n_chars=1200,
)

22:12:56 | INFO | "Start vectorize function"
22:12:56 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual_large/2021-06-30_2212"
22:12:56 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/de/2021-06-16"
22:13:08 | INFO | "  0:00:12.068115 <- df_post time elapsed"
22:13:08 | INFO | "  (262226, 6) <- df_posts.shape"
22:13:08 | INFO | "Load comments df..."
22:13:22 | INFO | "  (1108757, 6) <- df_comments shape"
22:13:23 | INFO | "Keep only comments that match posts IDs in df_posts..."
22:13:23 | INFO | "  (1108757, 6) <- updated df_comments shape"
22:13:23 | INFO | "Load subreddits df..."
22:13:25 | INFO | "  (629, 4) <- df_subs shape"
22:13:25 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
22:13:26 | INFO | "Loading model use_multilingual_large...
  with kwargs: None"
22:13:36 | INFO | "  0:00:09.588061 <- Load TF HUB model time elapsed"
22:13:36 | INFO | "Getting embeddings in batches

  0%|          | 0/132 [00:00<?, ?it/s]

01:43:41 | INFO | "Saving inference for comments df"


ValueError: parquet must have string column names

# Example from previous call using FSE/FastText/uSIF

In [17]:
# NOTE(review): archived example of an earlier FSE/FastText/uSIF call, kept for
# reference. It will not run as-is in this notebook:
#   - `import gc` is commented out in the imports cell, so `gc` is undefined
#   - `mlflow_experiment` is never assigned in this notebook
#   - it unpacks 3 values, while the USE calls above unpack 4
gc.collect()

# End whatever MLflow run is currently active (marking it KILLED)
mlflow.end_run(status='KILLED')
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    
    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=True,
    # Training filters; the log below reports how many posts each one excluded
    # (subreddits filter, duplicated posts, minimum word count)
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:25:16 | INFO | "Start vectorize function"
07:25:16 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0725"
07:25:16 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:25:22 | INFO | "  0:00:05.708467 <- df_post time elapsed"
07:25:22 | INFO | "  (111669, 6) <- df_posts.shape"
07:25:22 | INFO | "Load comments df..."
07:25:29 | INFO | "  (757388, 6) <- df_comments shape"
07:25:29 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:25:30 | INFO | "  (638052, 6) <- updated df_comments shape"
07:25:30 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:25:30 | INFO | "Filtering posts for SIF training..."
07:25:30 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:25:30 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:25:30 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:25:30 | INFO | "31,790 <- df_pos