# Purpose
This notebook runs the `vectorize_text_to_embeddings` function to:
- load the USE-multilingual model
- load post & comment text
- convert the text into embeddings (at post or comment level)


# Notebook setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [15]:
# from datetime import datetime
import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
    D_MODELS_CPU,
    process_text_for_fse,
    vectorize_text_with_fse,
)
from subclu.models.preprocess_text import TextPreprocessor, transform_and_tokenize_text

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Report the versions of the key libraries used in this notebook.
# Each module is listed once (mlflow was previously passed twice, so its version printed twice).
print_lib_versions([mlflow, np, pd, tensorflow_text, tf])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.4
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.2


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for all figures.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; their implementation is not visible here.
# Presumably setup_logging() installs the `HH:MM:SS | LEVEL | "message"` format seen in the
# cell outputs further down, and notebook_display_config() sets display options — TODO confirm.
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [6]:
# use new class to initialize mlflow
# MlflowLogger is the project wrapper from subclu.utils.mlflow_logger. After this call,
# mlflow.get_tracking_uri() reports a sqlite:// database URI (see the next cell's output).
mlf = MlflowLogger(tracking_uri='sqlite')

In [7]:
# Display the tracking URI mlflow is currently configured with.
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

## Get list of experiments with new function

In [10]:
# Raise the threshold of SQLAlchemy's engine logger to WARNING so that only
# warnings and errors from it appear in the notebook output.
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)

In [11]:
# Show the experiments known to the tracking database
# (the output lists id, name, artifact location, and lifecycle stage for each).
mlf.list_experiment_meta()

[{'experiment_id': '0',
  'name': 'Default',
  'artifact_location': './mlruns/0',
  'lifecycle_stage': 'active'},
 {'experiment_id': '1',
  'name': 'use_multilingual_v0.1_test',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/1',
  'lifecycle_stage': 'active'}]

# Check whether we have access to a GPU

In [29]:
# device_lib is only needed for this diagnostic, so it is imported alongside it.
from tensorflow.python.client import device_lib

# Physical GPUs visible to TensorFlow; an empty list means none were detected.
l_phys_gpus = tf.config.list_physical_devices('GPU')

# Emit the same report as one print call; sep="" joins the pieces without extra spaces.
print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}",
    "\nGPUs\n===",
    f"\nNum GPUs Available: {len(l_phys_gpus)}",
    f"\nGPU details:\n{l_phys_gpus}",
    "\n\nAll devices:\n===\n",
    f"{device_lib.list_local_devices()}",
    sep="",
)


Built with CUDA? True
GPUs
===
Num GPUs Available: 0
GPU details:
[]

All devices:
===
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17554342817575500758
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 5847791926257555059
physical_device_desc: "device: XLA_CPU device"
]


In [18]:

# NOTE(review): the recorded output below is a device listing, which a bare `print()` cannot
# produce — this cell's code appears to have been edited after it was last run.
# Presumably it once printed `device_lib.list_local_devices()`; confirm, or delete the cell.
print()

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12684331277337632839
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 14335225933054774797
physical_device_desc: "device: XLA_CPU device"
]


# Call function to vectorize text

In [12]:
# Experiment name passed as `mlflow_experiment` to the vectorize calls below;
# it matches the name of experiment_id '1' in the experiment listing above.
mlflow_experiment = 'use_multilingual_v0.1_test'

In [13]:
# End any mlflow run that is still active (e.g. left open by a failed cell), marking it KILLED.
mlflow.end_run(status='KILLED')

# Run the vectorize function with the `use_multilingual_large` model on the
# `de/2021-06-16` subreddit, post, and comment paths.
# NOTE(review): the recorded run of this cell did not finish — it raised ResourceExhaustedError
# (OOM allocating a [10278314, 512] float tensor on CPU) shortly after the TF-Hub model loaded,
# so none of the five names on the left-hand side were bound by that run.
model, df_posts, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual_large',
    mlflow_experiment=mlflow_experiment,
    
    tokenize_lowercase=False,
    subreddits_path='subreddits/de/2021-06-16',
    posts_path='posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',
)

15:57:40 | INFO | "Start vectorize function"
15:57:40 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual_large/2021-06-29_1557"
15:57:40 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/de/2021-06-16"
15:57:50 | INFO | "  0:00:09.939972 <- df_post time elapsed"
15:57:50 | INFO | "  (262226, 6) <- df_posts.shape"
15:57:50 | INFO | "Load comments df..."
15:58:02 | INFO | "  (1108757, 6) <- df_comments shape"
15:58:03 | INFO | "Keep only comments that match posts IDs in df_posts..."
15:58:03 | INFO | "  (1108757, 6) <- updated df_comments shape"
15:58:03 | INFO | "Load subreddits df..."
15:58:03 | INFO | "  (629, 4) <- df_subs shape"
15:58:03 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
15:58:04 | INFO | "Loading model use_multilingual_large...
  with kwargs: None"
15:58:04 | INFO | "Using /tmp/tfhub_modules to cache modules."
15:58:11 | INFO | "  0:00:06.638491 <- Load TF HUB model

ResourceExhaustedError:  OOM when allocating tensor with shape[10278314,512] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node StatefulPartitionedCall/StatefulPartitionedCall/EncoderTransformer/Transformer/PrepareForTransformer/dense/BiasAdd}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_restored_function_body_51014]

Function call stack:
restored_function_body


In [17]:
# Release objects left over from a previous run so their memory can be reclaimed
# before the next vectorize call.
# Each name is dropped on its own: a bare `del model, df_posts, d_ix_to_id` raises NameError
# on a fresh kernel or after a failed run, and stops at the first missing name.
for _stale_name in ('model', 'df_posts', 'd_ix_to_id'):
    globals().pop(_stale_name, None)
gc.collect()

# End any mlflow run that is still active, marking it KILLED, before starting a new one.
mlflow.end_run(status='KILLED')
# NOTE(review): this call looks stale relative to the In [13] call above — it unpacks three
# return values instead of five and passes tokenize_function / train_* keyword arguments, and
# its recorded log (dated 2021-06-02) writes to a `models/fse/...` directory and mentions
# "SIF training". Presumably it was recorded against an FSE-based vectorizer (the notebook also
# imports `vectorize_text_with_fse`) — verify the intended function before re-running.
# Variant: tokenize_function='sklearn_acronyms_emoji', tokenize_lowercase=True.
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    
    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=True,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    # Subreddits to exclude, per the parameter name, from training (the recorded log reports
    # 59,366 posts excluded by the subreddits filter).
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:25:16 | INFO | "Start vectorize function"
07:25:16 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0725"
07:25:16 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:25:22 | INFO | "  0:00:05.708467 <- df_post time elapsed"
07:25:22 | INFO | "  (111669, 6) <- df_posts.shape"
07:25:22 | INFO | "Load comments df..."
07:25:29 | INFO | "  (757388, 6) <- df_comments shape"
07:25:29 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:25:30 | INFO | "  (638052, 6) <- updated df_comments shape"
07:25:30 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:25:30 | INFO | "Filtering posts for SIF training..."
07:25:30 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:25:30 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:25:30 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:25:30 | INFO | "31,790 <- df_pos

In [21]:
# Release objects left over from a previous run so their memory can be reclaimed
# before the next vectorize call.
# Each name is dropped on its own: `del model, df_posts, d_ix_to_id` inside try/except stops
# at the first missing name, which would leave later objects (and their memory) alive.
for _stale_name in ('model', 'df_posts', 'd_ix_to_id'):
    globals().pop(_stale_name, None)
gc.collect()

# End any mlflow run that is still active, marking it KILLED, before starting a new one.
mlflow.end_run(status='KILLED')
# NOTE(review): this call unpacks three return values and passes tokenize_function / train_*
# keyword arguments, and its recorded log writes to a `models/fse/...` directory — presumably it
# was recorded against an FSE-based vectorizer; verify the intended function before re-running.
# Variant: tokenize_function='gensim', tokenize_lowercase=True.
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    
    tokenize_function='gensim',
    tokenize_lowercase=True,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    # Subreddits to exclude, per the parameter name, from training (the recorded log reports
    # 59,366 posts excluded by the subreddits filter).
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:45:47 | INFO | "Start vectorize function"
07:45:47 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0745"
07:45:47 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:45:52 | INFO | "  0:00:04.924321 <- df_post time elapsed"
07:45:52 | INFO | "  (111669, 6) <- df_posts.shape"
07:45:52 | INFO | "Load comments df..."
07:45:59 | INFO | "  (757388, 6) <- df_comments shape"
07:45:59 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:45:59 | INFO | "  (638052, 6) <- updated df_comments shape"
07:45:59 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:46:00 | INFO | "Filtering posts for SIF training..."
07:46:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:46:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:46:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:46:00 | INFO | "31,790 <- df_pos

In [22]:
# Release objects left over from a previous run so their memory can be reclaimed
# before the next vectorize call.
# Each name is dropped on its own: `del model, df_posts, d_ix_to_id` inside try/except stops
# at the first missing name, which would leave later objects (and their memory) alive.
for _stale_name in ('model', 'df_posts', 'd_ix_to_id'):
    globals().pop(_stale_name, None)
gc.collect()

# End any mlflow run that is still active, marking it KILLED, before starting a new one.
mlflow.end_run(status='KILLED')
# NOTE(review): this call unpacks three return values and passes tokenize_function / train_*
# keyword arguments, and its recorded log writes to a `models/fse/...` directory — presumably it
# was recorded against an FSE-based vectorizer; verify the intended function before re-running.
# Variant: tokenize_function='gensim', tokenize_lowercase=False.
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    
    tokenize_function='gensim',
    tokenize_lowercase=False,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    # Subreddits to exclude, per the parameter name, from training (the recorded log reports
    # 59,366 posts excluded by the subreddits filter).
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:49:43 | INFO | "Start vectorize function"
07:49:43 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0749"
07:49:43 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:49:50 | INFO | "  0:00:06.091006 <- df_post time elapsed"
07:49:50 | INFO | "  (111669, 6) <- df_posts.shape"
07:49:50 | INFO | "Load comments df..."
07:49:58 | INFO | "  (757388, 6) <- df_comments shape"
07:49:58 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:49:58 | INFO | "  (638052, 6) <- updated df_comments shape"
07:49:58 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:50:00 | INFO | "Filtering posts for SIF training..."
07:50:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:50:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:50:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:50:00 | INFO | "31,790 <- df_pos

# Recover artifact from mlflow

In [26]:
# Hard-coded id of a previously logged mlflow run whose artifacts we want to recover.
run_id = 'aac3e007dfc2446790e25887adf287f6'
# Look the run up in the tracking store; `run.info.artifact_uri` is used in the cells below.
run = mlflow.get_run(run_id)

In [27]:
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

'gs://i18n-subreddit-clustering/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [29]:
# Read the run's training_index -> post_id mapping straight from its artifact location.
ix_to_id_csv_uri = f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"
df_idx = pd.read_csv(ix_to_id_csv_uri)

# Sanity check: print the dimensions, then show the leading rows as the cell's display value.
print(df_idx.shape)
df_idx.head()

(111669, 2)


Unnamed: 0,training_index,post_id
0,0,t3_mkyj2k
1,1,t3_mkynzi
2,2,t3_mkyolv
3,3,t3_mkyp17
4,4,t3_mkyqrz
