# Purpose
This notebook runs the `vectorize_text_to_embeddings` function to:
- load the USE-multilingual model
- load post & comment text
- convert the text into embeddings (at post or comment level)


# Notebook setup

In [2]:
# Load IPython's autoreload extension in mode 2: all modules are reloaded
# before each cell executes, so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# from datetime import datetime
import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

import subclu
from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
    D_MODELS_CPU,
    process_text_for_fse,
    vectorize_text_with_fse,
)
from subclu.models.preprocess_text import TextPreprocessor, transform_and_tokenize_text
from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Report the versions of the key libraries used in this notebook.
# Each module is listed exactly once so the report has no repeated lines.
print_lib_versions([mlflow, np, pd, tensorflow_text, tf, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.4
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.3
subclu		v: 0.1.1


In [4]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; presumably they configure log
# formatting and notebook display options -- TODO confirm in subclu.utils.eda
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [5]:
# Initialize mlflow through the project's MlflowLogger wrapper class.
# NOTE(review): tracking_uri='sqlite' presumably selects the local sqlite-backed
# tracking store (the tracking URI displayed in the following cell is
# sqlite:////home/jupyter/mlflow/mlruns.db) -- confirm in MlflowLogger.
mlf = MlflowLogger(tracking_uri='sqlite')

In [6]:
# Display the tracking URI that mlflow is currently configured to use
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

## Get list of experiments with new function

In [7]:
# Raise the 'sqlalchemy.engine' logger threshold to WARNING so lower-level
# records are dropped (presumably to keep SQL chatter out of the cell output)
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

In [None]:
# mlf.list_experiment_meta()

# Check whether we have access to a GPU

In [9]:
# TF-internal helper used below to enumerate every local device
from tensorflow.python.client import device_lib

# Physical GPU devices visible to TensorFlow (an empty list means none found)
l_phys_gpus = tf.config.list_physical_devices('GPU')

# One report: CUDA build flag, visible GPUs, then the full local device list
print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}"
    f"\nGPUs\n==="
    f"\nNum GPUs Available: {len(l_phys_gpus)}"
    f"\nGPU details:\n{l_phys_gpus}"
    f"\n\nAll devices:\n===\n"
    f"{device_lib.list_local_devices()}"
)


Built with CUDA? True
GPUs
===
Num GPUs Available: 0
GPU details:
[]

All devices:
===
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16250078998344776964
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12248579071249388595
physical_device_desc: "device: XLA_CPU device"
]


# Call function to vectorize text

- Batch of: 3000 
- Limit characters to: 1000
These settings finally leave enough room to use around 50% of RAM (of 60GB).

The problem is that each iteration takes around 3 minutes, which means the whole job for GERMAN only will take around 4:42 hours:mins...

In [10]:
# Name of the mlflow experiment; passed as `mlflow_experiment` to the
# vectorize_text_to_embeddings calls in this notebook
mlflow_experiment = 'use_multilingual_v0.1_test'

In [None]:
# End any mlflow run that is still active (e.g. from an interrupted attempt),
# recording its status as KILLED
mlflow.end_run(status='KILLED')

# Embed the 'de' 2021-06-16 posts, comments and subreddits with USE-multilingual large
(
    model,
    df_posts,
    df_vect,
    df_vect_comments,
    df_vect_subs,
) = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    model_name='use_multilingual_large',

    # input data locations
    subreddits_path='subreddits/de/2021-06-16',
    posts_path='posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',

    # text handling and inference batching
    tokenize_lowercase=False,
    tf_limit_first_n_chars=1000,
    tf_batch_inference_rows=3000,
)

18:46:19 | INFO | "Start vectorize function"
18:46:19 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual_large/2021-06-30_1846"
18:46:19 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/de/2021-06-16"
18:46:43 | INFO | "  0:00:23.870929 <- df_post time elapsed"
18:46:43 | INFO | "  (262226, 6) <- df_posts.shape"
18:46:43 | INFO | "Load comments df..."
18:47:07 | INFO | "  (1108757, 6) <- df_comments shape"
18:47:08 | INFO | "Keep only comments that match posts IDs in df_posts..."
18:47:08 | INFO | "  (1108757, 6) <- updated df_comments shape"
18:47:08 | INFO | "Load subreddits df..."
18:47:10 | INFO | "  (629, 4) <- df_subs shape"
18:47:10 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
18:47:10 | INFO | "Loading model use_multilingual_large...
  with kwargs: None"
18:47:23 | INFO | "  0:00:13.100165 <- Load TF HUB model time elapsed"
18:47:23 | INFO | "Getting embeddings in batches

  0%|          | 0/88 [00:00<?, ?it/s]

In [1]:
# NOTE(review): `LEGACY` is not defined anywhere in this notebook, so this cell
# raises NameError (see its recorded output). Presumably an intentional sentinel
# that stops "Run All" before the older cells that follow -- confirm.
LEGACY

NameError: name 'LEGACY' is not defined

In [17]:
# Release the objects from a previous run before starting a new one.
# Guarded so the cell also works when the names do not exist yet (fresh kernel,
# or a previous run that failed before assigning them) -- same pattern as the
# later legacy cells in this notebook.
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End any mlflow run that is still active, recording its status as KILLED
mlflow.end_run(status='KILLED')

# NOTE(review): the USE-multilingual call earlier in this notebook unpacks five
# return values while this call unpacks three -- confirm against the current
# signature of vectorize_text_to_embeddings before re-running.
# Variant: 'sklearn_acronyms_emoji' tokenizer, lowercased text
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,
    
    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=True,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:25:16 | INFO | "Start vectorize function"
07:25:16 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0725"
07:25:16 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:25:22 | INFO | "  0:00:05.708467 <- df_post time elapsed"
07:25:22 | INFO | "  (111669, 6) <- df_posts.shape"
07:25:22 | INFO | "Load comments df..."
07:25:29 | INFO | "  (757388, 6) <- df_comments shape"
07:25:29 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:25:30 | INFO | "  (638052, 6) <- updated df_comments shape"
07:25:30 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:25:30 | INFO | "Filtering posts for SIF training..."
07:25:30 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:25:30 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:25:30 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:25:30 | INFO | "31,790 <- df_pos

In [21]:
# Best-effort cleanup of the previous run's objects; the names may not exist yet
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End any mlflow run that is still active, recording its status as KILLED
mlflow.end_run(status='KILLED')

# Variant: 'gensim' tokenizer, lowercased text
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,

    # tokenization
    tokenize_function='gensim',
    tokenize_lowercase=True,

    # training-set filters
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=[
        'wixbros',
        'katjakrasavicenudes',
        'deutschetributes',
        'germannudes',
        'annitheduck',
        'germanonlyfans',
        'loredana',
        'nicoledobrikovof',
        'germansgonewild',
        'elisaalinenudes',
        'marialoeffler',
        'germanwomenandcouples',
    ],
)

07:45:47 | INFO | "Start vectorize function"
07:45:47 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0745"
07:45:47 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:45:52 | INFO | "  0:00:04.924321 <- df_post time elapsed"
07:45:52 | INFO | "  (111669, 6) <- df_posts.shape"
07:45:52 | INFO | "Load comments df..."
07:45:59 | INFO | "  (757388, 6) <- df_comments shape"
07:45:59 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:45:59 | INFO | "  (638052, 6) <- updated df_comments shape"
07:45:59 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:46:00 | INFO | "Filtering posts for SIF training..."
07:46:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:46:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:46:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:46:00 | INFO | "31,790 <- df_pos

In [22]:
# Best-effort cleanup of the previous run's objects; the names may not exist yet
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End any mlflow run that is still active, recording its status as KILLED
mlflow.end_run(status='KILLED')

# Variant: 'gensim' tokenizer, original casing kept (no lowercasing)
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,

    # tokenization
    tokenize_function='gensim',
    tokenize_lowercase=False,

    # training-set filters
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=[
        'wixbros',
        'katjakrasavicenudes',
        'deutschetributes',
        'germannudes',
        'annitheduck',
        'germanonlyfans',
        'loredana',
        'nicoledobrikovof',
        'germansgonewild',
        'elisaalinenudes',
        'marialoeffler',
        'germanwomenandcouples',
    ],
)

07:49:43 | INFO | "Start vectorize function"
07:49:43 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0749"
07:49:43 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:49:50 | INFO | "  0:00:06.091006 <- df_post time elapsed"
07:49:50 | INFO | "  (111669, 6) <- df_posts.shape"
07:49:50 | INFO | "Load comments df..."
07:49:58 | INFO | "  (757388, 6) <- df_comments shape"
07:49:58 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:49:58 | INFO | "  (638052, 6) <- updated df_comments shape"
07:49:58 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:50:00 | INFO | "Filtering posts for SIF training..."
07:50:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:50:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:50:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:50:00 | INFO | "31,790 <- df_pos

# Recover artifact from mlflow

In [1]:
# ID of the mlflow run whose logged artifacts we want to read back
run_id = 'aac3e007dfc2446790e25887adf287f6'
# Fetch that run from the tracking store. Requires `mlflow` to be imported in
# the current kernel; the recorded NameError comes from running this cell
# before the imports cell.
run = mlflow.get_run(run_id)

NameError: name 'mlflow' is not defined

In [2]:
# Preview the path of the d_ix_to_id CSV under the run's artifact URI
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

NameError: name 'run' is not defined

In [3]:
# Read the d_ix_to_id CSV from the run's artifact URI into a DataFrame,
# then show its shape and first rows as a quick sanity check
df_idx = pd.read_csv(f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv")
print(df_idx.shape)
df_idx.head()

NameError: name 'pd' is not defined