# Purpose
This notebook runs the `vectorize_text_to_embeddings` function to:
- load fastText embeddings & create a uSIF model
- load post & comment text
- train a uSIF model
- convert the text into embeddings (at post or comment level)

Currently each job call runs one at a time, so I may run two notebooks simultaneously to get some jobs running in parallel.


# Notebook setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
from functools import partial
import os
import logging
from pathlib import Path
from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
    D_MODELS_CPU,
    process_text_for_fse,
    vectorize_text_with_fse,
)
from subclu.models.preprocess_text import TextPreprocessor, transform_and_tokenize_text

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Report the version of each key library used in this notebook (one entry per library)
print_lib_versions([mlflow, np, pd])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.19.5
mlflow		v: 1.16.0
pandas		v: 1.2.4


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's 'default' style sheet for all plots
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; presumably they configure
# log formatting and notebook display options -- see that module to confirm
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [5]:
# Initialize MLflow through the project's MlflowLogger wrapper, selecting the
# 'sqlite' tracking backend
mlf = MlflowLogger(tracking_uri='sqlite')

In [6]:
# Sanity check: show the tracking URI that MLflow is currently pointed at
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

## Get list of experiments with new function

In [9]:
# Quiet the SQLAlchemy engine logger: only records at WARNING level or above pass through
sqlalchemy_engine_logger = logging.getLogger('sqlalchemy.engine')
sqlalchemy_engine_logger.setLevel(logging.WARNING)

In [12]:
# Show the metadata (id, name, artifact location, lifecycle stage) of each experiment
mlf.list_experiment_meta()

[{'experiment_id': '0',
  'name': 'Default',
  'artifact_location': './mlruns/0',
  'lifecycle_stage': 'active'},
 {'experiment_id': '1',
  'name': 'fse_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/1',
  'lifecycle_stage': 'active'},
 {'experiment_id': '2',
  'name': 'fse_vectorize_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/2',
  'lifecycle_stage': 'active'},
 {'experiment_id': '3',
  'name': 'subreddit_description_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/3',
  'lifecycle_stage': 'active'},
 {'experiment_id': '4',
  'name': 'fse_vectorize_v1.1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/4',
  'lifecycle_stage': 'active'}]

# Call function to vectorize text

In [13]:
# Experiment name passed to every vectorize_text_to_embeddings call in this notebook
mlflow_experiment = 'fse_vectorize_v1.1'

In [16]:
# End the active MLflow run, if there is one, recording it as KILLED before
# starting a new job
mlflow.end_run(status='KILLED')

# Subreddits to leave out of the training set
excluded_subreddits = [
    'wixbros', 'katjakrasavicenudes',
    'deutschetributes', 'germannudes',
    'annitheduck', 'germanonlyfans',
    'loredana', 'nicoledobrikovof',
    'germansgonewild', 'elisaalinenudes',
    'marialoeffler', 'germanwomenandcouples',
]

# Job configuration: sklearn_acronyms_emoji tokenizer, original casing kept
vectorize_kwargs = dict(
    mlflow_experiment=mlflow_experiment,
    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=False,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=excluded_subreddits,
)
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(**vectorize_kwargs)

07:22:10 | INFO | "Start vectorize function"
07:22:10 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0722"
07:22:10 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:22:17 | INFO | "  0:00:06.316106 <- df_post time elapsed"
07:22:17 | INFO | "  (111669, 6) <- df_posts.shape"
07:22:17 | INFO | "Load comments df..."
07:22:25 | INFO | "  (757388, 6) <- df_comments shape"
07:22:25 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:22:25 | INFO | "  (638052, 6) <- updated df_comments shape"
07:22:25 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:22:26 | INFO | "Filtering posts for SIF training..."
07:22:26 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:22:26 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:22:26 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:22:26 | INFO | "31,790 <- df_pos

In [17]:
# Drop the previous job's outputs before starting the next one. Guard the
# `del` so this cell also runs on a fresh kernel or after an interrupted job,
# when these names do not exist yet.
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End the active MLflow run, if there is one, recording it as KILLED before
# starting a new job
mlflow.end_run(status='KILLED')

# Job configuration: sklearn_acronyms_emoji tokenizer, text lowercased
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment=mlflow_experiment,

    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=True,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    # Subreddits to leave out of the training set
    train_subreddits_to_exclude=['wixbros', 'katjakrasavicenudes',
                                 'deutschetributes', 'germannudes',
                                 'annitheduck', 'germanonlyfans',
                                 'loredana', 'nicoledobrikovof',
                                 'germansgonewild', 'elisaalinenudes',
                                 'marialoeffler', 'germanwomenandcouples',
                                ],
)

07:25:16 | INFO | "Start vectorize function"
07:25:16 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0725"
07:25:16 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:25:22 | INFO | "  0:00:05.708467 <- df_post time elapsed"
07:25:22 | INFO | "  (111669, 6) <- df_posts.shape"
07:25:22 | INFO | "Load comments df..."
07:25:29 | INFO | "  (757388, 6) <- df_comments shape"
07:25:29 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:25:30 | INFO | "  (638052, 6) <- updated df_comments shape"
07:25:30 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:25:30 | INFO | "Filtering posts for SIF training..."
07:25:30 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:25:30 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:25:30 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:25:30 | INFO | "31,790 <- df_pos

In [21]:
# Drop the previous job's outputs; they may not exist yet, in which case
# there is nothing to delete
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End the active MLflow run, if there is one, recording it as KILLED before
# starting a new job
mlflow.end_run(status='KILLED')

# Subreddits to leave out of the training set
excluded_subreddits = [
    'wixbros', 'katjakrasavicenudes',
    'deutschetributes', 'germannudes',
    'annitheduck', 'germanonlyfans',
    'loredana', 'nicoledobrikovof',
    'germansgonewild', 'elisaalinenudes',
    'marialoeffler', 'germanwomenandcouples',
]

# Job configuration: gensim tokenizer, text lowercased
vectorize_kwargs = dict(
    mlflow_experiment=mlflow_experiment,
    tokenize_function='gensim',
    tokenize_lowercase=True,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=excluded_subreddits,
)
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(**vectorize_kwargs)

07:45:47 | INFO | "Start vectorize function"
07:45:47 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0745"
07:45:47 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:45:52 | INFO | "  0:00:04.924321 <- df_post time elapsed"
07:45:52 | INFO | "  (111669, 6) <- df_posts.shape"
07:45:52 | INFO | "Load comments df..."
07:45:59 | INFO | "  (757388, 6) <- df_comments shape"
07:45:59 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:45:59 | INFO | "  (638052, 6) <- updated df_comments shape"
07:45:59 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:46:00 | INFO | "Filtering posts for SIF training..."
07:46:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:46:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:46:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:46:00 | INFO | "31,790 <- df_pos

In [22]:
# Drop the previous job's outputs; they may not exist yet, in which case
# there is nothing to delete
try:
    del model, df_posts, d_ix_to_id
except NameError:
    pass
gc.collect()

# End the active MLflow run, if there is one, recording it as KILLED before
# starting a new job
mlflow.end_run(status='KILLED')

# Subreddits to leave out of the training set
excluded_subreddits = [
    'wixbros', 'katjakrasavicenudes',
    'deutschetributes', 'germannudes',
    'annitheduck', 'germanonlyfans',
    'loredana', 'nicoledobrikovof',
    'germansgonewild', 'elisaalinenudes',
    'marialoeffler', 'germanwomenandcouples',
]

# Job configuration: gensim tokenizer, original casing kept
vectorize_kwargs = dict(
    mlflow_experiment=mlflow_experiment,
    tokenize_function='gensim',
    tokenize_lowercase=False,
    train_min_word_count=4,
    train_exclude_duplicated_docs=True,
    train_subreddits_to_exclude=excluded_subreddits,
)
model, df_posts, d_ix_to_id = vectorize_text_to_embeddings(**vectorize_kwargs)

07:49:43 | INFO | "Start vectorize function"
07:49:43 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0749"
07:49:43 | INFO | "Loading df_posts...
  gs://i18n-subreddit-clustering/posts/2021-05-19"
07:49:50 | INFO | "  0:00:06.091006 <- df_post time elapsed"
07:49:50 | INFO | "  (111669, 6) <- df_posts.shape"
07:49:50 | INFO | "Load comments df..."
07:49:58 | INFO | "  (757388, 6) <- df_comments shape"
07:49:58 | INFO | "Keep only comments that match posts IDs in df_posts..."
07:49:58 | INFO | "  (638052, 6) <- updated df_comments shape"
07:49:58 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
07:50:00 | INFO | "Filtering posts for SIF training..."
07:50:00 | INFO | "59,366 <- Exclude posts because of: subreddits filter"
07:50:00 | INFO | "30,537 <- Exclude posts because of: duplicated posts"
07:50:00 | INFO | "25,328 <- Exclude posts because of: minimum word count"
07:50:00 | INFO | "31,790 <- df_pos

# Recover artifact from mlflow

In [26]:
# Fetch a previously logged run from the tracking store by its run ID
run_id = 'aac3e007dfc2446790e25887adf287f6'
run = mlflow.get_run(run_id)

In [27]:
# Display the full path of the d_ix_to_id CSV under this run's artifact location
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

'gs://i18n-subreddit-clustering/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [29]:
# Read the d_ix_to_id CSV directly from the run's artifact location
d_ix_to_id_csv_uri = f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"
df_idx = pd.read_csv(d_ix_to_id_csv_uri)

print(df_idx.shape)
df_idx.head()

(111669, 2)


Unnamed: 0,training_index,post_id
0,0,t3_mkyj2k
1,1,t3_mkynzi
2,2,t3_mkyolv
3,3,t3_mkyp17
4,4,t3_mkyqrz
