# Purpose
This notebook runs the `vectorize_text_to_embeddings` function to:
- loading fastText embeddings & create a uSIF model
- load post & comment text
- train a uSIF model
- convert the text into embeddings (at post or comment level)

In this notebook I focus on using the METADATA/descriptions for each subreddit.

Hypothesis: if we can get some meaningful vectors from the subreddit descriptions, we might be able to add them as an input we can use for:
- subreddit embeddings (e.g., 10% subreddit meta + 80% post title+text + 10% comments)
- post-level embeddings (e.g., 10% subreddit meta + 90% post title+text)

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
from functools import partial
import os
import logging
from pathlib import Path
from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
    D_MODELS_CPU,
    process_text_for_fse,
    vectorize_text_with_fse,
)
from subclu.models.preprocess_text import TextPreprocessor, transform_and_tokenize_text

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


print_lib_versions([mlflow, np, pd])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')

In [5]:
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

## Get list of experiments with new function

In [8]:
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARN)

In [9]:
mlf.list_experiment_meta()

[{'experiment_id': '0',
  'name': 'Default',
  'artifact_location': './mlruns/0',
  'lifecycle_stage': 'active'},
 {'experiment_id': '1',
  'name': 'fse_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/1',
  'lifecycle_stage': 'active'},
 {'experiment_id': '2',
  'name': 'fse_vectorize_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/2',
  'lifecycle_stage': 'active'}]

# Inspect data for subreddit meta

In [10]:
%%time

bucket_ = 'i18n-subreddit-clustering'
subs_path = 'subreddits/2021-06-01'

df_subs = pd.read_parquet(
    path=f"gs://{bucket_}/{subs_path}",
)

CPU times: user 408 ms, sys: 96.3 ms, total: 505 ms
Wall time: 2.26 s


In [11]:
df_subs.shape

(196, 34)

In [55]:
# df_subs.head()

In [18]:
df_subs['subreddit_name_title_and_clean_descriptions_word_count'].describe()

count    196.000000
mean      88.826531
std      115.498609
min        3.000000
25%       16.750000
50%       37.000000
75%      105.250000
max      636.000000
Name: subreddit_name_title_and_clean_descriptions_word_count, dtype: float64

In [22]:
# df_subs[df_subs['subreddit_name_title_and_clean_descriptions_word_count'] < 10]

# Call function to vectorize text

In [None]:
# del model, df_posts, d_ix_to_id
gc.collect()

subs_path = 'subreddits/2021-06-01'

mlflow.end_run(status='KILLED')
model, df_subs, d_ix_to_id = vectorize_text_to_embeddings(
    tokenize_function='sklearn_acronyms_emoji',
    mlflow_experiment='subreddit_description_v1',
    tokenize_lowercase=True,
    train_min_word_count=4,
    subreddits_path=subs_path,
    posts_path=None,
    comments_path=None,
)

In [33]:
# del model, df_posts, d_ix_to_id
gc.collect()

subs_path = 'subreddits/2021-06-01'

mlflow.end_run(status='KILLED')
model, df_subs, d_ix_to_id = vectorize_text_to_embeddings(
    tokenize_function='sklearn_acronyms_emoji',
    mlflow_experiment='subreddit_description_v1',
    tokenize_lowercase=False,
    train_min_word_count=4,
    subreddits_path=subs_path,
    posts_path=None,
    comments_path=None,
)

05:32:13 | INFO | "Start vectorize function"
05:32:13 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0532"
05:32:13 | INFO | "Load subreddits df..."
05:32:14 | INFO | "  (196, 4) <- df_comments shape"
05:32:14 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
05:32:14 | INFO | "Filtering posts for SIF training..."
05:32:14 | INFO | "     0 <- Exclude posts because of: subreddits filter"
05:32:14 | INFO | "     0 <- Exclude posts because of: duplicated posts"
05:32:14 | INFO | "     4 <- Exclude posts because of: minimum word count"
05:32:14 | INFO | "   192 <- df_subs for training"
05:32:14 | INFO | "Converting df_train to fse format..."
05:32:14 | INFO | "  0:00:00.000328 <- Converting to fse time elapsed"
05:32:14 | INFO | "Logging training df to mlflow..."
05:32:14 | INFO | "Loading model fasttext_usif_de...
  with kwargs: {'lang_id': 'de', 'workers': 10, 'length': 11, 'lang_freq': 'de', 'verbose': 

In [35]:
gc.collect()

mlflow.end_run(status='KILLED')
model, df_subs, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment='subreddit_description_v1',
    tokenize_function='sklearn',
    tokenize_lowercase=True,
    train_min_word_count=4,
    subreddits_path=subs_path,
    posts_path=None,
    comments_path=None,
)

05:35:31 | INFO | "Start vectorize function"
05:35:31 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0535"
05:35:31 | INFO | "Load subreddits df..."
05:35:31 | INFO | "  (196, 4) <- df_comments shape"
05:35:31 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
05:35:32 | INFO | "Filtering posts for SIF training..."
05:35:32 | INFO | "     0 <- Exclude posts because of: subreddits filter"
05:35:32 | INFO | "     0 <- Exclude posts because of: duplicated posts"
05:35:32 | INFO | "     4 <- Exclude posts because of: minimum word count"
05:35:32 | INFO | "   192 <- df_subs for training"
05:35:32 | INFO | "Converting df_train to fse format..."
05:35:32 | INFO | "  0:00:00.000334 <- Converting to fse time elapsed"
05:35:32 | INFO | "Logging training df to mlflow..."
05:35:32 | INFO | "Loading model fasttext_usif_de...
  with kwargs: {'lang_id': 'de', 'workers': 10, 'length': 11, 'lang_freq': 'de', 'verbose': 

In [36]:
gc.collect()

mlflow.end_run(status='KILLED')
model, df_subs, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment='subreddit_description_v1',
    tokenize_function='sklearn_acronyms_emoji',
    tokenize_lowercase=False,
    train_min_word_count=7,
    subreddits_path=subs_path,
    posts_path=None,
    comments_path=None,
)

05:47:53 | INFO | "Start vectorize function"
05:47:53 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0547"
05:47:53 | INFO | "Load subreddits df..."
05:47:53 | INFO | "  (196, 4) <- df_comments shape"
05:47:53 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
05:47:54 | INFO | "Filtering posts for SIF training..."
05:47:54 | INFO | "     0 <- Exclude posts because of: subreddits filter"
05:47:54 | INFO | "     0 <- Exclude posts because of: duplicated posts"
05:47:54 | INFO | "    13 <- Exclude posts because of: minimum word count"
05:47:54 | INFO | "   183 <- df_subs for training"
05:47:54 | INFO | "Converting df_train to fse format..."
05:47:54 | INFO | "  0:00:00.000312 <- Converting to fse time elapsed"
05:47:54 | INFO | "Logging training df to mlflow..."
05:47:54 | INFO | "Loading model fasttext_usif_de...
  with kwargs: {'lang_id': 'de', 'workers': 10, 'length': 11, 'lang_freq': 'de', 'verbose': 

In [47]:
gc.collect()

mlflow.end_run(status='KILLED')
model, df_subs, d_ix_to_id = vectorize_text_to_embeddings(
    mlflow_experiment='subreddit_description_v1',
    tokenize_function='sklearn',
    tokenize_lowercase=False,
    train_min_word_count=7,
    subreddits_path=subs_path,
    posts_path=None,
    comments_path=None,
)

05:56:33 | INFO | "Start vectorize function"
05:56:33 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/fse/2021-06-02_0556"
05:56:33 | INFO | "Load subreddits df..."
05:56:34 | INFO | "  (196, 4) <- df_comments shape"
05:56:34 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
05:56:34 | INFO | "Filtering posts for SIF training..."
05:56:34 | INFO | "     0 <- Exclude posts because of: subreddits filter"
05:56:34 | INFO | "     0 <- Exclude posts because of: duplicated posts"
05:56:34 | INFO | "    13 <- Exclude posts because of: minimum word count"
05:56:34 | INFO | "   183 <- df_subs for training"
05:56:34 | INFO | "Converting df_train to fse format..."
05:56:34 | INFO | "  0:00:00.000345 <- Converting to fse time elapsed"
05:56:34 | INFO | "Logging training df to mlflow..."
05:56:34 | INFO | "Loading model fasttext_usif_de...
  with kwargs: {'lang_id': 'de', 'workers': 10, 'length': 11, 'lang_freq': 'de', 'verbose': 

# Recover artifact from mlflow

In [48]:
run_id = 'a6f09bcae7b147f693f6083b56ec3ad5'
run = mlflow.get_run(run_id)

In [49]:
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

'gs://i18n-subreddit-clustering/mlflow/mlruns/3/a6f09bcae7b147f693f6083b56ec3ad5/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [50]:
df_idx = pd.read_csv(f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv")
df_idx.head()

Unnamed: 0,training_index,subreddit_id
0,0,t5_22i0
1,1,t5_30305
2,2,t5_2s82y
3,3,t5_37k29
4,4,t5_2qi4z


In [51]:
%%time

df_vects = pd.read_parquet(f"{run.info.artifact_uri}/df_vect_subreddits_description/")

CPU times: user 81.6 ms, sys: 2.19 ms, total: 83.8 ms
Wall time: 608 ms


In [52]:
df_vects.shape

(196, 300)

In [53]:
df_vects.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 196 entries, ('de', 't5_22i0') to ('dotade', 't5_4d9b0q')
Columns: 300 entries, embeddings_0 to embeddings_299
dtypes: float32(300)
memory usage: 241.8+ KB


In [54]:
df_vects.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22,embeddings_23,embeddings_24,embeddings_25,embeddings_26,embeddings_27,embeddings_28,embeddings_29,...,embeddings_270,embeddings_271,embeddings_272,embeddings_273,embeddings_274,embeddings_275,embeddings_276,embeddings_277,embeddings_278,embeddings_279,embeddings_280,embeddings_281,embeddings_282,embeddings_283,embeddings_284,embeddings_285,embeddings_286,embeddings_287,embeddings_288,embeddings_289,embeddings_290,embeddings_291,embeddings_292,embeddings_293,embeddings_294,embeddings_295,embeddings_296,embeddings_297,embeddings_298,embeddings_299
subreddit_name,subreddit_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
de,t5_22i0,-0.068193,-0.140246,-0.078632,-0.088902,0.058947,0.04562,0.037167,-0.058531,-0.037315,-0.01535,-0.072647,0.005629,0.147903,0.005428,-0.056435,-0.011115,-0.079805,-0.013302,0.024932,0.001717,-0.066347,-0.018027,-0.039436,-0.015084,-0.025545,-0.039431,0.049099,-0.045878,-0.074431,-0.021248,...,0.015191,0.020865,0.057544,-0.034896,0.029612,0.048836,0.478561,-0.029383,0.054213,-0.006427,0.082874,-0.047048,0.031864,0.053346,-0.027506,-0.012726,0.027782,0.058283,-0.093523,0.033228,0.035452,0.04503,0.044162,0.014779,-0.02707,0.004733,-0.002979,-0.028784,-0.02658,2.7e-05
de_iama,t5_30305,-0.067955,-0.141612,-0.001929,-0.071472,-0.02935,0.088498,0.048055,-0.03485,-0.012829,-0.076761,-0.057956,0.016863,0.104513,-0.068526,-0.042201,-0.009282,-0.126241,-0.049793,0.015492,-0.107231,-0.038972,-0.005304,-0.007023,0.007038,-0.087264,0.000271,-0.04539,-0.025538,-0.025474,-0.056472,...,0.032816,0.080769,0.067391,-0.029824,0.046361,-0.005821,0.320294,0.033977,0.044636,-0.038056,0.134913,0.009774,0.05081,0.020896,-0.031264,0.027764,-0.012944,0.083818,-0.006974,0.120271,0.02058,0.002752,0.028324,0.071032,0.054481,0.017852,-0.003119,-0.028639,0.056255,-0.050546
bundesliga,t5_2s82y,-0.091431,-0.087854,-0.145448,-0.009487,0.088647,0.062016,-0.018871,-0.018094,-0.122689,0.063472,0.098198,-0.11066,0.090146,0.087363,-0.044413,0.021362,0.020865,-0.08161,0.098145,0.201054,-0.144961,0.050591,0.090852,0.070311,0.031639,-0.036719,0.233683,-0.061653,-0.068082,0.132007,...,-0.219609,-0.055264,0.033242,0.015594,-0.159257,-0.049292,0.629636,-0.066586,0.114068,0.11222,0.095717,0.020551,-0.155897,0.11329,0.044877,-0.064142,-0.122605,0.053964,-0.065005,0.000735,-0.011948,0.07061,0.049743,-0.055776,-0.090864,-0.042501,-0.092974,-0.002848,-0.010848,0.084419
ich_iel,t5_37k29,-0.035952,-0.060456,-0.148823,-0.05187,0.035436,0.035236,-0.006718,-0.063142,-0.043526,-0.11195,-0.023704,-0.049138,0.117564,-0.115526,-0.048981,-0.040843,-0.048785,-0.018597,-0.051052,-0.115204,-0.067233,-0.017151,0.089802,-0.048875,0.021613,0.010931,0.023022,0.05634,-0.008354,0.00425,...,0.056989,-0.010975,0.032579,0.040398,0.000713,0.025171,0.477503,0.02822,-0.028034,-0.075452,0.003443,0.038868,0.088912,0.032729,0.007014,0.032898,0.060034,-0.031061,-0.084744,-0.080818,0.05955,-0.07132,0.013347,0.012645,0.037299,-0.034162,-0.041637,-0.017311,0.018234,-0.026489
germany,t5_2qi4z,-0.109373,-0.060183,-0.221237,-0.129652,0.066976,0.051033,0.076763,-0.052196,-0.084627,0.099022,0.00962,0.010689,0.010767,-0.028676,0.002753,-0.000592,-0.1333,0.054518,-0.106768,0.139377,-0.112454,0.098173,0.053897,-0.035679,0.026762,-0.088102,0.107053,0.011507,-0.141544,-0.006828,...,-0.01956,0.026004,0.075463,-0.03307,-0.087544,0.025086,0.629335,-0.124546,0.097818,0.233379,0.140266,-0.052758,-0.100207,0.07297,-0.055256,-0.07269,-0.014565,0.071595,-0.043883,0.119549,0.017734,0.033778,0.100645,0.06488,-0.232661,-0.028845,-0.078855,-0.036558,-0.118363,-0.018238
