# Purpose

Apply the new `TextPreprocessor` class to preprocess text before going through tokenizing &/or vectorizing.

We'll be using these data sets by different models/jobs, so run this notebook once and read it multiple times (instead of running it each time it's needed).


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os
import logging
from pathlib import Path
from pprint import pprint

import mlflow

import numpy as np
import pandas as pd

from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
    D_MODELS_CPU,
    D_CUSTOM_SPLIT,
)
from subclu.models.preprocess_text import TextPreprocessor

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


print_lib_versions([mlflow, np, pd])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Load data


In [36]:
%%time

project_id='data-science-prod'
bucket_name = 'i18n-subreddit-clustering'
folder_posts = 'posts/2021-05-19'
folder_comments = 'comments/2021-05-19'

l_cols_posts = [
    # IDs
    'subreddit_name',
    'subreddit_id',
    'post_id',
#     'user_id',
#     'thing_type',
    
    # Meta
#     'submit_date',
#     'removed',
#     'upvotes',
#     'successful',
#     'app_name',
#     'combined_topic_and_rating',
#     'post_type',
#     'post_nsfw',
#     'geolocation_country_code',
    
    # Language & text content
#     'post_url',
#     'language',
#     'probability',
#     'weighted_language',
#     'weighted_language_probability',
#     'text_len',
    'text_word_count',
#     'post_url_for_embeddings',
    'text'
]

df_posts = pd.read_parquet(path=f"gs://{bucket_name}/{folder_posts}",
                           columns=l_cols_posts)

CPU times: user 470 ms, sys: 95.6 ms, total: 565 ms
Wall time: 6.05 s


In [5]:
df_posts.shape

(111669, 4)

In [14]:
assert len(df_posts) == df_posts['post_id'].nunique()

In [37]:
%%time

l_cols_comments = [
    # IDs
    'subreddit_name',
    'subreddit_id',
    'post_id',
    'comment_id',
    # 'user_id',
    
    # Comment & user meta
#     'thing_type',
#     'submit_date',
#     'removed',
#     'upvotes',
#     'successful',
#     'app_name',
#     'post_type',
#     'post_nsfw',
#     'geolocation_country_code',
#     'subreddit_geo_country_code',
#     'combined_topic',
#     'combined_topic_and_rating',
#     'rating',
#     'rating_version',
    
    # Text & language meta
#     'language',
#     'probability',
#     'weighted_language',
#     'weighted_language_probability',
#     'comment_text_len',
    'comment_text_word_count',
    'comment_body_text',
]

df_comments = pd.read_parquet(path=f"gs://{bucket_name}/{folder_comments}",
                              columns=l_cols_comments
                             )

CPU times: user 1.61 s, sys: 425 ms, total: 2.04 s
Wall time: 7.95 s


In [38]:
assert len(df_comments) == df_comments['comment_id'].nunique()

# Test `TextPreprocessor`

## Visually check output

In [81]:
display_text = False

l_nsfw_subs_sample = [
    'wixbros', 'katjakrasavicenudes',
    'deutschetributes', 'germannudes',
    'loredana', 'lucycat', 'emmyruss', 'elisaalinenudes',
    'germanonlyfans', 'germansgonewild',
]

mask_ignore_some_nsfw = ~df_posts['subreddit_name'].isin(l_nsfw_subs_sample)
mask_word_count = df_posts['text_word_count'] >= 3

In [82]:
if display_text:
    df_posts[mask_ignore_some_nsfw & mask_word_count].head(10)

In [83]:
mask_ignore_some_nsfw_c = ~df_comments['subreddit_name'].isin(l_nsfw_subs_sample)
mask_word_count_c = df_comments['comment_text_word_count'] >= 3

if display_text:
    df_comments[mask_ignore_some_nsfw_c & mask_word_count_c].head(10)

In [84]:
if display_text:
    df_comments[mask_ignore_some_nsfw_c & mask_word_count_c].iloc[45:55]

In [85]:
%%time

txtp = TextPreprocessor(lowercase=True, remove_digits=True, return_fse_format=False, verbose=False)

if display_text:
    display(txtp.transform(df_posts[mask_ignore_some_nsfw & mask_word_count]['text'].head(7)))
    print('\n', txtp.transform(df_comments[mask_ignore_some_nsfw_c & mask_word_count_c]['comment_body_text'].head(7)))

CPU times: user 11 µs, sys: 2 µs, total: 13 µs
Wall time: 18.4 µs


In [86]:
%%time

txtp = TextPreprocessor(lowercase=True, remove_digits=False, return_fse_format=False, verbose=False)

if display_text:
    display(txtp.transform(df_posts[mask_ignore_some_nsfw & mask_word_count]['text'].head(7)))
    print('\n', txtp.transform(df_comments[mask_ignore_some_nsfw_c & mask_word_count_c]['comment_body_text'].head(7)))

CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 18.8 µs


In [59]:
%%time

txtp = TextPreprocessor(lowercase=False, remove_digits=True, return_fse_format=False, verbose=False)

if display_text:
    display(txtp.transform(df_posts[mask_ignore_some_nsfw & mask_word_count]['text'].head(7)))
    print('\n', txtp.transform(df_comments[mask_ignore_some_nsfw_c & mask_word_count_c]['comment_body_text'].head(7)))

CPU times: user 0 ns, sys: 11 µs, total: 11 µs
Wall time: 15.7 µs


# Save processed text to new location

In [72]:
f"gs://{bucket_name}/{folder_posts}"
col_text_post = 'text'
col_text_comment = 'comment_body_text'


In [67]:
d_map_paths_to_params = {
    'lowercase_and_remove_digits': {
        'lowercase': True, 'remove_digits': True
    },
    'lowercase': {
        'lowercase': True, 'remove_digits': False
    },
    'remove_digits': {
        'lowercase': False, 'remove_digits': True
    },
}

In [77]:
for folder_, params in d_map_paths_to_params.items():
    logging.info(folder_)
    new_path_posts = f"gs://{bucket_name}/{folder_posts}_{folder_}/df.parquet"
    new_path_comments = f"gs://{bucket_name}/{folder_comments}_{folder_}/df.parquet"
    
    txtp = TextPreprocessor(lowercase=params['lowercase'], remove_digits=params['remove_digits'],
                            return_fse_format=False, verbose=False)
    
    logging.info(new_path_posts)
    (
        df_posts
        .assign(**{col_text_post: txtp.transform(df_posts[col_text_post])})
        .to_parquet(new_path_posts)
    )
    logging.info(new_path_comments)
    (
        df_comments
        .assign(**{col_text_comment: txtp.transform(df_comments[col_text_comment])})
        .to_parquet(new_path_comments)
    )

16:22:24 | INFO | "lowercase_and_remove_digits"
16:22:24 | INFO | "gs://i18n-subreddit-clustering/posts/2021-05-19/lowercase_and_remove_digits/df.parquet"
16:22:25 | INFO | "gs://i18n-subreddit-clustering/comments/2021-05-19/lowercase_and_remove_digits/df.parquet"
16:22:31 | INFO | "lowercase"
16:22:31 | INFO | "gs://i18n-subreddit-clustering/posts/2021-05-19/lowercase/df.parquet"
16:22:32 | INFO | "gs://i18n-subreddit-clustering/comments/2021-05-19/lowercase/df.parquet"
16:22:37 | INFO | "remove_digits"
16:22:37 | INFO | "gs://i18n-subreddit-clustering/posts/2021-05-19/remove_digits/df.parquet"
16:22:38 | INFO | "gs://i18n-subreddit-clustering/comments/2021-05-19/remove_digits/df.parquet"


# Reload data to make sure process was successful

In [87]:
display_text = False

for folder_, params in d_map_paths_to_params.items():
    logging.info(folder_)
    new_path_posts = f"gs://{bucket_name}/{folder_posts}_{folder_}/df.parquet"
    new_path_comments = f"gs://{bucket_name}/{folder_comments}_{folder_}/df.parquet"
    
    if display_text:
        logging.info(new_path_posts)
        display(
            pd.read_parquet(new_path_posts)
            .head(10)
        )
        logging.info(new_path_comments)
        display(
            pd.read_parquet(new_path_comments)
            .head(10)
        )

16:34:11 | INFO | "lowercase_and_remove_digits"
16:34:11 | INFO | "lowercase"
16:34:11 | INFO | "remove_digits"
