In [1]:
## Add this directory to the path and load our functions
import sys
sys.path.append("../src/")

import paware

import polars as pl
import os

# Handling the Raw Data
For the purposes of demonstration, we will work with a subset of the raw data that consists of posts in the following subreddits:

* `r/wholefoods`
* `r/TalesFromYourBank`
* `r/GeneralMotors`

## Loading the Data

The first step is to load the data.

In [2]:
## Read the raw demo dataset and show its (rows, columns) dimensions
raw_data_path = "../paw_demo/raw/raw_data.parquet"
df = pl.read_parquet(raw_data_path)
df.shape

(147773, 14)

Here we peek at the list of subreddits represented in the data:

In [3]:
## Distinct subreddit names present in the raw data
df.get_column("reddit_subreddit").unique()

reddit_subreddit
str
"""GeneralMotors"""
"""TalesFromYourB…"
"""wholefoods"""


And we can peek at the data to get a sense of its structure:

In [4]:
## Show five randomly chosen rows (unseeded, so the rows differ from run to run)
df.sample(n=5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""comment""","""2023-06-10T22:…","""jnqd263""","""t1_jnqd263""",1686449528,"""Da_Bulls_312""","""I got to sit i…","""/r/wholefoods/…",,,"""wholefoods""","""t3_145pvab""","""t3_145pvab""","""145pvab"""
"""comment""","""2023-08-24T12:…","""jxkkawb""","""t1_jxkkawb""",1692893821,"""Detroitbeardgu…","""It wouldn't hu…","""/r/wholefoods/…",,,"""wholefoods""","""t3_15zz8jt""","""t1_jxke3u9""","""15zz8jt"""
"""comment""","""2023-08-24T20:…","""jxmswe0""","""t1_jxmswe0""",1692925155,"""cheesenuggets2…","""Trigger Warnin…","""/r/TalesFromYo…",,,"""TalesFromYourB…","""t3_1607t9b""","""t1_jxluavb""","""1607t9b"""
"""comment""","""2023-03-30T20:…","""jech82k""","""t1_jech82k""",1680222210,"""StoreProfessio…","""Weed acid and …","""/r/wholefoods/…",,,"""wholefoods""","""t3_126g150""","""t3_126g150""","""126g150"""
"""comment""","""2023-07-08T21:…","""jr8017h""","""t1_jr8017h""",1688864468,"""Mariah0""","""They come in f…","""/r/wholefoods/…",,,"""wholefoods""","""t3_14ukqnf""","""t3_14ukqnf""","""14ukqnf"""


## Preprocessing

Because we are (outside of this demo) working with a gigantic dataset, we will first break it up into separate files that each contain the data from an individual subreddit. This ensures that we preserve all of the important structure (all of the replies, and replies to replies, will be contained in the same file) but we are less likely to overload our memory when handling the data.

In [5]:
## Directory that receives one parquet file per subreddit
subs_out_dir = "../paw_demo/raw_subs/"

## Make sure the output directory exists: write_parquet does not create
## missing parent directories, so a fresh checkout would otherwise fail here
os.makedirs(subs_out_dir, exist_ok=True)

## Get a list of the subreddits present in the dataset
subreddit_list = df["reddit_subreddit"].unique().to_list()

## Write the data for each individual subreddit to distinct parquet files
## (file names use the lower-cased subreddit name, e.g. "wholefoods_data.parquet")
for subreddit in subreddit_list:
    df_subreddit = df.filter(pl.col("reddit_subreddit") == subreddit)
    df_subreddit.write_parquet(f"{subs_out_dir}{subreddit.lower()}_data.parquet")

## 

# Embedding

Now, we are ready to get a vector embedding of the data. We'll use our custom module to implement this.

Here we will apply the General Text Embedding [thenlper/gte-base](https://huggingface.co/thenlper/gte-base) to generate vector representations of our data. The parameters we define are:

* `CONFIG_NAME`: A name to distinguish this configuration.
* `RAW_DATA_DIR`: The directory containing our raw data (in this case, the per-subreddit parquet files written during preprocessing).
* `EMBEDDED_SAVE_DIR`: The directory in which to save the files after embedding.
* `BATCH_SIZE`: We're using the `embed_documents()` function for Hugging Face Embeddings provided by LangChain ([docs](https://github.com/langchain-ai/langchain/blob/ea43c669f28f4564cc23cb9cc1356e363508e69d/libs/community/langchain_community/embeddings/huggingface.py#L81)) and this is quite computationally and memory intensive. Breaking the data into batches helps avoid overloading RAM.
* `CHUNK_WITH_METADATA`: Whether we should append information about the subreddit to the beginning of each text chunk before embedding.
* `CHUNK_OVERLAP_PCT`: The minimum overlap between adjacent chunks of text to be embedded, as a percent.

In [6]:
## The per-subreddit parquet files serve both as the tool's raw-data
## directory and as the directory it embeds from, so name the path once
raw_subs_dir = "../paw_demo/raw_subs/"

## Configure the embedding tool (parameters are described in the list above)
embedding_tool = paware.PawEmbedding(
    CONFIG_NAME="demo",
    RAW_DATA_DIR=raw_subs_dir,
    EMBEDDED_SAVE_DIR="../paw_demo/embedded_subs/",
    BATCH_SIZE=100_000,
    CHUNK_WITH_METADATA=True,
    CHUNK_SIZE=512,
    CHUNK_OVERLAP_PCT=0.2,
)

## Run the embedding over every subreddit file in the directory
embedding_tool.embed_from_subs(subs_dir=raw_subs_dir)

Loading and chunking...
Dropping 239 rows with reddit_text=='[deleted]'
Dropping 40 rows with reddit_text=='[removed]'
Dropping 0 rows that are likely bots or memes
Dropping 0 rows with 'reddit_text'=='' and 'aware_post_type'=='comment'
Dropping 0 rows with 'reddit_text'==' ' and 'aware_post_type'=='comment'
Replacing 'reddit_text' with 'reddit_title' in 167 rows with 'reddit_text'=='' or 'reddit_text'==' '


  data_chunked = data_chunked.with_columns(


... done loading and chunking.

Vectorizing and saving...
Working on batch 0, rows 0 through 37642...
Done with batch 0.
Combining parquet files...
... done vectorizing and saving.

Loading and chunking...
Dropping 1099 rows with reddit_text=='[deleted]'
Dropping 119 rows with reddit_text=='[removed]'
Dropping 67 rows that are likely bots or memes
Dropping 0 rows with 'reddit_text'=='' and 'aware_post_type'=='comment'
Dropping 0 rows with 'reddit_text'==' ' and 'aware_post_type'=='comment'
Replacing 'reddit_text' with 'reddit_title' in 235 rows with 'reddit_text'=='' or 'reddit_text'==' '


  data_chunked = data_chunked.with_columns(


... done loading and chunking.

Vectorizing and saving...
Working on batch 0, rows 0 through 41559...
Done with batch 0.
Combining parquet files...
... done vectorizing and saving.

Loading and chunking...
Dropping 700 rows with reddit_text=='[deleted]'
Dropping 164 rows with reddit_text=='[removed]'
Dropping 61 rows that are likely bots or memes
Dropping 0 rows with 'reddit_text'=='' and 'aware_post_type'=='comment'
Dropping 0 rows with 'reddit_text'==' ' and 'aware_post_type'=='comment'
Replacing 'reddit_text' with 'reddit_title' in 631 rows with 'reddit_text'=='' or 'reddit_text'==' '


  data_chunked = data_chunked.with_columns(


... done loading and chunking.

Vectorizing and saving...
Working on batch 0, rows 0 through 92742...
Done with batch 0.
Combining parquet files...
... done vectorizing and saving.



## Verifying the Results

We can load the data now, and verify that it includes the embedding data.

In [7]:
## Load the embedded output for one subreddit and preview its first rows
embedding_dir = "../paw_demo/embedded_subs/config_demo/"
embedded_file = f"{embedding_dir}vectorized_generalmotors_data_complete.parquet"
test = pl.read_parquet(embedded_file)

test.head(5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reply_ids,is_short_question,text_chunk,vector
str,str,str,str,i64,str,str,str,str,str,str,str,str,str,list[str],bool,str,list[f64]
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"[""t1_kgfznkx"", ""t1_kgg05vt""]",False,"""General Motors…","[0.018657, 0.037279, … 0.037011]"
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"[""t1_kgfznkx"", ""t1_kgg05vt""]",False,"""General Motors…","[0.019537, 0.014495, … 0.008045]"
"""comment""","""2024-01-05T10:…","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""",,False,"""General Motors…","[0.0408, 0.030147, … -0.004045]"
"""comment""","""2024-01-05T10:…","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaytheq…","""Search Socrate…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","[""t1_kgg0rjf"", ""t1_kgg79j0""]",False,"""General Motors…","[0.02871, 0.029767, … 0.022664]"
"""comment""","""2024-01-05T10:…","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud""",,False,"""General Motors…","[0.012741, 0.031151, … 0.01205]"


Notice that the `is_short_question` flag has been added as part of preprocessing, alongside the new `reply_ids`, `text_chunk`, and `vector` columns.