In [1]:
## Make the project's ../src/ directory importable, then load our helper modules.
## NOTE: the relative path is resolved against the kernel's working directory,
## so this assumes the notebook is run from its own folder.
import sys
sys.path.append("../src/")
from data_processing import preprocess, chunk, vectorize
from adding_metadata import replies

## Loading the data...
We are using Polars for general data handling. It is similar to Pandas, but optimized for larger datasets like ours.

In [2]:
import polars as pl
## Read the raw Reddit subset (parquet) into a Polars DataFrame
df = pl.read_parquet(source="../temp_data/raw_data_subset_gm.parquet")

In [3]:
## Peek at the first five raw rows
df.head(n=5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,
"""comment""","""2024-01-05T10:…","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud"""
"""comment""","""2024-01-05T10:…","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaytheq…","""Search Socrate…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud"""
"""comment""","""2024-01-05T10:…","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud"""
"""comment""","""2024-01-05T11:…","""kgg79j0""","""t1_kgg79j0""",1704471652,"""Jmill2000""","""Gotcha, thanks…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud"""


## Processing the data

Our processing pipeline is broken into the following steps:
* Preprocessing:
    * Dropping rows with deleted or removed posts
    * Dropping rows that appear to be bots or memes
    * Replacing empty submission texts with their titles
* Chunking (with or without metadata):
    * Breaks the `reddit_text` column into smaller chunks that are small enough to be handled by our embedding model later. A new column, `text_chunk`, is created to hold these chunks. Each resulting row still contains all of the metadata (including the original text) associated with the `reddit_text` that the chunk belongs to.
    * There are two functions that do this; both take the same parameters: `data_preprocessed`, `chunk_size`, and `chunk_overlap_pct`:
        * `chunk_preprocessed_data`
            * regular chunking, no added metadata
        * `chunk_preprocessed_data_with_subreddit`
            * chunks are made slightly smaller, and metadata regarding the subreddit is prepended to the start of each chunk.
* Vectorizing (not shown below, see the notebook: `vdb_demo_raw_data_to_vdb.ipynb`)
    * Text chunks are tokenized and embedded using a sentence transformer model. The result is a dataframe with a new `vector` column.

In [4]:
## Preprocess the raw frame; the helper prints how many rows each rule drops or rewrites
data_preprocessed = preprocess.preprocess_data(df)
## Show the first five rows of the cleaned frame
data_preprocessed.head(n=5)

Dropping 1099 rows with reddit_text=='[deleted]'
Dropping 119 rows with reddit_text=='[removed]'
Dropping 67 rows that are likely bots or memes
Dropping 0 rows with 'reddit_text'=='' and 'aware_post_type'=='comment'
Dropping 0 rows with 'reddit_text'==' ' and 'aware_post_type'=='comment'
Replacing 'reddit_text' with 'reddit_title' in 235 rows with 'reddit_text'=='' or 'reddit_text'==' '


aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission
str,str,str,str,i64,str,str,str,str,str,str,str,str,str
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,
"""comment""","""2024-01-05T10:…","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud"""
"""comment""","""2024-01-05T10:…","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaytheq…","""Search Socrate…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud"""
"""comment""","""2024-01-05T10:…","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud"""
"""comment""","""2024-01-05T11:…","""kgg79j0""","""t1_kgg79j0""",1704471652,"""Jmill2000""","""Gotcha, thanks…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud"""


In [5]:
## Plain chunking: split `reddit_text` into `text_chunk` rows, no metadata added
data_chunked = chunk.chunk_preprocessed_data(
    data_preprocessed=data_preprocessed,
    chunk_size=512,
    chunk_overlap_pct=0.2,
)
## Long posts now appear once per chunk (same metadata, different `text_chunk`)
data_chunked.head(n=5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,text_chunk
str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"""So I know GM m…"
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"""I think that’s…"
"""comment""","""2024-01-05T10:…","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","""Yea, that’s a …"
"""comment""","""2024-01-05T10:…","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaytheq…","""Search Socrate…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","""Search Socrate…"
"""comment""","""2024-01-05T10:…","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud""","""Only works if …"


In [6]:
## Widen the string display limit so the full text and its chunk can be compared side by side.
## NOTE: this is a global Polars display setting and stays in effect for later cells.
pl.Config.set_fmt_str_lengths(3000)
## Draw five random rows; the fixed seed keeps the displayed rows identical
## across "Restart Kernel -> Run All" so the notebook output is reproducible.
data_chunked[["reddit_text","text_chunk"]].sample(5, seed=42)

reddit_text,text_chunk
str,str
"""Any thoughts on if GM will offer UAW workers a buyout this year as well? It's also a contract year.""","""Any thoughts on if GM will offer UAW workers a buyout this year as well? It's also a contract year."""
"""hope that’s sarcasm.. we’d be in an even worse shape if he was still here""","""hope that’s sarcasm.. we’d be in an even worse shape if he was still here"""
"""You’re asking people to put all their eggs in one basket then. The employees of Enron did that, they lost literally everything, forced to delay retirement 10+ years. It is terrible practice to invest in your employer, because you can end up with no assets or job at the same time if they implode. The big automakers have imploded before, look at 2008, where all except Ford needed a bailout because of the credit crunch - politicians even did cash for clunkers, a terrible program, to bail them out.""","""You’re asking people to put all their eggs in one basket then. The employees of Enron did that, they lost literally everything, forced to delay retirement 10+ years. It is terrible practice to invest in your employer, because you can end up with no assets or job at the same time if they implode. The big automakers have imploded before, look at 2008, where all except Ford needed a bailout because of the credit crunch - politicians even did cash for clunkers, a terrible program, to bail them out."""
"""That’s not a great product if it costs so much that you need exploited labor to be profitable. I’m tired of subsidizing Fortune 100 companies bottom line.""","""That’s not a great product if it costs so much that you need exploited labor to be profitable. I’m tired of subsidizing Fortune 100 companies bottom line."""
"""Plant Medical, I think some Plant Office staff, I think some Engineers, I know I talked to one guy at Ford who was Salary UAW and in Finance but they kept screwing with his department, and I’m sure there’s lots more.""","""Plant Medical, I think some Plant Office staff, I think some Engineers, I know I talked to one guy at Ford who was Salary UAW and in Finance but they kept screwing with his department, and I’m sure there’s lots more."""


In [7]:
## Chunking with subreddit metadata added to the start of every chunk
data_chunked_with_metadata = chunk.chunk_preprocessed_data_with_subreddit(
    data_preprocessed=data_preprocessed,
    chunk_size=512,
    chunk_overlap_pct=0.2,
)

## Shorten the string display limit again so the wide preview table stays readable
pl.Config.set_fmt_str_lengths(100)
data_chunked_with_metadata.head(n=5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,text_chunk
str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str
"""submission""","""2024-01-05T10:15:42""","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM makes a big deal about not allowing employees to provide references. I don’t have many…","""/r/GeneralMotors/comments/18z96ud/references/""","""References""","""https://www.reddit.com/r/GeneralMotors/comments/18z96ud/references/""","""GeneralMotors""",,,,"""General Motors, GM So I know GM makes a big deal about not allowing employees to provide reference…"
"""submission""","""2024-01-05T10:15:42""","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM makes a big deal about not allowing employees to provide references. I don’t have many…","""/r/GeneralMotors/comments/18z96ud/references/""","""References""","""https://www.reddit.com/r/GeneralMotors/comments/18z96ud/references/""","""GeneralMotors""",,,,"""General Motors, GM I think that’s all GM can provide, but I’m not sure what the number/email is to…"
"""comment""","""2024-01-05T10:36:12""","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a load of shit by SLT btw. I get no references in LinkedIn to some extent but for inter…","""/r/GeneralMotors/comments/18z96ud/references/kgfznkx/""",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","""General Motors, GM Yea, that’s a load of shit by SLT btw. I get no references in LinkedIn to some …"
"""comment""","""2024-01-05T10:39:14""","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaythequeen""","""Search Socrates for “the work number”. That is the employment verification service GM uses for emp…","""/r/GeneralMotors/comments/18z96ud/references/kgg05vt/""",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","""General Motors, GM Search Socrates for “the work number”. That is the employment verification ser…"
"""comment""","""2024-01-05T10:42:50""","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if you are in the USA""","""/r/GeneralMotors/comments/18z96ud/references/kgg0rjf/""",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud""","""General Motors, GM Only works if you are in the USA"""


In [8]:
## Raise the string display limit so whole texts and chunks are visible
pl.Config.set_fmt_str_lengths(3000)
## Original text next to its metadata-prefixed chunk (Polars abbreviates the middle rows)
data_chunked_with_metadata.select(["reddit_text", "text_chunk"])

reddit_text,text_chunk
str,str
"""So I know GM makes a big deal about not allowing employees to provide references. I don’t have many people who no longer work with GM who I worked with close enough to provide a reference. Since I’ve only ever worked for GM, besides internships, I’d like to at least be able to list something for references. Is there an HR number or email I am able to add to at least verify employment. I think that’s all GM can provide, but I’m not sure what the number/email is to do that. Does anyone know or have any advice? Thanks""","""General Motors, GM So I know GM makes a big deal about not allowing employees to provide references. I don’t have many people who no longer work with GM who I worked with close enough to provide a reference. Since I’ve only ever worked for GM, besides internships, I’d like to at least be able to list something for references. Is there an HR number or email I am able to add to at least verify employment."""
"""So I know GM makes a big deal about not allowing employees to provide references. I don’t have many people who no longer work with GM who I worked with close enough to provide a reference. Since I’ve only ever worked for GM, besides internships, I’d like to at least be able to list something for references. Is there an HR number or email I am able to add to at least verify employment. I think that’s all GM can provide, but I’m not sure what the number/email is to do that. Does anyone know or have any advice? Thanks""","""General Motors, GM I think that’s all GM can provide, but I’m not sure what the number/email is to do that. Does anyone know or have any advice? Thanks"""
"""Yea, that’s a load of shit by SLT btw. I get no references in LinkedIn to some extent but for interviews outside etc… what they say should be illegal frankly""","""General Motors, GM Yea, that’s a load of shit by SLT btw. I get no references in LinkedIn to some extent but for interviews outside etc… what they say should be illegal frankly"""
"""Search Socrates for “the work number”. That is the employment verification service GM uses for employees.""","""General Motors, GM Search Socrates for “the work number”. That is the employment verification service GM uses for employees."""
"""Only works if you are in the USA""","""General Motors, GM Only works if you are in the USA"""
…,…
"""Would love for that to be true, but was he really the only higher-up pushing for RTO?""","""General Motors, GM Would love for that to be true, but was he really the only higher-up pushing for RTO?"""
"""lol. Mgmt and share holders already know. What you think is Gm verse. It is all ecosystem of no code tools based services for embedded software Sellers in market will use AI and no code tools to provide same that current chaps do in GM""","""General Motors, GM lol. Mgmt and share holders already know. What you think is Gm verse. It is all ecosystem of no code tools based services for embedded software Sellers in market will use AI and no code tools to provide same that current chaps do in GM"""
"""Baby those days are gone when complexity was on class diagram or in api design interfaces or protocol stacks code U refine the requirements with good prompts. Bingo here come the tested code. Use memory safe new language with framework of flexible api. AI will put application in minutes""","""General Motors, GM Baby those days are gone when complexity was on class diagram or in api design interfaces or protocol stacks code U refine the requirements with good prompts. Bingo here come the tested code. Use memory safe new language with framework of flexible api. AI will put application in minutes"""
"""This shit just sounds like tech word salad from a bad LLM lol.""","""General Motors, GM This shit just sounds like tech word salad from a bad LLM lol."""
