In [1]:
## Add this directory to the path and load our functions
import sys
sys.path.append("../src/")
from data_processing import preprocess, chunk, vectorize

import polars as pl

## Define annd load the embdding model
from langchain_community.embeddings import HuggingFaceEmbeddings
model = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

In [2]:
%%time
## Load raw data
DATA_RAW = pl.read_parquet("../temp_data/raw_data_subset_gm.parquet")
## Precprocess raw data
data_preprocessed = preprocess.preprocess_data(DATA_RAW=DATA_RAW)
## Chunk preproccessed data
data_chunked = chunk.chunk_preprocessed_data(
    data_preprocessed=data_preprocessed, 
    chunk_size=512, 
    chunk_overlap_pct=.2)

Dropping 1099 rows with reddit_text=='[deleted]'
Dropping 119 rows with reddit_text=='[removed]'
Dropping 79 rows with reddit_text containing:'This has been removed for breaking the sub rule of'
Dropping 235 rows with reddit_text==''


In [3]:
data_chunked.shape

(40841, 15)

In [4]:
%%time
## Vectorize data in batchs and save the resulting dataframes
vectorize.batch_vectorize_and_save(
    data_chunked=data_chunked,
    batch_size=5000,
    save_dir="../temp_data/vectorized_gm_batches/",
    file_prefix="vectorized_gm_data_"
)

Working on batch 0, rows 0 through 5000...
Done with batch 0.
Working on batch 1, rows 5000 through 10000...
Done with batch 1.
Working on batch 2, rows 10000 through 15000...
Done with batch 2.
Working on batch 3, rows 15000 through 20000...
Done with batch 3.
Working on batch 4, rows 20000 through 25000...
Done with batch 4.
Working on batch 5, rows 25000 through 30000...
Done with batch 5.
Working on batch 6, rows 30000 through 35000...
Done with batch 6.
Working on batch 7, rows 35000 through 40000...
Done with batch 7.
Working on batch 8, rows 40000 through 40841...
Done with batch 8.


In [5]:
## Look and see which files we just created
import os
os.listdir("../temp_data/vectorized_gm_batches/")

['vectorized_gm_data_pt2.parquet',
 'vectorized_gm_data_pt3.parquet',
 'vectorized_gm_data_pt8.parquet',
 'vectorized_gm_data_pt1.parquet',
 'vectorized_gm_data_pt0.parquet',
 'vectorized_gm_data_pt5.parquet',
 'vectorized_gm_data_pt4.parquet',
 'vectorized_gm_data_pt6.parquet',
 'vectorized_gm_data_pt7.parquet']

In [8]:
import lancedb

## Initialize a database
db = lancedb.connect("../temp_data/gm_lancedb")

In [9]:
%%time
## Load the first data file
data = pl.read_parquet(
    "../temp_data/vectorized_gm_batches/vectorized_gm_data_pt0.parquet")

## Create a table in the database based on the first file
table = db.create_table("gm_table", data=data)

CPU times: user 131 ms, sys: 24 ms, total: 155 ms
Wall time: 163 ms


In [10]:
## Load the remaining data into the table, one file at a time
for i in range(1,9):
    data = pl.read_parquet(
        f"../temp_data/vectorized_gm_batches/vectorized_gm_data_pt{i}.parquet")
    table.add(data)

In [11]:
%%time
## Embed a query
query_text = "What are the biggest benefits of working for GM?"
query = model.embed_query(query_text)

## Perform similarity search on database using KNN
result = table.search(query).limit(15).to_polars()

## Display the results
pl.Config.set_tbl_rows(15) ## Display up to 15 rows
pl.Config.set_fmt_str_lengths(3000) ## Display up to 3000 characters per column
result[["reddit_subreddit","reddit_text","text_chunk"]]

CPU times: user 331 ms, sys: 101 ms, total: 432 ms
Wall time: 363 ms


reddit_subreddit,reddit_text,text_chunk
str,str,str
"""GeneralMotors""","""If you work at GM most of the benefits that make this job worthwhile come from unions demanding them""","""If you work at GM most of the benefits that make this job worthwhile come from unions demanding them"""
"""GeneralMotors""","""In general GM has pretty good benefits, a good work life balance, and is on the lower side for base pay. You will have to figure out for yourself how much things like different health care plans, work life balance, and time off are worth. For me the reason I chose to look for a new position and leave GM was the lack of learning on the job and lack of mentorship inside my team at the time. But that is entirely team based. If I was on a different team there is a likelihood I would not have looked and left for a new job. Ultimately I'm happy for the way my experience at GM shaped my career priorities and for the time I was there. I'm also glad I got out when I did as I was not in a good position to grow like I wanted to.""","""In general GM has pretty good benefits, a good work life balance, and is on the lower side for base pay. You will have to figure out for yourself how much things like different health care plans, work life balance, and time off are worth."""
"""GeneralMotors""","""As a former employee who has moved on from the company I can say the benefits at GM are pretty good and are a good reason to stay. And the company is usually (but not always) good at making cost of living increases. With good benefits and salary increases its pretty easy to get comfortable at GM.""","""As a former employee who has moved on from the company I can say the benefits at GM are pretty good and are a good reason to stay. And the company is usually (but not always) good at making cost of living increases. With good benefits and salary increases its pretty easy to get comfortable at GM."""
"""GeneralMotors""","""I do agree that GM's overall benefits are superior. A couple things to note is that the other company is offering $2k more in tuition assistance than GM if I do grad school, which is what I would pursue (if I need to) since I'm almost done with my undergrad. GM's vacation policy and paternity leave is better, but do fathers get the 12 weeks too or is that for mothers? The other company has the same matching policy except they don't give you 4% of your salary for free even if you don't personally contribute. The other company doesn't have a vesting period while GM has a 3 year vesting period. The main things the other company brings to the table are additional bonuses such as a $10k sign on bonus as well as $15k in stocks and a sizeable lump sump for relocation, on top of around $7k more in base pay. They also have a similar 10% bonus system where they very often hit their 200% target compared to GM.""","""I do agree that GM's overall benefits are superior."""
"""GeneralMotors""","""What makes GM better? Better work life balance?""","""What makes GM better? Better work life balance?"""
"""GeneralMotors""","""GM has one of the best benefits out there. Look at the 1500 for life steps program. GM will give you 1500 just to get your physical done. The GM culture and holidays we get beats most companies. Did you know we get a day off just to vote? Primary and secondary elections? The shutdown for Christmas is a week. Free vacation. And did you know GM gives 12 weeks paid paternity leave? All of those summed up will be close or even more than the other company. Also GM gives you 4% on 401k and match of what you put in. It is 12% max i think. That is also huge. Sharing these benefits are not secret and is publix info if you dig it up. So what is the other company bringing to the table? Also GM will pay for your school if you were to obtain a masters degree.""","""GM has one of the best benefits out there. Look at the 1500 for life steps program. GM will give you 1500 just to get your physical done. The GM culture and holidays we get beats most companies. Did you know we get a day off just to vote? Primary and secondary elections? The shutdown for Christmas is a week. Free vacation. And did you know GM gives 12 weeks paid paternity leave? All of those summed up will be close or even more than the other company. Also GM gives you 4% on 401k and match of what you put"""
"""GeneralMotors""","""It does help go further at GM. That's a fact""","""It does help go further at GM. That's a fact"""
"""GeneralMotors""","""As an outsider, wouldn’t someone who works GM be qualified to get a solid job elsewhere? Some roles might not even have to stay in the automotive industry. I get GM has solid benefits and people still don’t want to get fired through/""","""As an outsider, wouldn’t someone who works GM be qualified to get a solid job elsewhere? Some roles might not even have to stay in the automotive industry. I get GM has solid benefits and people still don’t want to get fired through/"""
"""GeneralMotors""","""Do you work for GM?""","""Do you work for GM?"""
"""GeneralMotors""","""Do you work for GM?""","""Do you work for GM?"""
