In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import polars as pl
import re
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer
from typing import Optional, List, Tuple

## Path of the raw Reddit subset to process.
## Alternative subset (swap the path to process it instead):
##   "../temp_data/raw_data_subset_gm.parquet"
DATA_PARQUET = "../temp_data/raw_data_subset_ups_fedex_lowes.parquet"

## Set embedding model
## https://huggingface.co/thenlper/gte-base
EMBEDDINGS_MODEL = "thenlper/gte-base"

## Maximum number of rows to load for a quick run; set to None to load the
## whole file
MAX_ROWS = 1000

## Load data from parquet (n_rows stops reading after MAX_ROWS rows instead of
## loading the full file and slicing it afterwards)
DATA_RAW = pl.read_parquet(DATA_PARQUET, n_rows=MAX_ROWS)

In [2]:
def _drop_rows(data: pl.DataFrame,
               matches: pl.Expr,
               description: str)->pl.DataFrame:
    '''
    Report how many rows satisfy `matches`, then return `data` without them.

    Parameters:
    -----------
        data: pl.DataFrame
            The dataframe to filter
        matches: pl.Expr
            Boolean expression selecting the rows to drop
        description: str
            Human readable description of the dropped rows, used in the
            printed report

    Returns:
    --------
        pl.DataFrame
            A new dataframe containing only the rows that do not match
    '''
    n_dropped = data.filter(matches).shape[0]
    print(f"Dropping {n_dropped} rows with {description}")
    return data.filter(~matches)


def preprocess_data(DATA_RAW: pl.DataFrame)->pl.DataFrame:
    '''
    Complete basic preprocessing tasks:
        - Drop rows whose reddit_text is null
        - Drop rows with comments|submissions that have been dropped|deleted
        - Drop rows with comments|submissions automatically generated by a bot
        - Drop rows with comments|submissions that are empty

    The number of rows removed by each step is printed.

    Parameters:
    -----------
        DATA_RAW: pl.DataFrame
            A dataframe loaded from the raw Aware data provided. It must
            contain a "reddit_text" column. It is not modified.

    Returns:
    --------
        pl.DataFrame
            A new dataframe holding only the rows that passed every filter
    '''
    reddit_text = pl.col("reddit_text")

    ## Drop null texts explicitly. A comparison against null evaluates to
    ## null and filter() discards such rows, so these rows were always
    ## removed by the first filter below -- but without being reported.
    ## filter() returns a new frame, so DATA_RAW itself is never modified.
    data_preprocessed = _drop_rows(DATA_RAW,
                                   reddit_text.is_null(),
                                   "reddit_text is null")

    ## Drop rows with comments|submissions that have been dropped|deleted
    for value in ["[deleted]", "[removed]"]:
        data_preprocessed = _drop_rows(data_preprocessed,
                                       reddit_text == value,
                                       f"reddit_text=='{value}'")

    ## Drop rows with comments|submissions automatically generated by a bot
    ## (literal=True: the notice is matched as plain text, not as a regex)
    pattern = "This has been removed for breaking the sub rule of"
    data_preprocessed = _drop_rows(
        data_preprocessed,
        reddit_text.str.contains(pattern, literal=True),
        f"reddit_text containing:'{pattern}'")

    ## Drop rows with comments|submissions that are empty
    data_preprocessed = _drop_rows(data_preprocessed,
                                   reddit_text == "",
                                   "reddit_text==''")

    return data_preprocessed

In [3]:
def chunk_preprocessed_data(data_preprocessed: pl.DataFrame,
                            chunk_size: int,
                            chunk_overlap_pct: float)->pl.DataFrame:
    """
    Given a pl.DataFrame with reddit data, break target data into chunks
    suitable for embedding.

    Parameters:
    -----------
    data_preprocessed: pl.DataFrame
        A dataframe containing the preprocessed data; its "reddit_text"
        column is the text that gets split. It is not modified.
    chunk_size: int
        The maximum size of any text chunk, measured in characters
        (the splitter measures length with len)
    chunk_overlap_pct: float
        The fraction (0.0 - 1.0, e.g. 0.2 for 20%) of chunk_size that
        adjacent text chunks should overlap

    Returns:
    --------
    pl.DataFrame
        A dataframe with one row per chunk and a new column:
            - "text_chunk": A chunk of text determined by the splitter

    Raises:
    -------
    ValueError
        If chunk_overlap_pct is not between 0 and 1
    """
    ## Fail early with a clear message, e.g. when 20 is passed instead of 0.2
    if not 0 <= chunk_overlap_pct <= 1:
        raise ValueError(
            "chunk_overlap_pct must be a fraction between 0 and 1, "
            f"got {chunk_overlap_pct}")

    text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=int(chunk_size*chunk_overlap_pct),
            length_function=len,
            is_separator_regex=False,
            strip_whitespace=True)

    ## Break longer texts into chunks. with_columns returns a new frame, so
    ## the input is left untouched. The explicit return_dtype keeps the
    ## column type stable instead of relying on inference from the data.
    data_chunked = data_preprocessed.with_columns(
        text_chunk=pl.col("reddit_text").map_elements(
            text_splitter.split_text,
            return_dtype=pl.List(pl.Utf8)))
    ## Explode the text chunks: one row per chunk
    data_chunked = data_chunked.explode("text_chunk")

    ## Report rows left without a chunk (exploding an empty list gives null)
    n_null_chunks = data_chunked["text_chunk"].null_count()
    print(f"There are {n_null_chunks} rows with text_chunk is null")
    ## Return the chunked dataframe
    return data_chunked

In [4]:
def vectorize_chunked_data(data_chunked: pl.DataFrame,
                           embeddings: Optional[HuggingFaceEmbeddings] = None
                           )->pl.DataFrame:
    '''
        Vectorize the chunked texts

        Parameters:
        -----------
            data_chunked: pl.DataFrame
                A dataframe with a "text_chunk" column holding the chunks of
                text to be embedded. It is not modified.
            embeddings: Optional[HuggingFaceEmbeddings]
                An already loaded embedding model to reuse. When None
                (the default) a new model is loaded from EMBEDDINGS_MODEL.

        Returns:
        --------
            pl.DataFrame
                A dataframe without the null chunks and with a new column:
                    - vector
                        Contains the vector representing the text_chunk
                        determined by the embedding model
    '''
    ## Set up model, unless the caller supplied one that is already loaded
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL)

    ## Remove null chunks (filter returns a new frame)
    is_null_chunk = pl.col("text_chunk").is_null()
    n_null_chunks = data_chunked.filter(is_null_chunk).shape[0]
    print(f"Dropping {n_null_chunks} rows with text_chunk is null")
    data_vectorized = data_chunked.filter(~is_null_chunk)

    ## Compute embeddings
    text_chunks = data_vectorized["text_chunk"].to_list()
    vectors = embeddings.embed_documents(text_chunks)

    ## Add embeddings to the dataframe. The explicit dtype keeps the column
    ## a list of floats even when there are no rows to infer it from.
    data_vectorized = data_vectorized.with_columns(
        pl.Series(
            name="vector",
            values=vectors,
            dtype=pl.List(pl.Float64)
        )
    )

    return data_vectorized

In [5]:
%%time
## Run the two model-free stages: filter out unusable rows, then split the
## remaining texts into chunks.
## chunk_size is counted in characters (the splitter is built with
## length_function=len); the overlap passed to the splitter is
## int(512*.2) = 102 characters.
data_preprocessed = preprocess_data(DATA_RAW=DATA_RAW)
data_chunked = chunk_preprocessed_data(data_preprocessed=data_preprocessed,
                                       chunk_size=512,
                                       chunk_overlap_pct=.2)

Dropping 3 rows with reddit_text=='[deleted]'
Dropping 2 rows with reddit_text=='[removed]'
Dropping 1 rows with reddit_text containing:'This has been removed for breaking the sub rule of'
Dropping 5 rows with reddit_text==''
There are 0 rows with text_chunk is null
CPU times: user 15.9 ms, sys: 11.9 ms, total: 27.8 ms
Wall time: 15.8 ms


In [6]:
## Show (rows, columns) after each stage: raw -> preprocessed -> chunked
print(DATA_RAW.shape, data_preprocessed.shape, data_chunked.shape, sep="\n")

(1000, 15)
(989, 15)
(1098, 16)


In [7]:
%%time
## Embed every text chunk. This stage loads and runs the embedding model, so
## it is timed in its own cell, separately from the preprocessing above.
data_vectorized = vectorize_chunked_data(data_chunked=data_chunked)

Dropping 0 rows with text_chunk is null
CPU times: user 3.86 s, sys: 1.88 s, total: 5.74 s
Wall time: 5.79 s


In [8]:
## Compared with data_chunked: one extra column ("vector"), and the same
## number of rows when no null chunk was dropped
print(data_vectorized.shape)

(1098, 17)


In [9]:
#data_vectorized.write_parquet("temp_data/reddit_subset_gm_vectorized.parquet")

In [10]:
## Module-level embedding model that query_results() uses to embed the query.
## It is built from the same EMBEDDINGS_MODEL as the one used in
## vectorize_chunked_data(), so query vectors and stored chunk vectors come
## from the same model.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL)

def query_results(query: str,
                  num_results: int,
                  data: Optional[pl.DataFrame] = None) -> None:
    '''
    Print the texts whose chunk vectors are closest to the query.

    The query is embedded with the module-level `embeddings` model and
    compared with every row's "vector" using the cosine distance. The
    "reddit_text" of the `num_results` closest rows is printed, closest
    first.

    Note: rows are text chunks, but the full "reddit_text" is printed, so a
    text that was split into several chunks may be printed more than once.

    Parameters:
    -----------
        query: str
            The question or phrase to search for
        num_results: int
            The number of results to print; must not be negative
        data: Optional[pl.DataFrame]
            A dataframe with "vector" and "reddit_text" columns. When None
            (the default) the module-level `data_vectorized` is used.

    Raises:
    -------
        ValueError
            If num_results is negative
    '''
    ## head() with a negative count means "all but the last n rows", which
    ## is never what a top-n query wants
    if num_results < 0:
        raise ValueError(f"num_results must be >= 0, got {num_results}")

    if data is None:
        data = data_vectorized

    query_vector = embeddings.embed_query(query)

    ## scipy's `cosine` is a distance (1 - cosine similarity):
    ## the smaller the value, the closer the chunk is to the query
    distances = [cosine(vector, query_vector) for vector in data["vector"]]

    ## with_columns returns a new frame, so `data` is left untouched
    ranked = data.with_columns(
        pl.Series(name="cosine_distance",
                  values=distances,
                  dtype=pl.Float64)
    ).sort("cosine_distance", descending=False)

    top_results = ranked.head(num_results)["reddit_text"]

    print(f"Query: {query}")
    separator = "-"*80
    for result in top_results:
        print(separator)
        print(result)
        print(separator)

In [11]:
%%time
## Example query: print the reddit_text of the 10 rows whose chunk vectors
## are closest to the question
query_results("How many days off do you get in a year?", 10)

Query: How many days off do you get in a year?
--------------------------------------------------------------------------------
It another bonus some people get to get during the year. Can be either 3 or 5 years in length to get the full payout.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Outlook and Teams don't tell a worker they have 3 hours to get a task done. Most people have at least a few meetings and other obligations throughout the day which require them to be aware of time and check the calendar. It's not just work but picking the kids up from soccer and putting them to bed. 

An extra workday per week is more than an hour and a half per day. An increase approaching 20%.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
145%  Enough to make people hap