This notebook shows how we could use LangChain's `SemanticChunker` to split text data (here, interview transcripts) into chunks.

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

import matplotlib.pyplot as plt
import pandas as pd

from dsp_interview_transcripts import PROJECT_DIR
from dsp_interview_transcripts.utils.data_cleaning import clean_data, convert_timestamp

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence that marks the end of the bot's introductory preamble
target_sentence = "Are these instructions clear or do you need any further clarification?"

# Sentence-embedding model used to find that sentence among the bot messages
small_model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# Encode as a one-item list (not a bare string) because the result is handed
# directly to cosine_similarity in remove_preamble
target_embedding = small_model.encode([target_sentence])

# Function to process each conversation
def remove_preamble(df, target_embedding=target_embedding, model=small_model):
    """Drop the introductory preamble from a single conversation.

    The BOT message whose embedding is most similar (cosine similarity) to
    `target_embedding` is treated as the end of the preamble. Every row whose
    `timestamp_clean` is at or before that message's timestamp is removed, so
    the matched message itself is dropped as well.

    Args:
        df: DataFrame with the messages of one conversation. Must contain the
            columns `role`, `text_clean` and `timestamp_clean`.
        target_embedding: Embedding of the sentence that marks the end of the
            preamble, in the form accepted by `cosine_similarity`.
        model: Object with an `encode(list_of_texts)` method returning the
            embeddings of the given texts.

    Returns:
        The rows of `df` whose `timestamp_clean` is strictly later than that of
        the matched BOT message. If the conversation contains no BOT messages,
        `df` is returned unchanged.
    """
    # Only BOT messages can be the "are the instructions clear?" question
    bot_messages = df[df['role'] == 'BOT']

    # Without any BOT message there is no preamble to locate, and the
    # similarity/argmax steps below would fail on an empty set of embeddings.
    if bot_messages.empty:
        return df

    # Embed BOT messages
    bot_embeddings = model.encode(bot_messages['text_clean'].tolist())

    # One similarity score per BOT message, in the same order as `bot_messages`
    similarities = cosine_similarity(target_embedding, bot_embeddings).flatten()

    # Positional index of the most similar BOT message
    most_similar_idx = similarities.argmax()

    # Timestamp of that message marks the end of the preamble
    cutoff_timestamp = bot_messages.iloc[most_similar_idx]['timestamp_clean']

    # Keep only messages strictly after the cutoff (any role)
    return df[df['timestamp_clean'] > cutoff_timestamp]

In [None]:
# Load the raw transcripts from the project's data folder
transcripts_path = PROJECT_DIR / 'data/qual_af_transcripts.csv'
data = pd.read_csv(transcripts_path)

In [None]:
# Clean up the text a little and move audio transcriptions to the text column.
# NOTE(review): later cells read the `role`, `timestamp`, `conversation` and
# `text_clean` columns from `interviews_df`; presumably `clean_data` is what
# provides `text_clean` — confirm against its implementation.
interviews_df = clean_data(data)

In [None]:
# Make sure the conversations are sorted by time, so that the replies go in the right order
interviews_df['timestamp_clean'] = interviews_df['timestamp'].apply(convert_timestamp)

# A single sort on (conversation, timestamp) gives the same ordering as sorting
# each group separately, without relying on groupby.apply operating on the
# grouping column (deprecated in recent pandas) and without a per-group Python loop.
# Rows with a missing conversation id are dropped first, as groupby did implicitly.
interviews_df = (
    interviews_df
    .dropna(subset=['conversation'])
    .sort_values(['conversation', 'timestamp_clean'])
)

In [None]:
# Strip the preamble from each conversation separately. Iterating over the
# groups passes `remove_preamble` the complete sub-frame, including the
# `conversation` column, so the conversation id survives in the result
# regardless of how groupby.apply treats grouping columns.
cleaned_conversations = [
    remove_preamble(conversation_df)
    for _, conversation_df in interviews_df.groupby('conversation')
]

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns
interviews_cleaned_df = (
    pd.concat(cleaned_conversations, ignore_index=True)
    if cleaned_conversations
    else interviews_df.iloc[0:0].reset_index(drop=True)
)

In [None]:
# How many rows were dropped by the preamble-removal step
n_rows_removed = len(interviews_df.index) - len(interviews_cleaned_df.index)
n_rows_removed

In [None]:
# Rows remaining once the preamble has been removed
len(interviews_cleaned_df.index)

In [None]:
# Preview the first five rows of the cleaned conversations
interviews_cleaned_df.head(n=5)

In [None]:
# Turn every conversation into one big block of text (mimics the format of other interview/focus group transcripts we might see).
# Built from `interviews_cleaned_df` so the preamble removed earlier does not end up in the chunked text.
# Missing messages are skipped, since str.join raises on NaN values.
df_grouped = (
    interviews_cleaned_df
    .groupby('conversation')['text_clean']
    .apply(lambda texts: '. '.join(texts.dropna().astype(str)))
    .reset_index()
)

In [None]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
buffer_sizes = [1, 2, 3]

# Load the embedding model once and share it between chunkers: it does not
# depend on the buffer size, so re-creating it on every iteration only costs time.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

all_results = {}    # buffer_size -> list with the number of chunks per conversation
actual_chunks = {}  # buffer_size -> {conversation id -> list of chunk dicts}

for buffer_size in buffer_sizes:
    chunker = SemanticChunker(embeddings,
                              breakpoint_threshold_type="percentile",
                              buffer_size=buffer_size)
    results = {}

    # Walk the two columns directly instead of building a Series per row with iterrows
    for conv_id, text in zip(df_grouped['conversation'], df_grouped['text_clean']):
        # Turn it into a langchain document
        doc = Document(page_content=text)
        chunked_docs = chunker.split_documents([doc])
        # Store plain dicts rather than Document objects
        results[conv_id] = [x.model_dump() for x in chunked_docs]

    all_results[buffer_size] = [len(chunk_list) for chunk_list in results.values()]
    actual_chunks[buffer_size] = results

# One histogram per buffer size, side by side with a shared y-axis.
# squeeze=False makes subplots always return a 2-D array of Axes; with the default,
# a single buffer size would yield a bare Axes object and the zip below would fail.
fig, axes_grid = plt.subplots(nrows=1, ncols=len(buffer_sizes), figsize=(15, 5),
                              sharey=True, squeeze=False)
axes = axes_grid.ravel()

for ax, buffer_size in zip(axes, buffer_sizes):
    # Number of chunks each conversation was split into for this buffer size
    lengths = all_results[buffer_size]
    ax.hist(lengths, bins=20, alpha=0.7)
    ax.set_title(f'Buffer Size {buffer_size}')
    ax.set_xlabel('Number of Chunks')
    ax.set_ylabel('Frequency')

plt.suptitle('Distribution of List Lengths for Different Buffer Sizes', fontsize=16)
# Leave the top 5% of the figure free so the suptitle does not overlap the subplots
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

See [the documentation](https://python.langchain.com/docs/how_to/semantic-chunker/) for information on the different breakpoint threshold types. `percentile` is the default.

See also [this notebook](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb).

Note that [the API reference](https://api.python.langchain.com/en/latest/text_splitter/langchain_experimental.text_splitter.SemanticChunker.html) shows that you can also configure:
* the exact numerical value of the breakpoint threshold
* the regex for sentence delimiters
* the number of chunks if you have a sense of what this would be for your document

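`buffer_size` is the number of neighbouring sentences on each side that are combined with a sentence before it is embedded. For example, if `buffer_size` is 1, each group contains up to 3 sentences: the sentence itself plus one on either side (fewer at the very start and end of the text).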