This script loads therapy conversation data from a CSV file, splits it into chunks of dialogue turns, embeds the chunks using OpenAI,
and stores them in a Chroma database. This file was run on Google Colab because the memory requirements were too great for my local machine.

In [None]:
!pip install python-dotenv langchain-core langchain-openai langchain-chroma

In [None]:
import ast
import gc
import os
import shutil
import sys
import time
import traceback
from uuid import uuid4

import pandas as pd
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [None]:


# === LOAD API KEY ===
# Read the .env file into the environment; the OpenAI API key is expected to live there.
load_dotenv()

# === CONFIGURATION ===
DATA_PATH = r"data/preprocessed_dataset.csv"  # CSV file holding the preprocessed dataset
CHROMA_PATH = r"chroma_db"  # Folder in which Chroma persists its database files
SIZE_OF_CHUNK = 4  # How many utterances go into one chunk
BATCH_SIZE = 1000  # How many documents are processed per insertion batch
SLEEP_SECONDS = 2  # Pause (in seconds) taken after each batch

# === MAKE SURE THE CHROMA PERSIST DIRECTORY EXISTS ===
os.makedirs(CHROMA_PATH, exist_ok=True)
print(f"💾 Using Chroma persist directory: {CHROMA_PATH}", flush=True)

# === INIT EMBEDDINGS ===
embedder = OpenAIEmbeddings(model="text-embedding-3-large")
print("🧪 Testing embedding...", flush=True)
# Embed a single sample sentence first so that a bad key or model name fails
# here instead of in the middle of the long insertion run.
test_vec = embedder.embed_query("THERAPIST: How are you feeling today?")
print("✅ Embedding worked. Length:", len(test_vec), flush=True)

# === LOAD AND PARSE THE 'conversations' COLUMN ===
# Every row of the dataset is read here; the 'conversations' column is then
# parsed row by row into one list holding every utterance, from client and
# therapist alike.
df = pd.read_csv(DATA_PATH)

def parse_conversations(row, idx=None):
    """Parse one 'conversations' cell into a list of dialogue turns.

    The cell is expected to contain the text of a Python list literal. It is
    evaluated with ``ast.literal_eval``, which only accepts literals and
    returns the corresponding Python object (not a string).

    Args:
        row: Raw cell value, normally a string.
        idx: Optional row label, used only in the failure message.

    Returns:
        The parsed turns as a list, or an empty list when the cell cannot be
        parsed or does not hold a list/tuple. Returning ``[]`` lets the
        caller extend its running list unconditionally.
    """
    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the errors literal_eval raises for malformed input,
        # including non-string cells such as NaN.
        print(f"❌ Failed to parse row {idx}: {e}")
        return []

    if not isinstance(parsed, (list, tuple)):
        # A lone string or dict would otherwise be spread character by
        # character / key by key when the caller extends its list with it.
        print(
            f"❌ Failed to parse row {idx}: "
            f"expected a list of turns, got {type(parsed).__name__}"
        )
        return []

    return list(parsed)

# Walk the 'conversations' column row by row and flatten the parsed turns into
# a single list that keeps the original order of the utterances.
# NOTE(review): because the list is flat, a later fixed-size chunking step can
# mix the end of one row with the start of the next - confirm this is intended.
all_turns = []
for row_label, raw_cell in df['conversations'].items():
    all_turns += parse_conversations(raw_cell, row_label)

# flush=True makes the message appear immediately instead of sitting in the buffer
print(f"\n✅ Parsed {len(all_turns)} total dialogue turns.", flush=True)

# === CHUNK ===
def dialogue_chunker(dialogues, max_turns=SIZE_OF_CHUNK):
    """Group consecutive dialogue turns into newline-joined text chunks.

    Each turn is a dict with 'role' and 'text' keys, for example::

        {'role': 'Therapist', 'text': 'How are you feeling today?'}

    Turns are taken ``max_turns`` at a time, in order; the last chunk is
    shorter when the number of turns is not a multiple of ``max_turns``.
    Every turn becomes one "<role>: <text>" line, and the lines of a chunk
    are joined with newline characters, so a four-turn chunk reads::

        Therapist: How are you feeling today?
        Client: I'm not doing great.
        Therapist: Can you tell me more about that?
        Client: I've been really anxious lately.

    Args:
        dialogues: Sequence of turn dicts.
        max_turns: Number of turns per chunk; must be at least 1.

    Returns:
        A list of chunk strings (empty when ``dialogues`` is empty).

    Raises:
        ValueError: If ``max_turns`` is smaller than 1. A negative value
            would otherwise silently produce no chunks at all.
        KeyError: If a turn lacks the 'role' or 'text' key.
    """
    if max_turns < 1:
        raise ValueError(f"max_turns must be a positive integer, got {max_turns}")

    return [
        "\n".join(
            f"{turn['role']}: {turn['text']}"
            for turn in dialogues[start:start + max_turns]
        )
        for start in range(0, len(dialogues), max_turns)
    ]

chunk_texts = dialogue_chunker(all_turns)

print(f"✅ Created {len(chunk_texts)} chunks.", flush=True)

# === CONVERT TO DOCUMENT OBJECTS ===
# Wrap every chunk in a langchain Document and give each one a unique string ID.
documents = [Document(page_content=chunk) for chunk in chunk_texts]
uuids = [str(uuid4()) for _ in documents]

# === SETUP VECTOR DATABASE ===
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embedder,
    persist_directory=CHROMA_PATH,
)

# === ADD DOCUMENTS TO VECTOR STORE ===
print("\n📦 Inserting ALL documents (this may take a while)...", flush=True)

# Indices (into `documents`) of every document that could not be stored.
failed_docs = []

# Insert in batches of BATCH_SIZE. The vector store embeds the texts itself
# through `embedding_function`, so embedding them separately beforehand would
# pay for every embedding twice.
for i in range(0, len(documents), BATCH_SIZE):
    batch = documents[i:i + BATCH_SIZE]
    batch_ids = uuids[i:i + BATCH_SIZE]
    print(f"\n➡️ [Batch {i} – {i + len(batch)}] Embedding and inserting...", flush=True)

    try:
        vector_store.add_documents(batch, ids=batch_ids)
    except Exception:
        # Top-level boundary: the embedding and database clients can raise
        # many different error types, so report and carry on.
        print(f"❌ [Batch {i}] Embedding or Insert Failed:")
        traceback.print_exc()
        print(f"   ⤷ Retrying batch {i} one document at a time...", flush=True)

        # Fall back to single inserts so the exact failing document is shown.
        # NOTE(review): this reuses the same IDs and assumes re-adding an ID
        # overwrites rather than duplicates - confirm for the installed
        # langchain-chroma version.
        for j, (doc, doc_id) in enumerate(zip(batch, batch_ids)):
            try:
                vector_store.add_documents([doc], ids=[doc_id])
            except Exception as doc_error:
                failed_docs.append(i + j)
                print(f"   ⤷ ❌ Failed to insert doc {i + j}: {doc_error}")
                traceback.print_exc()
                sys.stdout.flush()
    else:
        print(f"✅ [Batch {i}] Batch complete.", flush=True)

    # Free whatever the batch left behind before starting the next one, then
    # pause briefly between batches.
    gc.collect()
    time.sleep(SLEEP_SECONDS)

if failed_docs:
    print(f"\n⚠️ Finished with {len(failed_docs)} document(s) not inserted: {failed_docs}", flush=True)
else:
    print(f"\n✅ All {len(documents)} documents inserted.", flush=True)




In [None]:
# Finally, zip the persisted database so it can be downloaded to the local machine
# and used repeatedly as the retrieval source for the chatbot.
# The archive is built from CHROMA_PATH, the directory the vector store actually
# persists to, instead of a hard-coded folder name that may not exist.
shutil.make_archive("my_folder_zip", 'zip', CHROMA_PATH)