In [1]:
from datasets import load_dataset, concatenate_datasets
from unidecode import unidecode
import re, os, uuid, openai, chromadb
from tiktoken import encoding_for_model
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the "train" split of the Amod mental-health counselling conversations
# dataset via Hugging Face `datasets`, then report how many rows it has.
amod = load_dataset(
    "Amod/mental_health_counseling_conversations",
    split="train",
)

print(len(amod))


3512


In [3]:
# Display the dataset summary (feature names and row count).
amod

Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})

In [4]:
# Look at the first row to see what a raw 'Context' / 'Response' pair looks like.
amod[0]

{'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?",
 'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is someh

In [5]:
def clean(text: str) -> str:
    """Normalise one piece of dialogue text and mask obvious PII.

    Steps, in order:
      1. Pass the text through ``unidecode`` (accent normalisation).
      2. Collapse every run of whitespace into a single space and strip
         leading/trailing whitespace.
      3. Replace e-mail addresses with ``[email]``, 3-3-4 digit phone numbers
         (optionally separated by ``-``, ``.`` or whitespace) with
         ``[phone]``, and ``@handles`` of three or more word characters with
         ``[user]``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with the placeholders substituted in.
    """
    text = unidecode(text)               # normalize accents
    text = re.sub(r"\s+", " ", text)     # collapse whitespace
    text = text.strip()

    # Throw-away PII patterns (emails, phones, @handles).
    # The TLD is deliberately unbounded ({2,} rather than {2,4}): with an upper
    # limit, an address such as john@company.technology is not matched here and
    # the @handle rule below then rewrites it to "john[user].technology",
    # leaking the local part. E-mails must stay ahead of the handle rule.
    text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b", "[email]", text)
    text = re.sub(r"\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b", "[phone]", text)
    text = re.sub(r"@\w{3,}", "[user]", text)
    return text


In [None]:
# Accumulator for every document record that will later be embedded.
docs = []


def add_doc(text, source, speaker, turn_idx):
    """Append one document record to the module-level ``docs`` list.

    Each record gets a freshly generated random UUID (as a string) under
    ``chunk_id``; the remaining fields are stored exactly as passed in.

    Args:
        text: Document text to embed later.
        source: Label identifying where the text came from.
        speaker: Label describing who is speaking / what kind of chunk it is.
        turn_idx: Position of the chunk within its source.
    """
    record = dict(
        chunk_id=str(uuid.uuid4()),
        text=text,
        source=source,
        speaker=speaker,
        turn_idx=turn_idx,
    )
    docs.append(record)

In [7]:
# Turn every dataset row into a single PATIENT/COUNSELLOR document.
# Both sides are cleaned first; the row position is stored as turn_idx.
for turn_idx, row in tqdm(enumerate(amod), total=len(amod)):
    patient_text = clean(row['Context'])
    counsellor_text = clean(row['Response'])
    add_doc(
        f"PATIENT: {patient_text}\nCOUNSELLOR: {counsellor_text}",
        "Amod",
        "dialog_pair",
        turn_idx,
    )

100%|██████████| 3512/3512 [00:00<00:00, 9833.90it/s] 


In [8]:
# Preview only the first few records; echoing the whole list would dump every
# document into the cell output and bloat the saved notebook.
docs[:3]

[{'chunk_id': '4924f234-6b3a-47c5-ac70-c656f0809574',
  'text': "PATIENT: I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?\nCOUNSELLOR: If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't

In [10]:
import os

# Take the OpenAI credential from the environment so it never appears in the
# notebook; the value is None when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

## Ingest the data for RAG

In [None]:
from openai import OpenAI

embed_model = "text-embedding-3-small"

PERSIST_DIR = "./data/mental_health_counseling"
BATCH_SIZE = 50    # documents sent per embeddings request
MAX_RETRIES = 8    # SDK-level retries per request (the run was previously rate limited at 69%)

# On-disk Chroma store, so embeddings survive kernel restarts.
# (Use chromadb.Client() instead for a throw-away in-memory store.)
client = chromadb.PersistentClient(
    path=PERSIST_DIR
)

# max_retries lets the OpenAI SDK retry failed requests (including HTTP 429
# rate-limit responses) with backoff instead of aborting the whole ingestion.
openai_client = OpenAI(api_key=openai_api_key, max_retries=MAX_RETRIES)

collection = client.get_or_create_collection("mental_health_counseling")

# Make the cell resumable: skip records whose chunk_id is already stored, so
# re-running after an interruption does not pay for the same embeddings again.
# This only helps while `docs` keeps the same chunk_ids (i.e. within one kernel
# session), because add_doc assigns random UUIDs.
already_ingested = set(collection.get()["ids"])
pending = [d for d in docs if d["chunk_id"] not in already_ingested]

for i in tqdm(range(0, len(pending), BATCH_SIZE)):
    batch = pending[i:i+BATCH_SIZE]
    # One embeddings request per batch; results are paired with the batch
    # records by position below.
    embeds = openai_client.embeddings.create(
        model=embed_model,
        input=[d["text"] for d in batch]
    ).data
    collection.add(
        ids   =[d["chunk_id"] for d in batch],
        embeddings=[e.embedding for e in embeds],
        documents=[d["text"] for d in batch],
        # Everything except the text itself is kept as metadata.
        metadatas=[{k:v for k,v in d.items() if k!="text"} for d in batch]
    )

In [12]:
# Number of entries stored so far. count() asks the collection for its size
# instead of loading every document and metadata record just to measure it.
collection.count()


2450

In [None]:
import hashlib

# Check for duplicates: hash every stored document text and delete every entry
# whose text was already seen (the first occurrence is kept).
res = collection.get(include=["documents"])
# Deliberately NOT unpacked into `docs`: that name holds the list of record
# dicts built by add_doc, and rebinding it to plain strings breaks later cells
# that index docs[...]["text"] as well as any re-run of the ingestion cell.
stored_texts, ids = res["documents"], res["ids"]

seen = {}    # sha1(text) -> id of the first entry carrying that text
dupes = []   # ids of later entries whose text hash was already in `seen`

for doc, id_ in zip(stored_texts, ids):
    # Plain ASCII codec name (the original spelled it with a non-ASCII hyphen).
    h = hashlib.sha1(doc.encode("utf-8")).hexdigest()
    if h in seen:
        dupes.append(id_)
    else:
        seen[h] = id_

if dupes:
    print(f"Found {len(dupes)} duplicates, deleting…")
    collection.delete(ids=dupes)
else:
    print("No duplicate texts detected!")


Found 433 duplicates, deleting…


In [None]:
# Our final collection: 2017 documents. That will be good enough for this POC.
# count() returns the size without fetching every stored document.
collection.count()

2017

In [None]:
# Smoke-test retrieval: embed one query with the same model used for ingestion
# and print the start of the three closest stored documents.
query = "Patient says they want to end it all and feel hopeless."
query_response = openai_client.embeddings.create(input=query, model=embed_model)
query_embedding = query_response.data[0].embedding
res = collection.query(query_embeddings=[query_embedding], n_results=3)

# Only one query was sent, so its hits are the first (only) entry.
top_hits = res["documents"][0]
for hit in top_hits:
    print("----\n", hit[:300])


----
 PATIENT: After he got home from the hospital he was angry, then for a time wonderful. Now he is depressed and hopeless again.
COUNSELLOR: This is actually more common then we often realize, and actually understandable, as he has gone through a trauma, an unresolved existential crisis. He was taken c
----
 PATIENT: After he got home from the hospital he was angry, then for a time wonderful. Now he is depressed and hopeless again.
COUNSELLOR: A failed suicide attempt is commonly thought of as a "cry for help," however it can also be a serious attempt to find a permanent solution to depression, hopeless
----
 PATIENT: I am broke, but I am sure I have been depressed for the past couple of years. I have always had anxiety. I just need someone to talk to right now. I don't have any friends or family I can talk to. I'm on the verge of just giving up.
COUNSELLOR: Good question. There are resources out there -


In [31]:
# Show the text of the first ingestion record; this expects `docs` to be the
# list of record dicts produced by add_doc.
docs[0]["text"]

"PATIENT: I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?\nCOUNSELLOR: If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is somehow terrible.Bad fe