In [1]:
from transformers import pipeline

model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0"
zs_text_classifier = pipeline(model=model_name)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [2]:
candidate_labels = [
    "Affirmative",
    "Empathetic",
    "Apologetic",
    "Neutral",
]

hypothesis_template = "The emotion of this text is {}"

In [5]:
text = "I know this is personal, but do you have a lot of back pain?"

zs_text_classifier(
    text, 
    candidate_labels,
    hypothesis_template=hypothesis_template,
    multi_label=True
)

{'sequence': 'I know this is personal, but do you have a lot of back pain?',
 'labels': ['Empathetic', 'Neutral', 'Affirmative', 'Apologetic'],
 'scores': [0.944321870803833,
  0.8705635070800781,
  0.018851738423109055,
  0.0028029342647641897]}

In [6]:
import pandas as pd
import numpy as np
# from nltk.tokenize import word_tokenize, RegexpTokenizer
# import nltk
import json
from IPython.display import display
import re
from fuzzywuzzy import fuzz

pd.set_option('display.max_colwidth', 500)

edits_df = pd.read_csv("edits_with_regex_concepts_fixed.csv")
edits_df



Unnamed: 0,occurs_at,question_text,default_question_text,kb_concept_display,kb_concept_id,encounter_id
0,2025-01-10 17:07:01,"What brings you back into the clinic today, miss?","What brings you back into the clinic today, miss?",Other,,2867825
1,2025-01-10 17:07:01,"It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones. Have you noticed any changes or do you have any concerns regarding these issues?","It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones. Have you noticed any changes or do you have any concerns regarding these issues?",Follow-up Visit,,2867825
2,2025-01-10 17:07:01,"Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?","Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?",Fever/Chills,,2867825
3,2025-01-10 17:07:01,"Great. Also, for our records, how old are you and what race do you identify yourself as?","Great. Also, for our records, how old are you and what race do you identify yourself as?",Age Inquiry,,2867825
4,2025-04-06 02:33:27,How're you feeling today?,How're you feeling today?,Mood Inquiry,,5614226
...,...,...,...,...,...,...
5804,2025-09-15 04:24:04,"I'm originally from Kentucky. And I have to do the routine ask, but any drinking, smoking, or illicit drug use?","I'm originally from Kentucky. And I have to do the routine ask, but any drinking, smoking, or illicit drug use?",Other,,6453426
5805,2025-09-15 04:24:04,Got it.,Got it.,Other,,6453426
5806,2025-07-24 01:48:07,"Looks like the nurse came in and asked you everything. Ah, she has everything documented here, but I will ask one more time. So, tell me, where do you live?","Looks like the nurse came in and asked you everything. Ah, she has everything documented here, but I will ask one more time. So, tell me, where do you live?",Other,,9025733
5807,2025-07-24 01:48:07,Do you smoke?,Do you smoke?,Other,,9025733


In [8]:
empathy_splitters = "-,.;:!"
question_splitters = "?"

def create_question_splits(line):
    """
    Filter out empty items and strip of any trailing whitespace after splitting on sentence splits
    """
    regex_splitter = f"([{empathy_splitters}{question_splitters}])"
    split = list(filter(None, [x.strip() for x in re.split(regex_splitter, line)]))
    split = [x for x in split if len(x) > 1]
    return split

def split_into_columns(x):
    return x["empathy"], x["question"]

def select_question(candidates, orig_question):
    scores = [fuzz.ratio(c, orig_question) for c in candidates]
    return np.argmax(scores)

def get_scores(row):
    split = create_question_splits(row["question_text"])

    return [fuzz.ratio(c, row["default_question_text"]) for c in split]


def parse_question_text(row):
    text = row["question_text"] or ""
    orig = row["default_question_text"] or ""
    split = create_question_splits(text)

    # 1) If there was nothing to split, just return the whole text as “question”
    if not split:
        return {"empathy": None, "question": text}

    # 2) Try to find where the actual question starts
    try:
        question_start_idx = select_question(split, orig)
    except ValueError:
        # select_question did an argmax on empty or similar
        question_start_idx = None

    # 3) If select_question failed or returned None/out of range, 
    #    dump everything into 'question' and leave empathy empty
    if question_start_idx is None or not (0 <= question_start_idx <= len(split)):
        return {"empathy": None, "question": text}

    # 4) Otherwise split out empathy vs question
    empathy_part  = split[:question_start_idx]
    question_part = split[question_start_idx:]

    pred_empathy  = ". ".join(empathy_part).strip() or None
    pred_question = ". ".join(question_part).strip() or None

    return {
        "empathy":  pred_empathy,
        "question": pred_question,
    }

In [None]:
empathy_splitters = "-.;:!"
question_splitters = "?"

def create_question_splits(line):
    """
    Filter out empty items and strip of any trailing whitespace after splitting on sentence splits
    """
    # question_splitter = f"([{empathy_splitters}{question_splitters}])"

    # if there are multiple questions, split them. 
    question_splitter = f"([{question_splitters}])"
    questions = list(filter(None, [x.strip() for x in re.split(question_splitter, line)]))

    split = [x for x in split if len(x) > 1]
    return split

df = edits_df.head(5)
for _, row in df.iterrows():
    text = row["question_text"] or ""
    split = create_question_splits(text)
    print(split)

['What brings you back into the clinic today, miss']
['It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues']
['Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure']
['Great. Also, for our records, how old are you and what race do you identify yourself as']
["How're you feeling today"]
